diff --git a/.devcontainer/post_create_command.sh b/.devcontainer/post_create_command.sh index d879876d8..c53c26bb9 100755 --- a/.devcontainer/post_create_command.sh +++ b/.devcontainer/post_create_command.sh @@ -1,13 +1,13 @@ #!/bin/bash -npm add -g pnpm@9.12.2 +npm add -g pnpm@10.8.0 cd web && pnpm install pipx install poetry echo 'alias start-api="cd /workspaces/dify/api && poetry run python -m flask run --host 0.0.0.0 --port=5001 --debug"' >> ~/.bashrc echo 'alias start-worker="cd /workspaces/dify/api && poetry run python -m celery -A app.celery worker -P gevent -c 1 --loglevel INFO -Q dataset,generation,mail,ops_trace,app_deletion"' >> ~/.bashrc echo 'alias start-web="cd /workspaces/dify/web && pnpm dev"' >> ~/.bashrc -echo 'alias start-containers="cd /workspaces/dify/docker && docker-compose -f docker-compose.middleware.yaml -p dify up -d"' >> ~/.bashrc -echo 'alias stop-containers="cd /workspaces/dify/docker && docker-compose -f docker-compose.middleware.yaml -p dify down"' >> ~/.bashrc +echo 'alias start-containers="cd /workspaces/dify/docker && docker-compose -f docker-compose.middleware.yaml -p dify --env-file middleware.env up -d"' >> ~/.bashrc +echo 'alias stop-containers="cd /workspaces/dify/docker && docker-compose -f docker-compose.middleware.yaml -p dify --env-file middleware.env down"' >> ~/.bashrc source /home/vscode/.bashrc diff --git a/.github/workflows/api-tests.yml b/.github/workflows/api-tests.yml index b9547b645..dca8e640c 100644 --- a/.github/workflows/api-tests.yml +++ b/.github/workflows/api-tests.yml @@ -53,9 +53,14 @@ jobs: - name: Run dify config tests run: poetry run -P api python dev/pytest/pytest_config_tests.py + - name: Cache MyPy + uses: actions/cache@v4 + with: + path: api/.mypy_cache + key: mypy-${{ matrix.python-version }}-${{ runner.os }}-${{ hashFiles('api/poetry.lock') }} + - name: Run mypy - run: | - poetry run -C api python -m mypy --install-types --non-interactive . 
+ run: dev/run-mypy - name: Set up dotenvs run: | diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml index d73a782c9..625930b5f 100644 --- a/.github/workflows/style.yml +++ b/.github/workflows/style.yml @@ -82,7 +82,7 @@ jobs: uses: actions/setup-node@v4 if: steps.changed-files.outputs.any_changed == 'true' with: - node-version: 20 + node-version: 22 cache: pnpm cache-dependency-path: ./web/package.json @@ -153,6 +153,7 @@ jobs: env: BASH_SEVERITY: warning DEFAULT_BRANCH: main + FILTER_REGEX_INCLUDE: pnpm-lock.yaml GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} IGNORE_GENERATED_FILES: true IGNORE_GITIGNORED_FILES: true diff --git a/.github/workflows/tool-test-sdks.yaml b/.github/workflows/tool-test-sdks.yaml index 93edb2737..a6e48d135 100644 --- a/.github/workflows/tool-test-sdks.yaml +++ b/.github/workflows/tool-test-sdks.yaml @@ -18,7 +18,7 @@ jobs: strategy: matrix: - node-version: [16, 18, 20] + node-version: [16, 18, 20, 22] defaults: run: diff --git a/.github/workflows/translate-i18n-base-on-english.yml b/.github/workflows/translate-i18n-base-on-english.yml index 80b78a131..3f8082eb6 100644 --- a/.github/workflows/translate-i18n-base-on-english.yml +++ b/.github/workflows/translate-i18n-base-on-english.yml @@ -33,7 +33,7 @@ jobs: - name: Set up Node.js if: env.FILES_CHANGED == 'true' - uses: actions/setup-node@v2 + uses: actions/setup-node@v4 with: node-version: 'lts/*' diff --git a/.github/workflows/vdb-tests.yml b/.github/workflows/vdb-tests.yml index 3e7374722..5e3f7a557 100644 --- a/.github/workflows/vdb-tests.yml +++ b/.github/workflows/vdb-tests.yml @@ -76,7 +76,6 @@ jobs: milvus-standalone pgvecto-rs pgvector - opengauss chroma elasticsearch diff --git a/.github/workflows/web-tests.yml b/.github/workflows/web-tests.yml index e32db548a..85e8b9947 100644 --- a/.github/workflows/web-tests.yml +++ b/.github/workflows/web-tests.yml @@ -31,25 +31,26 @@ jobs: uses: tj-actions/changed-files@v45 with: files: web/** - # to run pnpm, should install 
package canvas, but it always install failed on amd64 under ubuntu-latest - # - name: Install pnpm - # uses: pnpm/action-setup@v4 - # with: - # version: 10 - # run_install: false - # - name: Setup Node.js - # uses: actions/setup-node@v4 - # if: steps.changed-files.outputs.any_changed == 'true' - # with: - # node-version: 20 - # cache: pnpm - # cache-dependency-path: ./web/package.json + - name: Install pnpm + if: steps.changed-files.outputs.any_changed == 'true' + uses: pnpm/action-setup@v4 + with: + version: 10 + run_install: false - # - name: Install dependencies - # if: steps.changed-files.outputs.any_changed == 'true' - # run: pnpm install --frozen-lockfile + - name: Setup Node.js + uses: actions/setup-node@v4 + if: steps.changed-files.outputs.any_changed == 'true' + with: + node-version: 22 + cache: pnpm + cache-dependency-path: ./web/package.json - # - name: Run tests - # if: steps.changed-files.outputs.any_changed == 'true' - # run: pnpm test + - name: Install dependencies + if: steps.changed-files.outputs.any_changed == 'true' + run: pnpm install --frozen-lockfile + + - name: Run tests + if: steps.changed-files.outputs.any_changed == 'true' + run: pnpm test diff --git a/.gitignore b/.gitignore index 7c5f4851c..819a24958 100644 --- a/.gitignore +++ b/.gitignore @@ -103,6 +103,7 @@ celerybeat.pid # Environments .env +.env-local .venv env/ venv/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ea6649701..5d4ba3648 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -18,7 +18,7 @@ Need to update an existing model runtime, tool, or squash some bugs? Head over t Join the fun, contribute, and let's build something awesome together! 💡✨ -Don't forget to link an existing issue or open an new issue in the PR's description. +Don't forget to link an existing issue or open a new issue in the PR's description. ### Bug reports @@ -68,7 +68,7 @@ How we prioritize: 4. Please add tests for your changes accordingly 5. Ensure your code passes the existing tests 6. 
Please link the issue in the PR description, `fixes #<issue_number>` -7. Get merrged! +7. Get merged! ### Setup the project #### Frontend @@ -90,4 +90,4 @@ We recommend reviewing this document carefully before proceeding with the setup, Feel free to reach out if you encounter any issues during the setup process. ## Getting Help -If you ever get stuck or got a burning question while contributing, simply shoot your queries our way via the related GitHub issue, or hop onto our [Discord](https://discord.gg/8Tpq4AcN9c) for a quick chat. +If you ever get stuck or get a burning question while contributing, simply shoot your queries our way via the related GitHub issue, or hop onto our [Discord](https://discord.gg/8Tpq4AcN9c) for a quick chat. diff --git a/CONTRIBUTING_ES.md b/CONTRIBUTING_ES.md new file mode 100644 index 000000000..261aa0fda --- /dev/null +++ b/CONTRIBUTING_ES.md @@ -0,0 +1,93 @@ +# CONTRIBUIR + +Así que estás buscando contribuir a Dify - eso es fantástico, estamos ansiosos por ver lo que haces. Como una startup con personal y financiación limitados, tenemos grandes ambiciones de diseñar el flujo de trabajo más intuitivo para construir y gestionar aplicaciones LLM. Cualquier ayuda de la comunidad cuenta, realmente. + +Necesitamos ser ágiles y enviar rápidamente dado donde estamos, pero también queremos asegurarnos de que colaboradores como tú obtengan una experiencia lo más fluida posible al contribuir. Hemos elaborado esta guía de contribución con ese propósito, con el objetivo de familiarizarte con la base de código y cómo trabajamos con los colaboradores, para que puedas pasar rápidamente a la parte divertida. + +Esta guía, como Dify mismo, es un trabajo en constante progreso. Agradecemos mucho tu comprensión si a veces se queda atrás del proyecto real, y damos la bienvenida a cualquier comentario para que podamos mejorar. + +En términos de licencia, por favor tómate un minuto para leer nuestro breve [Acuerdo de Licencia y Colaborador](./LICENSE).
La comunidad también se adhiere al [código de conducta](https://github.com/langgenius/.github/blob/main/CODE_OF_CONDUCT.md). + +## Antes de empezar + +¿Buscas algo en lo que trabajar? Explora nuestros [buenos primeros issues](https://github.com/langgenius/dify/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22good%20first%20issue%22) y elige uno para comenzar. + +¿Tienes un nuevo modelo o herramienta genial para añadir? Abre un PR en nuestro [repositorio de plugins](https://github.com/langgenius/dify-plugins) y muéstranos lo que has construido. + +¿Necesitas actualizar un modelo existente, herramienta o corregir algunos errores? Dirígete a nuestro [repositorio oficial de plugins](https://github.com/langgenius/dify-official-plugins) y haz tu magia. + +¡Únete a la diversión, contribuye y construyamos algo increíble juntos! 💡✨ + +No olvides vincular un issue existente o abrir uno nuevo en la descripción del PR. + +### Informes de errores + +> [!IMPORTANT] +> Por favor, asegúrate de incluir la siguiente información al enviar un informe de error: + +- Un título claro y descriptivo +- Una descripción detallada del error, incluyendo cualquier mensaje de error +- Pasos para reproducir el error +- Comportamiento esperado +- **Logs**, si están disponibles, para problemas del backend, esto es realmente importante, puedes encontrarlos en los logs de docker-compose +- Capturas de pantalla o videos, si es aplicable + +Cómo priorizamos: + + | Tipo de Issue | Prioridad | + | ------------------------------------------------------------ | --------------- | + | Errores en funciones principales (servicio en la nube, no poder iniciar sesión, aplicaciones que no funcionan, fallos de seguridad) | Crítica | + | Errores no críticos, mejoras de rendimiento | Prioridad Media | + | Correcciones menores (errores tipográficos, UI confusa pero funcional) | Prioridad Baja | + +### Solicitudes de funcionalidades + +> [!NOTE] +> Por favor, asegúrate de incluir la siguiente información al enviar una 
solicitud de funcionalidad: + +- Un título claro y descriptivo +- Una descripción detallada de la funcionalidad +- Un caso de uso para la funcionalidad +- Cualquier otro contexto o capturas de pantalla sobre la solicitud de funcionalidad + +Cómo priorizamos: + + | Tipo de Funcionalidad | Prioridad | + | ------------------------------------------------------------ | --------------- | + | Funcionalidades de alta prioridad etiquetadas por un miembro del equipo | Prioridad Alta | + | Solicitudes populares de funcionalidades de nuestro [tablero de comentarios de la comunidad](https://github.com/langgenius/dify/discussions/categories/feedbacks) | Prioridad Media | + | Funcionalidades no principales y mejoras menores | Prioridad Baja | + | Valiosas pero no inmediatas | Futura-Funcionalidad | +## Enviando tu PR + +### Proceso de Pull Request + +1. Haz un fork del repositorio +2. Antes de redactar un PR, por favor crea un issue para discutir los cambios que quieres hacer +3. Crea una nueva rama para tus cambios +4. Por favor añade pruebas para tus cambios en consecuencia +5. Asegúrate de que tu código pasa las pruebas existentes +6. Por favor vincula el issue en la descripción del PR, `fixes #<número_del_issue>` +7. ¡Fusiona tu código! +### Configuración del proyecto + +#### Frontend + +Para configurar el servicio frontend, por favor consulta nuestra [guía completa](https://github.com/langgenius/dify/blob/main/web/README.md) en el archivo `web/README.md`. Este documento proporciona instrucciones detalladas para ayudarte a configurar el entorno frontend correctamente. + +#### Backend + +Para configurar el servicio backend, por favor consulta nuestras [instrucciones detalladas](https://github.com/langgenius/dify/blob/main/api/README.md) en el archivo `api/README.md`. Este documento contiene una guía paso a paso para ayudarte a poner en marcha el backend sin problemas.
+ +#### Otras cosas a tener en cuenta + +Recomendamos revisar este documento cuidadosamente antes de proceder con la configuración, ya que contiene información esencial sobre: +- Requisitos previos y dependencias +- Pasos de instalación +- Detalles de configuración +- Consejos comunes de solución de problemas + +No dudes en contactarnos si encuentras algún problema durante el proceso de configuración. +## Obteniendo Ayuda + +Si alguna vez te quedas atascado o tienes una pregunta urgente mientras contribuyes, simplemente envíanos tus consultas a través del issue relacionado de GitHub, o únete a nuestro [Discord](https://discord.gg/8Tpq4AcN9c) para una charla rápida. \ No newline at end of file diff --git a/CONTRIBUTING_FR.md b/CONTRIBUTING_FR.md new file mode 100644 index 000000000..c3418f86c --- /dev/null +++ b/CONTRIBUTING_FR.md @@ -0,0 +1,93 @@ +# CONTRIBUER + +Vous cherchez donc à contribuer à Dify - c'est fantastique, nous avons hâte de voir ce que vous allez faire. En tant que startup avec un personnel et un financement limités, nous avons de grandes ambitions pour concevoir le flux de travail le plus intuitif pour construire et gérer des applications LLM. Toute aide de la communauté compte, vraiment. + +Nous devons être agiles et livrer rapidement compte tenu de notre position, mais nous voulons aussi nous assurer que des contributeurs comme vous obtiennent une expérience aussi fluide que possible lors de leur contribution. Nous avons élaboré ce guide de contribution dans ce but, visant à vous familiariser avec la base de code et comment nous travaillons avec les contributeurs, afin que vous puissiez rapidement passer à la partie amusante. + +Ce guide, comme Dify lui-même, est un travail en constante évolution. Nous apprécions grandement votre compréhension si parfois il est en retard par rapport au projet réel, et nous accueillons tout commentaire pour nous aider à nous améliorer. 
+ +En termes de licence, veuillez prendre une minute pour lire notre bref [Accord de Licence et de Contributeur](./LICENSE). La communauté adhère également au [code de conduite](https://github.com/langgenius/.github/blob/main/CODE_OF_CONDUCT.md). + +## Avant de vous lancer + +Vous cherchez quelque chose à réaliser ? Parcourez nos [problèmes pour débutants](https://github.com/langgenius/dify/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22good%20first%20issue%22) et choisissez-en un pour commencer ! + +Vous avez un nouveau modèle ou un nouvel outil à ajouter ? Ouvrez une PR dans notre [dépôt de plugins](https://github.com/langgenius/dify-plugins) et montrez-nous ce que vous avez créé. + +Vous devez mettre à jour un modèle existant, un outil ou corriger des bugs ? Rendez-vous sur notre [dépôt officiel de plugins](https://github.com/langgenius/dify-official-plugins) et faites votre magie ! + +Rejoignez l'aventure, contribuez, et construisons ensemble quelque chose d'extraordinaire ! 💡✨ + +N'oubliez pas de lier un problème existant ou d'ouvrir un nouveau problème dans la description de votre PR. 
+ +### Rapports de bugs + +> [!IMPORTANT] +> Veuillez vous assurer d'inclure les informations suivantes lors de la soumission d'un rapport de bug : + +- Un titre clair et descriptif +- Une description détaillée du bug, y compris tous les messages d'erreur +- Les étapes pour reproduire le bug +- Comportement attendu +- **Logs**, si disponibles, pour les problèmes de backend, c'est vraiment important, vous pouvez les trouver dans les logs de docker-compose +- Captures d'écran ou vidéos, si applicable + +Comment nous priorisons : + + | Type de Problème | Priorité | + | ------------------------------------------------------------ | --------------- | + | Bugs dans les fonctions principales (service cloud, impossibilité de se connecter, applications qui ne fonctionnent pas, failles de sécurité) | Critique | + | Bugs non critiques, améliorations de performance | Priorité Moyenne | + | Corrections mineures (fautes de frappe, UI confuse mais fonctionnelle) | Priorité Basse | + +### Demandes de fonctionnalités + +> [!NOTE] +> Veuillez vous assurer d'inclure les informations suivantes lors de la soumission d'une demande de fonctionnalité : + +- Un titre clair et descriptif +- Une description détaillée de la fonctionnalité +- Un cas d'utilisation pour la fonctionnalité +- Tout autre contexte ou captures d'écran concernant la demande de fonctionnalité + +Comment nous priorisons : + + | Type de Fonctionnalité | Priorité | + | ------------------------------------------------------------ | --------------- | + | Fonctionnalités hautement prioritaires étiquetées par un membre de l'équipe | Priorité Haute | + | Demandes populaires de fonctionnalités de notre [tableau de feedback communautaire](https://github.com/langgenius/dify/discussions/categories/feedbacks) | Priorité Moyenne | + | Fonctionnalités non essentielles et améliorations mineures | Priorité Basse | + | Précieuses mais non immédiates | Fonctionnalité Future | +## Soumettre votre PR + +### Processus de Pull Request + +1. 
Forkez le dépôt +2. Avant de rédiger une PR, veuillez créer un problème pour discuter des changements que vous souhaitez apporter +3. Créez une nouvelle branche pour vos changements +4. Veuillez ajouter des tests pour vos changements en conséquence +5. Assurez-vous que votre code passe les tests existants +6. Veuillez lier le problème dans la description de la PR, `fixes #<numéro_du_problème>` +7. Faites fusionner votre code ! +### Configuration du projet + +#### Frontend + +Pour configurer le service frontend, veuillez consulter notre [guide complet](https://github.com/langgenius/dify/blob/main/web/README.md) dans le fichier `web/README.md`. Ce document fournit des instructions détaillées pour vous aider à configurer correctement l'environnement frontend. + +#### Backend + +Pour configurer le service backend, veuillez consulter nos [instructions détaillées](https://github.com/langgenius/dify/blob/main/api/README.md) dans le fichier `api/README.md`. Ce document contient un guide étape par étape pour vous aider à faire fonctionner le backend sans problème. + +#### Autres choses à noter + +Nous recommandons de revoir attentivement ce document avant de procéder à la configuration, car il contient des informations essentielles sur : +- Prérequis et dépendances +- Étapes d'installation +- Détails de configuration +- Conseils courants de dépannage + +N'hésitez pas à nous contacter si vous rencontrez des problèmes pendant le processus de configuration. +## Obtenir de l'aide + +Si jamais vous êtes bloqué ou avez une question urgente en contribuant, envoyez-nous simplement vos questions via le problème GitHub concerné, ou rejoignez notre [Discord](https://discord.gg/8Tpq4AcN9c) pour une discussion rapide. \ No newline at end of file diff --git a/CONTRIBUTING_KR.md b/CONTRIBUTING_KR.md new file mode 100644 index 000000000..fcf44d495 --- /dev/null +++ b/CONTRIBUTING_KR.md @@ -0,0 +1,93 @@ +# 기여하기 + +Dify에 기여하려고 하시는군요 - 정말 멋집니다, 당신이 무엇을 할지 기대가 됩니다.
인력과 자금이 제한된 스타트업으로서, 우리는 LLM 애플리케이션을 구축하고 관리하기 위한 가장 직관적인 워크플로우를 설계하고자 하는 큰 야망을 가지고 있습니다. 커뮤니티의 모든 도움은 정말 중요합니다. + +우리는 현재 상황에서 민첩하게 빠르게 배포해야 하지만, 동시에 당신과 같은 기여자들이 기여하는 과정에서 최대한 원활한 경험을 얻을 수 있도록 하고 싶습니다. 우리는 이러한 목적으로 이 기여 가이드를 작성했으며, 여러분이 코드베이스와 우리가 기여자들과 어떻게 협업하는지에 대해 친숙해질 수 있도록 돕고, 빠르게 재미있는 부분으로 넘어갈 수 있도록 하고자 합니다. + +이 가이드는 Dify 자체와 마찬가지로 끊임없이 진행 중인 작업입니다. 때로는 실제 프로젝트보다 뒤처질 수 있다는 점을 이해해 주시면 감사하겠으며, 개선을 위한 피드백은 언제든지 환영합니다. + +라이센스 측면에서, 간략한 [라이센스 및 기여자 동의서](./LICENSE)를 읽어보는 시간을 가져주세요. 커뮤니티는 또한 [행동 강령](https://github.com/langgenius/.github/blob/main/CODE_OF_CONDUCT.md)을 준수합니다. + +## 시작하기 전에 + +처리할 작업을 찾고 계신가요? [초보자를 위한 이슈](https://github.com/langgenius/dify/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22good%20first%20issue%22)를 살펴보고 시작할 것을 선택하세요! + +추가할 새로운 모델 런타임이나 도구가 있나요? 우리의 [플러그인 저장소](https://github.com/langgenius/dify-plugins)에 PR을 열고 당신이 만든 것을 보여주세요. + +기존 모델 런타임, 도구를 업데이트하거나 버그를 수정해야 하나요? 우리의 [공식 플러그인 저장소](https://github.com/langgenius/dify-official-plugins)로 가서 당신의 마법을 펼치세요! + +함께 즐기고, 기여하고, 멋진 것을 함께 만들어 봅시다! 💡✨ + +PR 설명에 기존 이슈를 연결하거나 새 이슈를 여는 것을 잊지 마세요. + +### 버그 보고 + +> [!IMPORTANT] +> 버그 보고서를 제출할 때 다음 정보를 포함해 주세요: + +- 명확하고 설명적인 제목 +- 오류 메시지를 포함한 버그에 대한 상세한 설명 +- 버그를 재현하는 단계 +- 예상되는 동작 +- 가능한 경우 **로그**, 백엔드 이슈의 경우 매우 중요합니다. 
docker-compose 로그에서 찾을 수 있습니다 +- 해당되는 경우 스크린샷 또는 비디오 + +우선순위 결정 방법: + + | 이슈 유형 | 우선순위 | + | ------------------------------------------------------------ | --------------- | + | 핵심 기능의 버그(클라우드 서비스, 로그인 불가, 애플리케이션 작동 불능, 보안 취약점) | 중대 | + | 비중요 버그, 성능 향상 | 중간 우선순위 | + | 사소한 수정(오타, 혼란스럽지만 작동하는 UI) | 낮은 우선순위 | + +### 기능 요청 + +> [!NOTE] +> 기능 요청을 제출할 때 다음 정보를 포함해 주세요: + +- 명확하고 설명적인 제목 +- 기능에 대한 상세한 설명 +- 해당 기능의 사용 사례 +- 기능 요청에 관한 기타 컨텍스트 또는 스크린샷 + +우선순위 결정 방법: + + | 기능 유형 | 우선순위 | + | ------------------------------------------------------------ | --------------- | + | 팀 구성원에 의해 레이블이 지정된 고우선순위 기능 | 높은 우선순위 | + | 우리의 [커뮤니티 피드백 보드](https://github.com/langgenius/dify/discussions/categories/feedbacks)에서 인기 있는 기능 요청 | 중간 우선순위 | + | 비핵심 기능 및 사소한 개선 | 낮은 우선순위 | + | 가치 있지만 즉시 필요하지 않은 기능 | 미래 기능 | +## PR 제출하기 + +### Pull Request 프로세스 + +1. 저장소를 포크하세요 +2. PR을 작성하기 전에, 변경하고자 하는 내용에 대해 논의하기 위한 이슈를 생성해 주세요 +3. 변경 사항을 위한 새 브랜치를 만드세요 +4. 변경 사항에 대한 테스트를 적절히 추가해 주세요 +5. 코드가 기존 테스트를 통과하는지 확인하세요 +6. PR 설명에 이슈를 연결해 주세요, `fixes #<이슈_번호>` +7. 병합 완료! +### 프로젝트 설정하기 + +#### 프론트엔드 + +프론트엔드 서비스를 설정하려면, `web/README.md` 파일에 있는 우리의 [종합 가이드](https://github.com/langgenius/dify/blob/main/web/README.md)를 참조하세요. 이 문서는 프론트엔드 환경을 적절히 설정하는 데 도움이 되는 자세한 지침을 제공합니다. + +#### 백엔드 + +백엔드 서비스를 설정하려면, `api/README.md` 파일에 있는 우리의 [상세 지침](https://github.com/langgenius/dify/blob/main/api/README.md)을 참조하세요. 이 문서는 백엔드를 원활하게 실행하는 데 도움이 되는 단계별 가이드를 포함하고 있습니다. + +#### 기타 참고 사항 + +설정을 진행하기 전에 이 문서를 주의 깊게 검토하는 것을 권장합니다. 다음과 같은 필수 정보가 포함되어 있습니다: +- 필수 조건 및 종속성 +- 설치 단계 +- 구성 세부 정보 +- 일반적인 문제 해결 팁 + +설정 과정에서 문제가 발생하면 언제든지 연락해 주세요. +## 도움 받기 + +기여하는 동안 막히거나 긴급한 질문이 있으면, 관련 GitHub 이슈를 통해 질문을 보내거나, 빠른 대화를 위해 우리의 [Discord](https://discord.gg/8Tpq4AcN9c)에 참여하세요. 
\ No newline at end of file diff --git a/CONTRIBUTING_PT.md b/CONTRIBUTING_PT.md new file mode 100644 index 000000000..bba76c17e --- /dev/null +++ b/CONTRIBUTING_PT.md @@ -0,0 +1,93 @@ +# CONTRIBUINDO + +Então você está procurando contribuir para o Dify - isso é incrível, mal podemos esperar para ver o que você vai fazer. Como uma startup com equipe e financiamento limitados, temos grandes ambições de projetar o fluxo de trabalho mais intuitivo para construir e gerenciar aplicações LLM. Qualquer ajuda da comunidade conta, verdadeiramente. + +Precisamos ser ágeis e entregar rapidamente considerando onde estamos, mas também queremos garantir que colaboradores como você tenham uma experiência o mais tranquila possível ao contribuir. Montamos este guia de contribuição com esse propósito, visando familiarizá-lo com a base de código e como trabalhamos com os colaboradores, para que você possa rapidamente passar para a parte divertida. + +Este guia, como o próprio Dify, é um trabalho em constante evolução. Agradecemos muito a sua compreensão se às vezes ele ficar atrasado em relação ao projeto real, e damos as boas-vindas a qualquer feedback para que possamos melhorar. + +Em termos de licenciamento, por favor, dedique um minuto para ler nosso breve [Acordo de Licença e Contribuidor](./LICENSE). A comunidade também adere ao [código de conduta](https://github.com/langgenius/.github/blob/main/CODE_OF_CONDUCT.md). + +## Antes de começar + +Procurando algo para resolver? Navegue por nossos [problemas para iniciantes](https://github.com/langgenius/dify/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22good%20first%20issue%22) e escolha um para começar! + +Tem um novo modelo ou ferramenta para adicionar? Abra um PR em nosso [repositório de plugins](https://github.com/langgenius/dify-plugins) e mostre-nos o que você construiu. + +Precisa atualizar um modelo existente, ferramenta ou corrigir alguns bugs? 
Vá para nosso [repositório oficial de plugins](https://github.com/langgenius/dify-official-plugins) e faça sua mágica! + +Junte-se à diversão, contribua e vamos construir algo incrível juntos! 💡✨ + +Não se esqueça de vincular um problema existente ou abrir um novo problema na descrição do PR. + +### Relatórios de bugs + +> [!IMPORTANT] +> Por favor, certifique-se de incluir as seguintes informações ao enviar um relatório de bug: + +- Um título claro e descritivo +- Uma descrição detalhada do bug, incluindo quaisquer mensagens de erro +- Passos para reproduzir o bug +- Comportamento esperado +- **Logs**, se disponíveis, para problemas de backend, isso é realmente importante, você pode encontrá-los nos logs do docker-compose +- Capturas de tela ou vídeos, se aplicável + +Como priorizamos: + + | Tipo de Problema | Prioridade | + | ------------------------------------------------------------ | --------------- | + | Bugs em funções centrais (serviço em nuvem, não conseguir fazer login, aplicações não funcionando, falhas de segurança) | Crítica | + | Bugs não críticos, melhorias de desempenho | Prioridade Média | + | Correções menores (erros de digitação, interface confusa mas funcional) | Prioridade Baixa | + +### Solicitações de recursos + +> [!NOTE] +> Por favor, certifique-se de incluir as seguintes informações ao enviar uma solicitação de recurso: + +- Um título claro e descritivo +- Uma descrição detalhada do recurso +- Um caso de uso para o recurso +- Qualquer outro contexto ou capturas de tela sobre a solicitação de recurso + +Como priorizamos: + + | Tipo de Recurso | Prioridade | + | ------------------------------------------------------------ | --------------- | + | Recursos de alta prioridade conforme rotulado por um membro da equipe | Prioridade Alta | + | Solicitações populares de recursos do nosso [quadro de feedback da comunidade](https://github.com/langgenius/dify/discussions/categories/feedbacks) | Prioridade Média | + | Recursos não essenciais e 
melhorias menores | Prioridade Baixa | + | Valiosos mas não imediatos | Recurso Futuro | +## Enviando seu PR + +### Processo de Pull Request + +1. Faça um fork do repositório +2. Antes de elaborar um PR, por favor crie um problema para discutir as mudanças que você quer fazer +3. Crie um novo branch para suas alterações +4. Por favor, adicione testes para suas alterações conforme apropriado +5. Certifique-se de que seu código passa nos testes existentes +6. Por favor, vincule o problema na descrição do PR, `fixes #<número_do_problema>` +7. Faça o merge do seu código! +### Configurando o projeto + +#### Frontend + +Para configurar o serviço frontend, por favor consulte nosso [guia abrangente](https://github.com/langgenius/dify/blob/main/web/README.md) no arquivo `web/README.md`. Este documento fornece instruções detalhadas para ajudá-lo a configurar o ambiente frontend adequadamente. + +#### Backend + +Para configurar o serviço backend, por favor consulte nossas [instruções detalhadas](https://github.com/langgenius/dify/blob/main/api/README.md) no arquivo `api/README.md`. Este documento contém um guia passo a passo para ajudá-lo a colocar o backend em funcionamento sem problemas. + +#### Outras coisas a observar + +Recomendamos revisar este documento cuidadosamente antes de prosseguir com a configuração, pois ele contém informações essenciais sobre: +- Pré-requisitos e dependências +- Etapas de instalação +- Detalhes de configuração +- Dicas comuns de solução de problemas + +Sinta-se à vontade para entrar em contato se encontrar quaisquer problemas durante o processo de configuração. +## Obtendo Ajuda + +Se você ficar preso ou tiver uma dúvida urgente enquanto contribui, simplesmente envie suas perguntas através do problema relacionado no GitHub, ou entre no nosso [Discord](https://discord.gg/8Tpq4AcN9c) para uma conversa rápida.
\ No newline at end of file diff --git a/CONTRIBUTING_TR.md b/CONTRIBUTING_TR.md new file mode 100644 index 000000000..4e216d22a --- /dev/null +++ b/CONTRIBUTING_TR.md @@ -0,0 +1,93 @@ +# KATKIDA BULUNMAK + +Demek Dify'a katkıda bulunmak istiyorsunuz - bu harika, ne yapacağınızı görmek için sabırsızlanıyoruz. Sınırlı personel ve finansmana sahip bir startup olarak, LLM uygulamaları oluşturmak ve yönetmek için en sezgisel iş akışını tasarlama konusunda büyük hedeflerimiz var. Topluluktan gelen her türlü yardım gerçekten önemli. + +Bulunduğumuz noktada çevik olmamız ve hızlı hareket etmemiz gerekiyor, ancak sizin gibi katkıda bulunanların mümkün olduğunca sorunsuz bir deneyim yaşamasını da sağlamak istiyoruz. Bu katkı rehberini bu amaçla hazırladık; sizi kod tabanıyla ve katkıda bulunanlarla nasıl çalıştığımızla tanıştırmayı, böylece hızlıca eğlenceli kısma geçebilmenizi hedefliyoruz. + +Bu rehber, Dify'ın kendisi gibi, sürekli gelişen bir çalışmadır. Bazen gerçek projenin gerisinde kalırsa anlayışınız için çok minnettarız ve gelişmemize yardımcı olacak her türlü geri bildirimi memnuniyetle karşılıyoruz. + +Lisanslama konusunda, lütfen kısa [Lisans ve Katkıda Bulunan Anlaşmamızı](./LICENSE) okumak için bir dakikanızı ayırın. Topluluk ayrıca [davranış kurallarına](https://github.com/langgenius/.github/blob/main/CODE_OF_CONDUCT.md) da uyar. + +## Başlamadan Önce + +Üzerinde çalışacak bir şey mi arıyorsunuz? [İlk katkıda bulunanlar için iyi sorunlarımıza](https://github.com/langgenius/dify/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22good%20first%20issue%22) göz atın ve başlamak için birini seçin! + +Eklenecek harika bir yeni model runtime'ı veya aracınız mı var? [Eklenti depomuzda](https://github.com/langgenius/dify-plugins) bir PR açın ve ne yaptığınızı bize gösterin. + +Mevcut bir model runtime'ını, aracı güncellemek veya bazı hataları düzeltmek mi istiyorsunuz? 
[Resmi eklenti depomuza](https://github.com/langgenius/dify-official-plugins) gidin ve sihrinizi gösterin! + +Eğlenceye katılın, katkıda bulunun ve birlikte harika bir şeyler inşa edelim! 💡✨ + +PR açıklamasında mevcut bir sorunu bağlamayı veya yeni bir sorun açmayı unutmayın. + +### Hata Raporları + +> [!IMPORTANT] +> Lütfen bir hata raporu gönderirken aşağıdaki bilgileri dahil ettiğinizden emin olun: + +- Net ve açıklayıcı bir başlık +- Hata mesajları dahil hatanın ayrıntılı bir açıklaması +- Hatayı tekrarlamak için adımlar +- Beklenen davranış +- Mümkünse **Loglar**, backend sorunları için, bu gerçekten önemlidir, bunları docker-compose loglarında bulabilirsiniz +- Uygunsa ekran görüntüleri veya videolar + +Nasıl önceliklendiriyoruz: + + | Sorun Türü | Öncelik | + | ------------------------------------------------------------ | --------------- | + | Temel işlevlerdeki hatalar (bulut hizmeti, giriş yapamama, çalışmayan uygulamalar, güvenlik açıkları) | Kritik | + | Kritik olmayan hatalar, performans artışları | Orta Öncelik | + | Küçük düzeltmeler (yazım hataları, kafa karıştırıcı ama çalışan UI) | Düşük Öncelik | + +### Özellik İstekleri + +> [!NOTE] +> Lütfen bir özellik isteği gönderirken aşağıdaki bilgileri dahil ettiğinizden emin olun: + +- Net ve açıklayıcı bir başlık +- Özelliğin ayrıntılı bir açıklaması +- Özellik için bir kullanım durumu +- Özellik isteği hakkında diğer bağlamlar veya ekran görüntüleri + +Nasıl önceliklendiriyoruz: + + | Özellik Türü | Öncelik | + | ------------------------------------------------------------ | --------------- | + | Bir ekip üyesi tarafından etiketlenen Yüksek Öncelikli Özellikler | Yüksek Öncelik | + | [Topluluk geri bildirim panosundan](https://github.com/langgenius/dify/discussions/categories/feedbacks) popüler özellik istekleri | Orta Öncelik | + | Temel olmayan özellikler ve küçük geliştirmeler | Düşük Öncelik | + | Değerli ama acil olmayan | Gelecek-Özellik | +## PR'nizi Göndermek + +### Pull Request Süreci + +1. 
Depoyu fork edin +2. Bir PR taslağı oluşturmadan önce, yapmak istediğiniz değişiklikleri tartışmak için lütfen bir sorun oluşturun +3. Değişiklikleriniz için yeni bir dal oluşturun +4. Lütfen değişiklikleriniz için uygun testler ekleyin +5. Kodunuzun mevcut testleri geçtiğinden emin olun +6. Lütfen PR açıklamasında sorunu bağlayın, `fixes #<sorun_numarası>` +7. Kodunuzu birleştirin! +### Projeyi Kurma + +#### Frontend + +Frontend hizmetini kurmak için, lütfen `web/README.md` dosyasındaki kapsamlı [rehberimize](https://github.com/langgenius/dify/blob/main/web/README.md) bakın. Bu belge, frontend ortamını düzgün bir şekilde kurmanıza yardımcı olacak ayrıntılı talimatlar sağlar. + +#### Backend + +Backend hizmetini kurmak için, lütfen `api/README.md` dosyasındaki detaylı [talimatlarımıza](https://github.com/langgenius/dify/blob/main/api/README.md) bakın. Bu belge, backend'i sorunsuz bir şekilde çalıştırmanıza yardımcı olacak adım adım bir kılavuz içerir. + +#### Dikkat Edilecek Diğer Şeyler + +Kuruluma geçmeden önce bu belgeyi dikkatlice incelemenizi öneririz, çünkü şunlar hakkında temel bilgiler içerir: +- Ön koşullar ve bağımlılıklar +- Kurulum adımları +- Yapılandırma detayları +- Yaygın sorun giderme ipuçları + +Kurulum süreci sırasında herhangi bir sorunla karşılaşırsanız bizimle iletişime geçmekten çekinmeyin. +## Yardım Almak + +Katkıda bulunurken takılırsanız veya acil bir sorunuz olursa, sorularınızı ilgili GitHub sorunu aracılığıyla bize gönderin veya hızlı bir sohbet için [Discord'umuza](https://discord.gg/8Tpq4AcN9c) katılın. \ No newline at end of file diff --git a/LICENSE b/LICENSE index e26bae0ca..329ee3028 100644 --- a/LICENSE +++ b/LICENSE @@ -10,8 +10,6 @@ a. Multi-tenant service: Unless explicitly authorized by Dify in writing, you ma b. LOGO and copyright information: In the process of using Dify's frontend, you may not remove or modify the LOGO or copyright information in the Dify console or applications.
This restriction is inapplicable to uses of Dify that do not involve its frontend. - Frontend Definition: For the purposes of this license, the "frontend" of Dify includes all components located in the `web/` directory when running Dify from the raw source code, or the "web" image when running Dify with Docker. -Please contact business@dify.ai by email to inquire about licensing matters. - 2. As a contributor, you should agree that: a. The producer can adjust the open-source agreement to be more strict or relaxed as deemed necessary. diff --git a/README_CN.md b/README_CN.md index 33e34423f..6d3c60110 100644 --- a/README_CN.md +++ b/README_CN.md @@ -254,8 +254,6 @@ docker compose up -d - [Discord](https://discord.gg/FngNHpbcY7)。👉:分享您的应用程序并与社区交流。 - [X(Twitter)](https://twitter.com/dify_ai)。👉:分享您的应用程序并与社区交流。 - [商业许可](mailto:business@dify.ai?subject=[GitHub]Business%20License%20Inquiry)。👉:有关商业用途许可 Dify.AI 的商业咨询。 - - [微信]() 👉:扫描下方二维码,添加微信好友,备注 Dify,我们将邀请您加入 Dify 社区。 -wechat ## 安全问题 diff --git a/api/.env.example b/api/.env.example index a2b23e71c..e9d34d4c4 100644 --- a/api/.env.example +++ b/api/.env.example @@ -137,7 +137,7 @@ WEB_API_CORS_ALLOW_ORIGINS=http://127.0.0.1:3000,* CONSOLE_CORS_ALLOW_ORIGINS=http://127.0.0.1:3000,* # Vector database configuration -# support: weaviate, qdrant, milvus, myscale, relyt, pgvecto_rs, pgvector, pgvector, chroma, opensearch, tidb_vector, couchbase, vikingdb, upstash, lindorm, oceanbase, opengauss +# support: weaviate, qdrant, milvus, myscale, relyt, pgvecto_rs, pgvector, pgvector, chroma, opensearch, tidb_vector, couchbase, vikingdb, upstash, lindorm, oceanbase, opengauss, tablestore VECTOR_STORE=weaviate # Weaviate configuration @@ -189,6 +189,7 @@ TENCENT_VECTOR_DB_USERNAME=dify TENCENT_VECTOR_DB_DATABASE=dify TENCENT_VECTOR_DB_SHARD=1 TENCENT_VECTOR_DB_REPLICAS=2 +TENCENT_VECTOR_DB_ENABLE_HYBRID_SEARCH=false # ElasticSearch configuration ELASTICSEARCH_HOST=127.0.0.1 @@ -212,6 +213,12 @@ PGVECTOR_DATABASE=postgres 
PGVECTOR_MIN_CONNECTION=1 PGVECTOR_MAX_CONNECTION=5 +# TableStore Vector configuration +TABLESTORE_ENDPOINT=https://instance-name.cn-hangzhou.ots.aliyuncs.com +TABLESTORE_INSTANCE_NAME=instance-name +TABLESTORE_ACCESS_KEY_ID=xxx +TABLESTORE_ACCESS_KEY_SECRET=xxx + # Tidb Vector configuration TIDB_VECTOR_HOST=xxx.eu-central-1.xxx.aws.tidbcloud.com TIDB_VECTOR_PORT=4000 @@ -297,6 +304,7 @@ OCEANBASE_VECTOR_USER=root@test OCEANBASE_VECTOR_PASSWORD=difyai123456 OCEANBASE_VECTOR_DATABASE=test OCEANBASE_MEMORY_LIMIT=6G +OCEANBASE_ENABLE_HYBRID_SEARCH=false # openGauss configuration OPENGAUSS_HOST=127.0.0.1 diff --git a/api/commands.py b/api/commands.py index ea82fef39..e70d6e0b4 100644 --- a/api/commands.py +++ b/api/commands.py @@ -276,6 +276,7 @@ def migrate_knowledge_vector_database(): VectorType.ORACLE, VectorType.ELASTICSEARCH, VectorType.OPENGAUSS, + VectorType.TABLESTORE, } lower_collection_vector_types = { VectorType.ANALYTICDB, diff --git a/api/configs/feature/__init__.py b/api/configs/feature/__init__.py index 46ded0244..fa8e8c2bf 100644 --- a/api/configs/feature/__init__.py +++ b/api/configs/feature/__init__.py @@ -848,6 +848,11 @@ class AccountConfig(BaseSettings): default=5, ) + EDUCATION_ENABLED: bool = Field( + description="whether to enable education identity", + default=False, + ) + class FeatureConfig( # place the configs in alphabet order diff --git a/api/configs/feature/hosted_service/__init__.py b/api/configs/feature/hosted_service/__init__.py index 71d06f462..18ef1ed45 100644 --- a/api/configs/feature/hosted_service/__init__.py +++ b/api/configs/feature/hosted_service/__init__.py @@ -1,6 +1,6 @@ from typing import Optional -from pydantic import Field, NonNegativeInt, computed_field +from pydantic import Field, NonNegativeInt from pydantic_settings import BaseSettings diff --git a/api/configs/middleware/__init__.py b/api/configs/middleware/__init__.py index 3bd638bc7..15dfe0063 100644 --- a/api/configs/middleware/__init__.py +++ 
b/api/configs/middleware/__init__.py @@ -33,6 +33,7 @@ from .vdb.pgvector_config import PGVectorConfig from .vdb.pgvectors_config import PGVectoRSConfig from .vdb.qdrant_config import QdrantConfig from .vdb.relyt_config import RelytConfig +from .vdb.tablestore_config import TableStoreConfig from .vdb.tencent_vector_config import TencentVectorDBConfig from .vdb.tidb_on_qdrant_config import TidbOnQdrantConfig from .vdb.tidb_vector_config import TiDBVectorConfig @@ -283,5 +284,6 @@ class MiddlewareConfig( OceanBaseVectorConfig, BaiduVectorDBConfig, OpenGaussConfig, + TableStoreConfig, ): pass diff --git a/api/configs/middleware/vdb/oceanbase_config.py b/api/configs/middleware/vdb/oceanbase_config.py index 87427af96..9b11a2273 100644 --- a/api/configs/middleware/vdb/oceanbase_config.py +++ b/api/configs/middleware/vdb/oceanbase_config.py @@ -33,3 +33,9 @@ class OceanBaseVectorConfig(BaseSettings): description="Name of the OceanBase Vector database to connect to", default=None, ) + + OCEANBASE_ENABLE_HYBRID_SEARCH: bool = Field( + description="Enable hybrid search features (requires OceanBase >= 4.3.5.1). Set to false for compatibility " + "with older versions", + default=False, + ) diff --git a/api/configs/middleware/vdb/tablestore_config.py b/api/configs/middleware/vdb/tablestore_config.py new file mode 100644 index 000000000..c4dcc0d46 --- /dev/null +++ b/api/configs/middleware/vdb/tablestore_config.py @@ -0,0 +1,30 @@ +from typing import Optional + +from pydantic import Field +from pydantic_settings import BaseSettings + + +class TableStoreConfig(BaseSettings): + """ + Configuration settings for TableStore. + """ + + TABLESTORE_ENDPOINT: Optional[str] = Field( + description="Endpoint address of the TableStore server (e.g. 'https://instance-name.cn-hangzhou.ots.aliyuncs.com')", + default=None, + ) + + TABLESTORE_INSTANCE_NAME: Optional[str] = Field( + description="Instance name to access TableStore server (eg. 
'instance-name')", + default=None, + ) + + TABLESTORE_ACCESS_KEY_ID: Optional[str] = Field( + description="AccessKey id for the instance name", + default=None, + ) + + TABLESTORE_ACCESS_KEY_SECRET: Optional[str] = Field( + description="AccessKey secret for the instance name", + default=None, + ) diff --git a/api/configs/middleware/vdb/tencent_vector_config.py b/api/configs/middleware/vdb/tencent_vector_config.py index 9cf4d07f6..a51823c3f 100644 --- a/api/configs/middleware/vdb/tencent_vector_config.py +++ b/api/configs/middleware/vdb/tencent_vector_config.py @@ -48,3 +48,8 @@ class TencentVectorDBConfig(BaseSettings): description="Name of the specific Tencent Vector Database to connect to", default=None, ) + + TENCENT_VECTOR_DB_ENABLE_HYBRID_SEARCH: bool = Field( + description="Enable hybrid search features", + default=False, + ) diff --git a/api/configs/packaging/__init__.py b/api/configs/packaging/__init__.py index 0ef5a724b..c7aedc5b8 100644 --- a/api/configs/packaging/__init__.py +++ b/api/configs/packaging/__init__.py @@ -9,7 +9,7 @@ class PackagingInfo(BaseSettings): CURRENT_VERSION: str = Field( description="Dify version", - default="1.1.3", + default="1.2.0", ) COMMIT_SHA: str = Field( diff --git a/api/configs/remote_settings_sources/__init__.py b/api/configs/remote_settings_sources/__init__.py index 4f3878d13..be5d5d896 100644 --- a/api/configs/remote_settings_sources/__init__.py +++ b/api/configs/remote_settings_sources/__init__.py @@ -1,5 +1,3 @@ -from typing import Optional - from pydantic import Field from .apollo import ApolloSettingsSourceInfo diff --git a/api/controllers/console/app/app_import.py b/api/controllers/console/app/app_import.py index 47acb47a2..a159d4c5c 100644 --- a/api/controllers/console/app/app_import.py +++ b/api/controllers/console/app/app_import.py @@ -8,6 +8,7 @@ from werkzeug.exceptions import Forbidden from controllers.console.app.wraps import get_app_model from controllers.console.wraps import ( 
account_initialization_required, + cloud_edition_billing_resource_check, setup_required, ) from extensions.ext_database import db @@ -23,6 +24,7 @@ class AppImportApi(Resource): @login_required @account_initialization_required @marshal_with(app_import_fields) + @cloud_edition_billing_resource_check("apps") def post(self): # Check user role first if not current_user.is_editor: diff --git a/api/controllers/console/auth/forgot_password.py b/api/controllers/console/auth/forgot_password.py index 773ee6572..dc0009f36 100644 --- a/api/controllers/console/auth/forgot_password.py +++ b/api/controllers/console/auth/forgot_password.py @@ -99,53 +99,64 @@ class ForgotPasswordResetApi(Resource): parser.add_argument("password_confirm", type=valid_password, required=True, nullable=False, location="json") args = parser.parse_args() - new_password = args["new_password"] - password_confirm = args["password_confirm"] - - if str(new_password).strip() != str(password_confirm).strip(): + # Validate passwords match + if args["new_password"] != args["password_confirm"]: raise PasswordMismatchError() - token = args["token"] - reset_data = AccountService.get_reset_password_data(token) - - if reset_data is None: + # Validate token and get reset data + reset_data = AccountService.get_reset_password_data(args["token"]) + if not reset_data: raise InvalidTokenError() - AccountService.revoke_reset_password_token(token) + # Revoke token to prevent reuse + AccountService.revoke_reset_password_token(args["token"]) + # Generate secure salt and hash password salt = secrets.token_bytes(16) - base64_salt = base64.b64encode(salt).decode() + password_hashed = hash_password(args["new_password"], salt) - password_hashed = hash_password(new_password, salt) - base64_password_hashed = base64.b64encode(password_hashed).decode() + email = reset_data.get("email", "") with Session(db.engine) as session: - account = session.execute(select(Account).filter_by(email=reset_data.get("email"))).scalar_one_or_none() - if 
account: - account.password = base64_password_hashed - account.password_salt = base64_salt - db.session.commit() - tenant = TenantService.get_join_tenants(account) - if not tenant and not FeatureService.get_system_features().is_allow_create_workspace: - tenant = TenantService.create_tenant(f"{account.name}'s Workspace") - TenantService.create_tenant_member(tenant, account, role="owner") - account.current_tenant = tenant - tenant_was_created.send(tenant) - else: - try: - account = AccountService.create_account_and_tenant( - email=reset_data.get("email", ""), - name=reset_data.get("email", ""), - password=password_confirm, - interface_language=languages[0], - ) - except WorkSpaceNotAllowedCreateError: - pass - except AccountRegisterError: - raise AccountInFreezeError() + account = session.execute(select(Account).filter_by(email=email)).scalar_one_or_none() + + if account: + self._update_existing_account(account, password_hashed, salt, session) + else: + self._create_new_account(email, args["password_confirm"]) return {"result": "success"} + def _update_existing_account(self, account, password_hashed, salt, session): + # Update existing account credentials + account.password = base64.b64encode(password_hashed).decode() + account.password_salt = base64.b64encode(salt).decode() + session.commit() + + # Create workspace if needed + if ( + not TenantService.get_join_tenants(account) + and FeatureService.get_system_features().is_allow_create_workspace + ): + tenant = TenantService.create_tenant(f"{account.name}'s Workspace") + TenantService.create_tenant_member(tenant, account, role="owner") + account.current_tenant = tenant + tenant_was_created.send(tenant) + + def _create_new_account(self, email, password): + # Create new account if allowed + try: + AccountService.create_account_and_tenant( + email=email, + name=email, + password=password, + interface_language=languages[0], + ) + except WorkSpaceNotAllowedCreateError: + pass + except AccountRegisterError: + raise 
AccountInFreezeError() + api.add_resource(ForgotPasswordSendEmailApi, "/forgot-password") api.add_resource(ForgotPasswordCheckApi, "/forgot-password/validity") diff --git a/api/controllers/console/datasets/datasets.py b/api/controllers/console/datasets/datasets.py index fed304f72..4644ac629 100644 --- a/api/controllers/console/datasets/datasets.py +++ b/api/controllers/console/datasets/datasets.py @@ -641,12 +641,10 @@ class DatasetRetrievalSettingApi(Resource): VectorType.RELYT | VectorType.TIDB_VECTOR | VectorType.CHROMA - | VectorType.TENCENT | VectorType.PGVECTO_RS | VectorType.BAIDU | VectorType.VIKINGDB | VectorType.UPSTASH - | VectorType.OCEANBASE ): return {"retrieval_method": [RetrievalMethod.SEMANTIC_SEARCH.value]} case ( @@ -664,6 +662,9 @@ class DatasetRetrievalSettingApi(Resource): | VectorType.COUCHBASE | VectorType.MILVUS | VectorType.OPENGAUSS + | VectorType.OCEANBASE + | VectorType.TABLESTORE + | VectorType.TENCENT ): return { "retrieval_method": [ @@ -687,12 +688,10 @@ class DatasetRetrievalSettingMockApi(Resource): | VectorType.RELYT | VectorType.TIDB_VECTOR | VectorType.CHROMA - | VectorType.TENCENT | VectorType.PGVECTO_RS | VectorType.BAIDU | VectorType.VIKINGDB | VectorType.UPSTASH - | VectorType.OCEANBASE ): return {"retrieval_method": [RetrievalMethod.SEMANTIC_SEARCH.value]} case ( @@ -708,6 +707,9 @@ class DatasetRetrievalSettingMockApi(Resource): | VectorType.PGVECTOR | VectorType.LINDORM | VectorType.OPENGAUSS + | VectorType.OCEANBASE + | VectorType.TABLESTORE + | VectorType.TENCENT ): return { "retrieval_method": [ diff --git a/api/controllers/console/datasets/website.py b/api/controllers/console/datasets/website.py index da995537e..33c926b4c 100644 --- a/api/controllers/console/datasets/website.py +++ b/api/controllers/console/datasets/website.py @@ -14,7 +14,12 @@ class WebsiteCrawlApi(Resource): def post(self): parser = reqparse.RequestParser() parser.add_argument( - "provider", type=str, choices=["firecrawl", "jinareader"], 
required=True, nullable=True, location="json" + "provider", + type=str, + choices=["firecrawl", "watercrawl", "jinareader"], + required=True, + nullable=True, + location="json", ) parser.add_argument("url", type=str, required=True, nullable=True, location="json") parser.add_argument("options", type=dict, required=True, nullable=True, location="json") @@ -34,7 +39,9 @@ class WebsiteCrawlStatusApi(Resource): @account_initialization_required def get(self, job_id: str): parser = reqparse.RequestParser() - parser.add_argument("provider", type=str, choices=["firecrawl", "jinareader"], required=True, location="args") + parser.add_argument( + "provider", type=str, choices=["firecrawl", "watercrawl", "jinareader"], required=True, location="args" + ) args = parser.parse_args() # get crawl status try: diff --git a/api/controllers/console/error.py b/api/controllers/console/error.py index bd4ae9dc7..b8fd1f035 100644 --- a/api/controllers/console/error.py +++ b/api/controllers/console/error.py @@ -103,6 +103,18 @@ class AccountInFreezeError(BaseHTTPException): ) +class EducationVerifyLimitError(BaseHTTPException): + error_code = "education_verify_limit" + description = "Rate limit exceeded" + code = 429 + + +class EducationActivateLimitError(BaseHTTPException): + error_code = "education_activate_limit" + description = "Rate limit exceeded" + code = 429 + + class CompilanceRateLimitError(BaseHTTPException): error_code = "compilance_rate_limit" description = "Rate limit exceeded for downloading compliance report." 
diff --git a/api/controllers/console/workspace/account.py b/api/controllers/console/workspace/account.py index f1ec0f3d2..d2cc14048 100644 --- a/api/controllers/console/workspace/account.py +++ b/api/controllers/console/workspace/account.py @@ -15,7 +15,13 @@ from controllers.console.workspace.error import ( InvalidInvitationCodeError, RepeatPasswordNotMatchError, ) -from controllers.console.wraps import account_initialization_required, enterprise_license_required, setup_required +from controllers.console.wraps import ( + account_initialization_required, + cloud_edition_billing_enabled, + enterprise_license_required, + only_edition_cloud, + setup_required, +) from extensions.ext_database import db from fields.member_fields import account_fields from libs.helper import TimestampField, timezone @@ -292,6 +298,79 @@ class AccountDeleteUpdateFeedbackApi(Resource): return {"result": "success"} +class EducationVerifyApi(Resource): + verify_fields = { + "token": fields.String, + } + + @setup_required + @login_required + @account_initialization_required + @only_edition_cloud + @cloud_edition_billing_enabled + @marshal_with(verify_fields) + def get(self): + account = current_user + + return BillingService.EducationIdentity.verify(account.id, account.email) + + +class EducationApi(Resource): + status_fields = { + "result": fields.Boolean, + } + + @setup_required + @login_required + @account_initialization_required + @only_edition_cloud + @cloud_edition_billing_enabled + def post(self): + account = current_user + + parser = reqparse.RequestParser() + parser.add_argument("token", type=str, required=True, location="json") + parser.add_argument("institution", type=str, required=True, location="json") + parser.add_argument("role", type=str, required=True, location="json") + args = parser.parse_args() + + return BillingService.EducationIdentity.activate(account, args["token"], args["institution"], args["role"]) + + @setup_required + @login_required + 
@account_initialization_required + @only_edition_cloud + @cloud_edition_billing_enabled + @marshal_with(status_fields) + def get(self): + account = current_user + + return BillingService.EducationIdentity.is_active(account.id) + + +class EducationAutoCompleteApi(Resource): + data_fields = { + "data": fields.List(fields.String), + "curr_page": fields.Integer, + "has_next": fields.Boolean, + } + + @setup_required + @login_required + @account_initialization_required + @only_edition_cloud + @cloud_edition_billing_enabled + @marshal_with(data_fields) + def get(self): + parser = reqparse.RequestParser() + parser.add_argument("keywords", type=str, required=True, location="args") + parser.add_argument("page", type=int, required=False, location="args", default=0) + parser.add_argument("limit", type=int, required=False, location="args", default=20) + args = parser.parse_args() + + return BillingService.EducationIdentity.autocomplete(args["keywords"], args["page"], args["limit"]) + + # Register API resources api.add_resource(AccountInitApi, "/account/init") api.add_resource(AccountProfileApi, "/account/profile") @@ -305,5 +384,8 @@ api.add_resource(AccountIntegrateApi, "/account/integrates") api.add_resource(AccountDeleteVerifyApi, "/account/delete/verify") api.add_resource(AccountDeleteApi, "/account/delete") api.add_resource(AccountDeleteUpdateFeedbackApi, "/account/delete/feedback") +api.add_resource(EducationVerifyApi, "/account/education/verify") +api.add_resource(EducationApi, "/account/education") +api.add_resource(EducationAutoCompleteApi, "/account/education/autocomplete") # api.add_resource(AccountEmailApi, '/account/email') # api.add_resource(AccountEmailVerifyApi, '/account/email-verify') diff --git a/api/controllers/console/workspace/plugin.py b/api/controllers/console/workspace/plugin.py index f4c32ede2..3700f007f 100644 --- a/api/controllers/console/workspace/plugin.py +++ b/api/controllers/console/workspace/plugin.py @@ -236,7 +236,7 @@ class 
PluginFetchManifestApi(Resource): @setup_required @login_required @account_initialization_required - @plugin_permission_required(debug_required=True) + @plugin_permission_required(install_required=True) def get(self): tenant_id = current_user.current_tenant_id @@ -260,7 +260,7 @@ class PluginFetchInstallTasksApi(Resource): @setup_required @login_required @account_initialization_required - @plugin_permission_required(debug_required=True) + @plugin_permission_required(install_required=True) def get(self): tenant_id = current_user.current_tenant_id @@ -281,7 +281,7 @@ class PluginFetchInstallTaskApi(Resource): @setup_required @login_required @account_initialization_required - @plugin_permission_required(debug_required=True) + @plugin_permission_required(install_required=True) def get(self, task_id: str): tenant_id = current_user.current_tenant_id @@ -295,7 +295,7 @@ class PluginDeleteInstallTaskApi(Resource): @setup_required @login_required @account_initialization_required - @plugin_permission_required(debug_required=True) + @plugin_permission_required(install_required=True) def post(self, task_id: str): tenant_id = current_user.current_tenant_id @@ -309,7 +309,7 @@ class PluginDeleteAllInstallTaskItemsApi(Resource): @setup_required @login_required @account_initialization_required - @plugin_permission_required(debug_required=True) + @plugin_permission_required(install_required=True) def post(self): tenant_id = current_user.current_tenant_id @@ -323,7 +323,7 @@ class PluginDeleteInstallTaskItemApi(Resource): @setup_required @login_required @account_initialization_required - @plugin_permission_required(debug_required=True) + @plugin_permission_required(install_required=True) def post(self, task_id: str, identifier: str): tenant_id = current_user.current_tenant_id @@ -337,7 +337,7 @@ class PluginUpgradeFromMarketplaceApi(Resource): @setup_required @login_required @account_initialization_required - @plugin_permission_required(debug_required=True) + 
@plugin_permission_required(install_required=True) def post(self): tenant_id = current_user.current_tenant_id @@ -360,7 +360,7 @@ class PluginUpgradeFromGithubApi(Resource): @setup_required @login_required @account_initialization_required - @plugin_permission_required(debug_required=True) + @plugin_permission_required(install_required=True) def post(self): tenant_id = current_user.current_tenant_id @@ -391,7 +391,7 @@ class PluginUninstallApi(Resource): @setup_required @login_required @account_initialization_required - @plugin_permission_required(debug_required=True) + @plugin_permission_required(install_required=True) def post(self): req = reqparse.RequestParser() req.add_argument("plugin_installation_id", type=str, required=True, location="json") diff --git a/api/controllers/console/workspace/workspace.py b/api/controllers/console/workspace/workspace.py index 6f440b047..fa89c8857 100644 --- a/api/controllers/console/workspace/workspace.py +++ b/api/controllers/console/workspace/workspace.py @@ -220,6 +220,23 @@ class WebappLogoWorkspaceApi(Resource): return {"id": upload_file.id}, 201 +class WorkspaceInfoApi(Resource): + @setup_required + @login_required + @account_initialization_required + # Change workspace name + def post(self): + parser = reqparse.RequestParser() + parser.add_argument("name", type=str, required=True, location="json") + args = parser.parse_args() + + tenant = Tenant.query.filter(Tenant.id == current_user.current_tenant_id).one_or_404() + tenant.name = args["name"] + db.session.commit() + + return {"result": "success", "tenant": marshal(WorkspaceService.get_tenant_info(tenant), tenant_fields)} + + api.add_resource(TenantListApi, "/workspaces") # GET for getting all tenants api.add_resource(WorkspaceListApi, "/all-workspaces") # GET for getting all tenants api.add_resource(TenantApi, "/workspaces/current", endpoint="workspaces_current") # GET for getting current tenant info @@ -227,3 +244,4 @@ api.add_resource(TenantApi, "/info", 
endpoint="info") # Deprecated api.add_resource(SwitchWorkspaceApi, "/workspaces/switch") # POST for switching tenant api.add_resource(CustomConfigWorkspaceApi, "/workspaces/custom-config") api.add_resource(WebappLogoWorkspaceApi, "/workspaces/custom-config/webapp-logo/upload") +api.add_resource(WorkspaceInfoApi, "/workspaces/info") # POST for changing workspace info diff --git a/api/controllers/console/wraps.py b/api/controllers/console/wraps.py index ed6e16b03..6caaae87f 100644 --- a/api/controllers/console/wraps.py +++ b/api/controllers/console/wraps.py @@ -54,6 +54,17 @@ def only_edition_self_hosted(view): return decorated +def cloud_edition_billing_enabled(view): + @wraps(view) + def decorated(*args, **kwargs): + features = FeatureService.get_features(current_user.current_tenant_id) + if not features.billing.enabled: + abort(403, "Billing feature is not enabled.") + return view(*args, **kwargs) + + return decorated + + def cloud_edition_billing_resource_check(resource: str): def interceptor(view): @wraps(view) diff --git a/api/controllers/service_api/__init__.py b/api/controllers/service_api/__init__.py index a754851c5..d97074e8b 100644 --- a/api/controllers/service_api/__init__.py +++ b/api/controllers/service_api/__init__.py @@ -6,5 +6,6 @@ bp = Blueprint("service_api", __name__, url_prefix="/v1") api = ExternalApi(bp) from . 
import index -from .app import app, audio, completion, conversation, file, message, workflow +from .app import annotation, app, audio, completion, conversation, file, message, workflow from .dataset import dataset, document, hit_testing, metadata, segment, upload_file +from .workspace import models diff --git a/api/controllers/service_api/app/annotation.py b/api/controllers/service_api/app/annotation.py new file mode 100644 index 000000000..cffa3665b --- /dev/null +++ b/api/controllers/service_api/app/annotation.py @@ -0,0 +1,107 @@ +from flask import request +from flask_restful import Resource, marshal, marshal_with, reqparse # type: ignore +from werkzeug.exceptions import Forbidden + +from controllers.service_api import api +from controllers.service_api.wraps import FetchUserArg, WhereisUserArg, validate_app_token +from extensions.ext_redis import redis_client +from fields.annotation_fields import ( + annotation_fields, +) +from libs.login import current_user +from models.model import App, EndUser +from services.annotation_service import AppAnnotationService + + +class AnnotationReplyActionApi(Resource): + @validate_app_token(fetch_user_arg=FetchUserArg(fetch_from=WhereisUserArg.JSON)) + def post(self, app_model: App, end_user: EndUser, action): + parser = reqparse.RequestParser() + parser.add_argument("score_threshold", required=True, type=float, location="json") + parser.add_argument("embedding_provider_name", required=True, type=str, location="json") + parser.add_argument("embedding_model_name", required=True, type=str, location="json") + args = parser.parse_args() + if action == "enable": + result = AppAnnotationService.enable_app_annotation(args, app_model.id) + elif action == "disable": + result = AppAnnotationService.disable_app_annotation(app_model.id) + else: + raise ValueError("Unsupported annotation reply action") + return result, 200 + + +class AnnotationReplyActionStatusApi(Resource): + 
@validate_app_token(fetch_user_arg=FetchUserArg(fetch_from=WhereisUserArg.QUERY)) + def get(self, app_model: App, end_user: EndUser, job_id, action): + job_id = str(job_id) + app_annotation_job_key = "{}_app_annotation_job_{}".format(action, str(job_id)) + cache_result = redis_client.get(app_annotation_job_key) + if cache_result is None: + raise ValueError("The job does not exist.") + + job_status = cache_result.decode() + error_msg = "" + if job_status == "error": + app_annotation_error_key = "{}_app_annotation_error_{}".format(action, str(job_id)) + error_msg = redis_client.get(app_annotation_error_key).decode() + + return {"job_id": job_id, "job_status": job_status, "error_msg": error_msg}, 200 + + +class AnnotationListApi(Resource): + @validate_app_token(fetch_user_arg=FetchUserArg(fetch_from=WhereisUserArg.QUERY)) + def get(self, app_model: App, end_user: EndUser): + page = request.args.get("page", default=1, type=int) + limit = request.args.get("limit", default=20, type=int) + keyword = request.args.get("keyword", default="", type=str) + + annotation_list, total = AppAnnotationService.get_annotation_list_by_app_id(app_model.id, page, limit, keyword) + response = { + "data": marshal(annotation_list, annotation_fields), + "has_more": len(annotation_list) == limit, + "limit": limit, + "total": total, + "page": page, + } + return response, 200 + + @validate_app_token(fetch_user_arg=FetchUserArg(fetch_from=WhereisUserArg.JSON)) + @marshal_with(annotation_fields) + def post(self, app_model: App, end_user: EndUser): + parser = reqparse.RequestParser() + parser.add_argument("question", required=True, type=str, location="json") + parser.add_argument("answer", required=True, type=str, location="json") + args = parser.parse_args() + annotation = AppAnnotationService.insert_app_annotation_directly(args, app_model.id) + return annotation + + +class AnnotationUpdateDeleteApi(Resource): + @validate_app_token(fetch_user_arg=FetchUserArg(fetch_from=WhereisUserArg.JSON)) + 
@marshal_with(annotation_fields) + def post(self, app_model: App, end_user: EndUser, annotation_id): + if not current_user.is_editor: + raise Forbidden() + + annotation_id = str(annotation_id) + parser = reqparse.RequestParser() + parser.add_argument("question", required=True, type=str, location="json") + parser.add_argument("answer", required=True, type=str, location="json") + args = parser.parse_args() + annotation = AppAnnotationService.update_app_annotation_directly(args, app_model.id, annotation_id) + return annotation + + @validate_app_token(fetch_user_arg=FetchUserArg(fetch_from=WhereisUserArg.QUERY)) + def delete(self, app_model: App, end_user: EndUser, annotation_id): + if not current_user.is_editor: + raise Forbidden() + + annotation_id = str(annotation_id) + AppAnnotationService.delete_app_annotation(app_model.id, annotation_id) + return {"result": "success"}, 200 + + +api.add_resource(AnnotationReplyActionApi, "/apps/annotation-reply/") +api.add_resource(AnnotationReplyActionStatusApi, "/apps/annotation-reply//status/") +api.add_resource(AnnotationListApi, "/apps/annotations") +api.add_resource(AnnotationUpdateDeleteApi, "/apps/annotations/") diff --git a/api/controllers/service_api/app/message.py b/api/controllers/service_api/app/message.py index c87fa7867..c0db1b95b 100644 --- a/api/controllers/service_api/app/message.py +++ b/api/controllers/service_api/app/message.py @@ -1,3 +1,4 @@ +import json import logging from flask_restful import Resource, fields, marshal_with, reqparse # type: ignore @@ -10,7 +11,7 @@ from controllers.service_api.app.error import NotChatAppError from controllers.service_api.wraps import FetchUserArg, WhereisUserArg, validate_app_token from core.app.entities.app_invoke_entities import InvokeFrom from fields.conversation_fields import message_file_fields -from fields.message_fields import agent_thought_fields, feedback_fields, retriever_resource_fields +from fields.message_fields import agent_thought_fields, feedback_fields 
from fields.raws import FilesContainedField from libs.helper import TimestampField, uuid_value from models.model import ApiToken, App, AppMode, EndUser # 二开部分End - 密钥额度限制,新增ApiToken @@ -19,6 +20,14 @@ from services.message_service import MessageService class MessageListApi(Resource): + def get_retriever_resources(self): + try: + if self.message_metadata: + return json.loads(self.message_metadata).get("retriever_resources", []) + return [] + except (json.JSONDecodeError, TypeError): + return [] + message_fields = { "id": fields.String, "conversation_id": fields.String, @@ -28,7 +37,7 @@ class MessageListApi(Resource): "answer": fields.String(attribute="re_sign_file_url_answer"), "message_files": fields.List(fields.Nested(message_file_fields)), "feedback": fields.Nested(feedback_fields, attribute="user_feedback", allow_null=True), - "retriever_resources": fields.List(fields.Nested(retriever_resource_fields)), + "retriever_resources": get_retriever_resources, "created_at": TimestampField, "agent_thoughts": fields.List(fields.Nested(agent_thought_fields)), "status": fields.String, diff --git a/api/controllers/service_api/app/workflow.py b/api/controllers/service_api/app/workflow.py index 394c048c3..d4499ed40 100644 --- a/api/controllers/service_api/app/workflow.py +++ b/api/controllers/service_api/app/workflow.py @@ -27,6 +27,7 @@ from core.model_runtime.errors.invoke import InvokeError from extensions.ext_database import db from fields.workflow_app_log_fields import workflow_app_log_pagination_fields from libs import helper +from libs.helper import TimestampField from models.model import ApiToken, App, AppMode, EndUser # 二开部分End - 密钥额度限制,ApiToken from models.workflow import WorkflowRun, WorkflowRunStatus from services.app_generate_service import AppGenerateService @@ -44,8 +45,8 @@ workflow_run_fields = { "error": fields.String, "total_steps": fields.Integer, "total_tokens": fields.Integer, - "created_at": fields.DateTime, - "finished_at": fields.DateTime, + 
"created_at": TimestampField, + "finished_at": TimestampField, "elapsed_time": fields.Float, } @@ -53,7 +54,7 @@ workflow_run_fields = { class WorkflowRunDetailApi(Resource): @validate_app_token @marshal_with(workflow_run_fields) - def get(self, app_model: App, workflow_id: str, api_token: ApiToken): # 二开部分End - 密钥额度限制,新增api_token,否则上传文件会报错 + def get(self, app_model: App, workflow_run_id: str, api_token: ApiToken): # 二开部分End - 密钥额度限制,新增api_token,否则上传文件会报错 """ Get a workflow task running detail """ @@ -61,7 +62,7 @@ class WorkflowRunDetailApi(Resource): if app_mode != AppMode.WORKFLOW: raise NotWorkflowAppError() - workflow_run = db.session.query(WorkflowRun).filter(WorkflowRun.id == workflow_id).first() + workflow_run = db.session.query(WorkflowRun).filter(WorkflowRun.id == workflow_run_id).first() return workflow_run @@ -166,6 +167,6 @@ class WorkflowAppLogApi(Resource): api.add_resource(WorkflowRunApi, "/workflows/run") -api.add_resource(WorkflowRunDetailApi, "/workflows/run/") +api.add_resource(WorkflowRunDetailApi, "/workflows/run/") api.add_resource(WorkflowTaskStopApi, "/workflows/tasks//stop") api.add_resource(WorkflowAppLogApi, "/workflows/logs") diff --git a/api/controllers/service_api/dataset/dataset.py b/api/controllers/service_api/dataset/dataset.py index 8ab743dc4..8d470b899 100644 --- a/api/controllers/service_api/dataset/dataset.py +++ b/api/controllers/service_api/dataset/dataset.py @@ -1,6 +1,6 @@ from flask import request from flask_restful import marshal, reqparse # type: ignore -from werkzeug.exceptions import NotFound +from werkzeug.exceptions import Forbidden, NotFound import services.dataset_service from controllers.service_api import api @@ -12,7 +12,7 @@ from core.provider_manager import ProviderManager from fields.dataset_fields import dataset_detail_fields from libs.login import current_user from models.dataset import Dataset, DatasetPermissionEnum -from services.dataset_service import DatasetService +from services.dataset_service import 
DatasetPermissionService, DatasetService def _validate_name(name): @@ -21,6 +21,12 @@ def _validate_name(name): return name +def _validate_description_length(description): + if len(description) > 400: + raise ValueError("Description cannot exceed 400 characters.") + return description + + class DatasetListApi(DatasetApiResource): """Resource for datasets.""" @@ -137,11 +143,151 @@ class DatasetListApi(DatasetApiResource): class DatasetApi(DatasetApiResource): """Resource for dataset.""" + def get(self, _, dataset_id): + dataset_id_str = str(dataset_id) + dataset = DatasetService.get_dataset(dataset_id_str) + if dataset is None: + raise NotFound("Dataset not found.") + try: + DatasetService.check_dataset_permission(dataset, current_user) + except services.errors.account.NoPermissionError as e: + raise Forbidden(str(e)) + data = marshal(dataset, dataset_detail_fields) + if data.get("permission") == "partial_members": + part_users_list = DatasetPermissionService.get_dataset_partial_member_list(dataset_id_str) + data.update({"partial_member_list": part_users_list}) + + # check embedding setting + provider_manager = ProviderManager() + configurations = provider_manager.get_configurations(tenant_id=current_user.current_tenant_id) + + embedding_models = configurations.get_models(model_type=ModelType.TEXT_EMBEDDING, only_active=True) + + model_names = [] + for embedding_model in embedding_models: + model_names.append(f"{embedding_model.model}:{embedding_model.provider.provider}") + + if data["indexing_technique"] == "high_quality": + item_model = f"{data['embedding_model']}:{data['embedding_model_provider']}" + if item_model in model_names: + data["embedding_available"] = True + else: + data["embedding_available"] = False + else: + data["embedding_available"] = True + + if data.get("permission") == "partial_members": + part_users_list = DatasetPermissionService.get_dataset_partial_member_list(dataset_id_str) + data.update({"partial_member_list": part_users_list}) + + 
return data, 200 + + def patch(self, _, dataset_id): + dataset_id_str = str(dataset_id) + dataset = DatasetService.get_dataset(dataset_id_str) + if dataset is None: + raise NotFound("Dataset not found.") + + parser = reqparse.RequestParser() + parser.add_argument( + "name", + nullable=False, + help="type is required. Name must be between 1 to 40 characters.", + type=_validate_name, + ) + parser.add_argument("description", location="json", store_missing=False, type=_validate_description_length) + parser.add_argument( + "indexing_technique", + type=str, + location="json", + choices=Dataset.INDEXING_TECHNIQUE_LIST, + nullable=True, + help="Invalid indexing technique.", + ) + parser.add_argument( + "permission", + type=str, + location="json", + choices=(DatasetPermissionEnum.ONLY_ME, DatasetPermissionEnum.ALL_TEAM, DatasetPermissionEnum.PARTIAL_TEAM), + help="Invalid permission.", + ) + parser.add_argument("embedding_model", type=str, location="json", help="Invalid embedding model.") + parser.add_argument( + "embedding_model_provider", type=str, location="json", help="Invalid embedding model provider." 
+ ) + parser.add_argument("retrieval_model", type=dict, location="json", help="Invalid retrieval model.") + parser.add_argument("partial_member_list", type=list, location="json", help="Invalid parent user list.") + + parser.add_argument( + "external_retrieval_model", + type=dict, + required=False, + nullable=True, + location="json", + help="Invalid external retrieval model.", + ) + + parser.add_argument( + "external_knowledge_id", + type=str, + required=False, + nullable=True, + location="json", + help="Invalid external knowledge id.", + ) + + parser.add_argument( + "external_knowledge_api_id", + type=str, + required=False, + nullable=True, + location="json", + help="Invalid external knowledge api id.", + ) + args = parser.parse_args() + data = request.get_json() + + # check embedding model setting + if data.get("indexing_technique") == "high_quality": + DatasetService.check_embedding_model_setting( + dataset.tenant_id, data.get("embedding_model_provider"), data.get("embedding_model") + ) + + # The role of the current user in the ta table must be admin, owner, editor, or dataset_operator + DatasetPermissionService.check_permission( + current_user, dataset, data.get("permission"), data.get("partial_member_list") + ) + + dataset = DatasetService.update_dataset(dataset_id_str, args, current_user) + + if dataset is None: + raise NotFound("Dataset not found.") + + result_data = marshal(dataset, dataset_detail_fields) + tenant_id = current_user.current_tenant_id + + if data.get("partial_member_list") and data.get("permission") == "partial_members": + DatasetPermissionService.update_partial_member_list( + tenant_id, dataset_id_str, data.get("partial_member_list") + ) + # clear partial member list when permission is only_me or all_team_members + elif ( + data.get("permission") == DatasetPermissionEnum.ONLY_ME + or data.get("permission") == DatasetPermissionEnum.ALL_TEAM + ): + DatasetPermissionService.clear_partial_member_list(dataset_id_str) + + partial_member_list = 
DatasetPermissionService.get_dataset_partial_member_list(dataset_id_str) + result_data.update({"partial_member_list": partial_member_list}) + + return result_data, 200 + def delete(self, _, dataset_id): """ Deletes a dataset given its ID. Args: + _: ignore dataset_id (UUID): The ID of the dataset to be deleted. Returns: @@ -157,6 +303,7 @@ class DatasetApi(DatasetApiResource): try: if DatasetService.delete_dataset(dataset_id_str, current_user): + DatasetPermissionService.clear_partial_member_list(dataset_id_str) return {"result": "success"}, 204 else: raise NotFound("Dataset not found.") diff --git a/api/controllers/service_api/dataset/document.py b/api/controllers/service_api/dataset/document.py index 995444ee4..4cc92847f 100644 --- a/api/controllers/service_api/dataset/document.py +++ b/api/controllers/service_api/dataset/document.py @@ -341,7 +341,7 @@ class DocumentListApi(DatasetApiResource): search = f"%{search}%" query = query.filter(Document.name.like(search)) - query = query.order_by(desc(Document.created_at)) + query = query.order_by(desc(Document.created_at), desc(Document.position)) paginated_documents = query.paginate(page=page, per_page=limit, max_per_page=100, error_out=False) documents = paginated_documents.items diff --git a/api/controllers/service_api/dataset/segment.py b/api/controllers/service_api/dataset/segment.py index ee5178862..81bae2940 100644 --- a/api/controllers/service_api/dataset/segment.py +++ b/api/controllers/service_api/dataset/segment.py @@ -14,10 +14,20 @@ from core.errors.error import LLMBadRequestError, ProviderTokenNotInitError from core.model_manager import ModelManager from core.model_runtime.entities.model_entities import ModelType from extensions.ext_database import db -from fields.segment_fields import segment_fields -from models.dataset import Dataset, DocumentSegment +from fields.segment_fields import child_chunk_fields, segment_fields +from models.dataset import Dataset from services.dataset_service import 
DatasetService, DocumentService, SegmentService from services.entities.knowledge_entities.knowledge_entities import SegmentUpdateArgs +from services.errors.chunk import ( + ChildChunkDeleteIndexError, + ChildChunkIndexingError, +) +from services.errors.chunk import ( + ChildChunkDeleteIndexError as ChildChunkDeleteIndexServiceError, +) +from services.errors.chunk import ( + ChildChunkIndexingError as ChildChunkIndexingServiceError, +) class SegmentApi(DatasetApiResource): @@ -71,7 +81,7 @@ class SegmentApi(DatasetApiResource): return {"error": "Segments is required"}, 400 def get(self, tenant_id, dataset_id, document_id): - """Create single segment.""" + """Get segments.""" # check dataset dataset_id = str(dataset_id) tenant_id = str(tenant_id) @@ -110,26 +120,13 @@ class SegmentApi(DatasetApiResource): status_list = args["status"] keyword = args["keyword"] - query = DocumentSegment.query.filter( - DocumentSegment.document_id == str(document_id), DocumentSegment.tenant_id == current_user.current_tenant_id + segments, total = SegmentService.get_segments( + document_id=document_id, + tenant_id=current_user.current_tenant_id, + status_list=args["status"], + keyword=args["keyword"], ) - if status_list: - query = query.filter(DocumentSegment.status.in_(status_list)) - - if keyword: - query = query.where(DocumentSegment.content.ilike(f"%{keyword}%")) - - total = query.count() - query = query.order_by(DocumentSegment.position) - paginated_segments = query.paginate( - page=page, - per_page=limit, - max_per_page=100, - error_out=False, - ) - segments = paginated_segments.items - response = { "data": marshal(segments, segment_fields), "doc_form": document.doc_form, @@ -158,9 +155,8 @@ class DatasetSegmentApi(DatasetApiResource): if not document: raise NotFound("Document not found.") # check segment - segment = DocumentSegment.query.filter( - DocumentSegment.id == str(segment_id), DocumentSegment.tenant_id == current_user.current_tenant_id - ).first() + segment_id = 
str(segment_id) + segment = SegmentService.get_segment_by_id(segment_id=segment_id, tenant_id=current_user.current_tenant_id) if not segment: raise NotFound("Segment not found.") SegmentService.delete_segment(segment, document, dataset) @@ -199,9 +195,7 @@ class DatasetSegmentApi(DatasetApiResource): raise ProviderNotInitializeError(ex.description) # check segment segment_id = str(segment_id) - segment = DocumentSegment.query.filter( - DocumentSegment.id == str(segment_id), DocumentSegment.tenant_id == current_user.current_tenant_id - ).first() + segment = SegmentService.get_segment_by_id(segment_id=segment_id, tenant_id=current_user.current_tenant_id) if not segment: raise NotFound("Segment not found.") @@ -210,12 +204,200 @@ class DatasetSegmentApi(DatasetApiResource): parser.add_argument("segment", type=dict, required=False, nullable=True, location="json") args = parser.parse_args() - SegmentService.segment_create_args_validate(args["segment"], document) - segment = SegmentService.update_segment(SegmentUpdateArgs(**args["segment"]), segment, document, dataset) - return {"data": marshal(segment, segment_fields), "doc_form": document.doc_form}, 200 + updated_segment = SegmentService.update_segment( + SegmentUpdateArgs(**args["segment"]), segment, document, dataset + ) + return {"data": marshal(updated_segment, segment_fields), "doc_form": document.doc_form}, 200 + + +class ChildChunkApi(DatasetApiResource): + """Resource for child chunks.""" + + @cloud_edition_billing_resource_check("vector_space", "dataset") + @cloud_edition_billing_knowledge_limit_check("add_segment", "dataset") + def post(self, tenant_id, dataset_id, document_id, segment_id): + """Create child chunk.""" + # check dataset + dataset_id = str(dataset_id) + tenant_id = str(tenant_id) + dataset = db.session.query(Dataset).filter(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id).first() + if not dataset: + raise NotFound("Dataset not found.") + + # check document + document_id = 
str(document_id) + document = DocumentService.get_document(dataset.id, document_id) + if not document: + raise NotFound("Document not found.") + + # check segment + segment_id = str(segment_id) + segment = SegmentService.get_segment_by_id(segment_id=segment_id, tenant_id=current_user.current_tenant_id) + if not segment: + raise NotFound("Segment not found.") + + # check embedding model setting + if dataset.indexing_technique == "high_quality": + try: + model_manager = ModelManager() + model_manager.get_model_instance( + tenant_id=current_user.current_tenant_id, + provider=dataset.embedding_model_provider, + model_type=ModelType.TEXT_EMBEDDING, + model=dataset.embedding_model, + ) + except LLMBadRequestError: + raise ProviderNotInitializeError( + "No Embedding Model available. Please configure a valid provider in the Settings -> Model Provider." + ) + except ProviderTokenNotInitError as ex: + raise ProviderNotInitializeError(ex.description) + + # validate args + parser = reqparse.RequestParser() + parser.add_argument("content", type=str, required=True, nullable=False, location="json") + args = parser.parse_args() + + try: + child_chunk = SegmentService.create_child_chunk(args.get("content"), segment, document, dataset) + except ChildChunkIndexingServiceError as e: + raise ChildChunkIndexingError(str(e)) + + return {"data": marshal(child_chunk, child_chunk_fields)}, 200 + + def get(self, tenant_id, dataset_id, document_id, segment_id): + """Get child chunks.""" + # check dataset + dataset_id = str(dataset_id) + tenant_id = str(tenant_id) + dataset = db.session.query(Dataset).filter(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id).first() + if not dataset: + raise NotFound("Dataset not found.") + + # check document + document_id = str(document_id) + document = DocumentService.get_document(dataset.id, document_id) + if not document: + raise NotFound("Document not found.") + + # check segment + segment_id = str(segment_id) + segment = 
SegmentService.get_segment_by_id(segment_id=segment_id, tenant_id=current_user.current_tenant_id) + if not segment: + raise NotFound("Segment not found.") + + parser = reqparse.RequestParser() + parser.add_argument("limit", type=int, default=20, location="args") + parser.add_argument("keyword", type=str, default=None, location="args") + parser.add_argument("page", type=int, default=1, location="args") + args = parser.parse_args() + + page = args["page"] + limit = min(args["limit"], 100) + keyword = args["keyword"] + + child_chunks = SegmentService.get_child_chunks(segment_id, document_id, dataset_id, page, limit, keyword) + + return { + "data": marshal(child_chunks.items, child_chunk_fields), + "total": child_chunks.total, + "total_pages": child_chunks.pages, + "page": page, + "limit": limit, + }, 200 + + +class DatasetChildChunkApi(DatasetApiResource): + """Resource for updating child chunks.""" + + @cloud_edition_billing_knowledge_limit_check("add_segment", "dataset") + def delete(self, tenant_id, dataset_id, document_id, segment_id, child_chunk_id): + """Delete child chunk.""" + # check dataset + dataset_id = str(dataset_id) + tenant_id = str(tenant_id) + dataset = db.session.query(Dataset).filter(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id).first() + if not dataset: + raise NotFound("Dataset not found.") + + # check document + document_id = str(document_id) + document = DocumentService.get_document(dataset.id, document_id) + if not document: + raise NotFound("Document not found.") + + # check segment + segment_id = str(segment_id) + segment = SegmentService.get_segment_by_id(segment_id=segment_id, tenant_id=current_user.current_tenant_id) + if not segment: + raise NotFound("Segment not found.") + + # check child chunk + child_chunk_id = str(child_chunk_id) + child_chunk = SegmentService.get_child_chunk_by_id( + child_chunk_id=child_chunk_id, tenant_id=current_user.current_tenant_id + ) + if not child_chunk: + raise NotFound("Child chunk not 
found.") + + try: + SegmentService.delete_child_chunk(child_chunk, dataset) + except ChildChunkDeleteIndexServiceError as e: + raise ChildChunkDeleteIndexError(str(e)) + + return {"result": "success"}, 200 + + @cloud_edition_billing_resource_check("vector_space", "dataset") + @cloud_edition_billing_knowledge_limit_check("add_segment", "dataset") + def patch(self, tenant_id, dataset_id, document_id, segment_id, child_chunk_id): + """Update child chunk.""" + # check dataset + dataset_id = str(dataset_id) + tenant_id = str(tenant_id) + dataset = db.session.query(Dataset).filter(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id).first() + if not dataset: + raise NotFound("Dataset not found.") + + # get document + document = DocumentService.get_document(dataset_id, document_id) + if not document: + raise NotFound("Document not found.") + + # get segment + segment = SegmentService.get_segment_by_id(segment_id=segment_id, tenant_id=current_user.current_tenant_id) + if not segment: + raise NotFound("Segment not found.") + + # get child chunk + child_chunk = SegmentService.get_child_chunk_by_id( + child_chunk_id=child_chunk_id, tenant_id=current_user.current_tenant_id + ) + if not child_chunk: + raise NotFound("Child chunk not found.") + + # validate args + parser = reqparse.RequestParser() + parser.add_argument("content", type=str, required=True, nullable=False, location="json") + args = parser.parse_args() + + try: + child_chunk = SegmentService.update_child_chunk( + args.get("content"), child_chunk, segment, document, dataset + ) + except ChildChunkIndexingServiceError as e: + raise ChildChunkIndexingError(str(e)) + + return {"data": marshal(child_chunk, child_chunk_fields)}, 200 api.add_resource(SegmentApi, "/datasets//documents//segments") api.add_resource( DatasetSegmentApi, "/datasets//documents//segments/" ) +api.add_resource( + ChildChunkApi, "/datasets//documents//segments//child_chunks" +) +api.add_resource( + DatasetChildChunkApi, + 
"/datasets//documents//segments//child_chunks/", +) diff --git a/api/controllers/service_api/workspace/models.py b/api/controllers/service_api/workspace/models.py new file mode 100644 index 000000000..373f8019f --- /dev/null +++ b/api/controllers/service_api/workspace/models.py @@ -0,0 +1,21 @@ +from flask_login import current_user # type: ignore +from flask_restful import Resource # type: ignore + +from controllers.service_api import api +from controllers.service_api.wraps import validate_dataset_token +from core.model_runtime.utils.encoders import jsonable_encoder +from services.model_provider_service import ModelProviderService + + +class ModelProviderAvailableModelApi(Resource): + @validate_dataset_token + def get(self, _, model_type): + tenant_id = current_user.current_tenant_id + + model_provider_service = ModelProviderService() + models = model_provider_service.get_models_by_model_type(tenant_id=tenant_id, model_type=model_type) + + return jsonable_encoder({"data": models}) + + +api.add_resource(ModelProviderAvailableModelApi, "/workspaces/current/models/model-types/") diff --git a/api/controllers/service_api/wraps.py b/api/controllers/service_api/wraps.py index 040885a1d..a25c1e460 100644 --- a/api/controllers/service_api/wraps.py +++ b/api/controllers/service_api/wraps.py @@ -72,22 +72,32 @@ def validate_app_token(view: Optional[Callable] = None, *, fetch_user_arg: Optio if tenant.status == TenantStatus.ARCHIVE: raise Forbidden("The workspace's status is archived.") - # ---------------------二开部分Begin 额度限制,API调用计费 --------------------- - tenantAccountJoin = ( - db.session.query(TenantAccountJoin) - .filter( - TenantAccountJoin.tenant_id == app_model.tenant_id, - TenantAccountJoin.role == TenantAccountRole.OWNER, - ) - .first() - ) - if not tenantAccountJoin: - raise Forbidden("The workspace has not owner") + tenant_account_join = ( + db.session.query(Tenant, TenantAccountJoin) + .filter(Tenant.id == api_token.tenant_id) + .filter(TenantAccountJoin.tenant_id 
== Tenant.id) + .filter(TenantAccountJoin.role.in_(["owner"])) + .filter(Tenant.status == TenantStatus.NORMAL) + .one_or_none() + ) # TODO: only owner information is required, so only one is returned. + if tenant_account_join: + tenant, ta = tenant_account_join + account = db.session.query(Account).filter(Account.id == ta.account_id).first() + # Login admin + if account: + account.current_tenant = tenant + current_app.login_manager._update_request_context_with_user(account) # type: ignore + user_logged_in.send(current_app._get_current_object(), user=_get_user()) # type: ignore + else: + raise Unauthorized("Tenant owner account does not exist.") + else: + raise Unauthorized("Tenant does not exist.") + # ---------------------二开部分Begin 额度限制,API调用计费 --------------------- # TODO 需要写入缓存,读缓存 account_money = ( db.session.query(AccountMoneyExtend) - .filter(AccountMoneyExtend.account_id == tenantAccountJoin.account_id) + .filter(AccountMoneyExtend.account_id == tenant_account_join.account_id) .first() ) if account_money and account_money.used_quota >= account_money.total_quota: @@ -137,7 +147,7 @@ def validate_app_token(view: Optional[Callable] = None, *, fetch_user_arg: Optio # ---------------------二开部分Begin 额度限制,API调用计费 --------------------- if kwargs.get("end_user"): create_or_update_end_user_account_join_extend( - kwargs["end_user"].id, tenantAccountJoin.account_id, app_model.id + kwargs["end_user"].id, tenant_account_join.account_id, app_model.id ) # ---------------------二开部分End 额度限制,API调用计费 --------------------- diff --git a/api/controllers/web/passport.py b/api/controllers/web/passport.py index 4625c1f43..e30998c80 100644 --- a/api/controllers/web/passport.py +++ b/api/controllers/web/passport.py @@ -19,6 +19,8 @@ class PassportResource(Resource): def get(self): system_features = FeatureService.get_system_features() app_code = request.headers.get("X-App-Code") + user_id = request.args.get("user_id") + if app_code is None: raise Unauthorized("X-App-Code header is 
missing.") @@ -36,16 +38,33 @@ class PassportResource(Resource): if not app_model or app_model.status != "normal" or not app_model.enable_site: raise NotFound() - end_user = EndUser( - tenant_id=app_model.tenant_id, - app_id=app_model.id, - type="browser", - is_anonymous=True, - session_id=generate_session_id(), - ) + if user_id: + end_user = ( + db.session.query(EndUser).filter(EndUser.app_id == app_model.id, EndUser.session_id == user_id).first() + ) - db.session.add(end_user) - db.session.commit() + if end_user: + pass + else: + end_user = EndUser( + tenant_id=app_model.tenant_id, + app_id=app_model.id, + type="browser", + is_anonymous=True, + session_id=user_id, + ) + db.session.add(end_user) + db.session.commit() + else: + end_user = EndUser( + tenant_id=app_model.tenant_id, + app_id=app_model.id, + type="browser", + is_anonymous=True, + session_id=generate_session_id(), + ) + db.session.add(end_user) + db.session.commit() payload = { "iss": site.app_id, diff --git a/api/core/agent/base_agent_runner.py b/api/core/agent/base_agent_runner.py index 13c4e4c3d..48c92ea2d 100644 --- a/api/core/agent/base_agent_runner.py +++ b/api/core/agent/base_agent_runner.py @@ -332,7 +332,7 @@ class BaseAgentRunner(AppRunner): agent_thought = updated_agent_thought if thought: - agent_thought.thought = thought + agent_thought.thought += thought if tool_name: agent_thought.tool = tool_name diff --git a/api/core/agent/output_parser/cot_output_parser.py b/api/core/agent/output_parser/cot_output_parser.py index 61fa774ea..7c8f09e6b 100644 --- a/api/core/agent/output_parser/cot_output_parser.py +++ b/api/core/agent/output_parser/cot_output_parser.py @@ -12,39 +12,45 @@ class CotAgentOutputParser: def handle_react_stream_output( cls, llm_response: Generator[LLMResultChunk, None, None], usage_dict: dict ) -> Generator[Union[str, AgentScratchpadUnit.Action], None, None]: - def parse_action(json_str): - try: - action = json.loads(json_str, strict=False) - action_name = None - action_input 
= None + def parse_action(action) -> Union[str, AgentScratchpadUnit.Action]: + action_name = None + action_input = None + if isinstance(action, str): + try: + action = json.loads(action, strict=False) + except json.JSONDecodeError: + return action or "" - # cohere always returns a list - if isinstance(action, list) and len(action) == 1: - action = action[0] + # cohere always returns a list + if isinstance(action, list) and len(action) == 1: + action = action[0] - for key, value in action.items(): - if "input" in key.lower(): - action_input = value - else: - action_name = value - - if action_name is not None and action_input is not None: - return AgentScratchpadUnit.Action( - action_name=action_name, - action_input=action_input, - ) + for key, value in action.items(): + if "input" in key.lower(): + action_input = value else: - return json_str or "" - except: - return json_str or "" + action_name = value - def extra_json_from_code_block(code_block) -> Generator[Union[str, AgentScratchpadUnit.Action], None, None]: - code_blocks = re.findall(r"```(.*?)```", code_block, re.DOTALL) - if not code_blocks: - return - for block in code_blocks: - json_text = re.sub(r"^[a-zA-Z]+\n", "", block.strip(), flags=re.MULTILINE) - yield parse_action(json_text) + if action_name is not None and action_input is not None: + return AgentScratchpadUnit.Action( + action_name=action_name, + action_input=action_input, + ) + else: + return json.dumps(action) + + def extra_json_from_code_block(code_block) -> list[Union[list, dict]]: + blocks = re.findall(r"```[json]*\s*([\[{].*[]}])\s*```", code_block, re.DOTALL | re.IGNORECASE) + if not blocks: + return [] + try: + json_blocks = [] + for block in blocks: + json_text = re.sub(r"^[a-zA-Z]+\n", "", block.strip(), flags=re.MULTILINE) + json_blocks.append(json.loads(json_text, strict=False)) + return json_blocks + except: + return [] code_block_cache = "" code_block_delimiter_count = 0 @@ -78,7 +84,7 @@ class CotAgentOutputParser: delta = 
response_content[index : index + steps] yield_delta = False - if delta == "`": + if not in_json and delta == "`": last_character = delta code_block_cache += delta code_block_delimiter_count += 1 @@ -159,8 +165,14 @@ class CotAgentOutputParser: if code_block_delimiter_count == 3: if in_code_block: last_character = delta - yield from extra_json_from_code_block(code_block_cache) - code_block_cache = "" + action_json_list = extra_json_from_code_block(code_block_cache) + if action_json_list: + for action_json in action_json_list: + yield parse_action(action_json) + code_block_cache = "" + else: + index += steps + continue in_code_block = not in_code_block code_block_delimiter_count = 0 diff --git a/api/core/agent/plugin_entities.py b/api/core/agent/plugin_entities.py index 92bd5500e..6cf397533 100644 --- a/api/core/agent/plugin_entities.py +++ b/api/core/agent/plugin_entities.py @@ -70,11 +70,20 @@ class AgentStrategyIdentity(ToolIdentity): pass +class AgentFeature(enum.StrEnum): + """ + Agent Feature, used to describe the features of the agent strategy. + """ + + HISTORY_MESSAGES = "history-messages" + + class AgentStrategyEntity(BaseModel): identity: AgentStrategyIdentity parameters: list[AgentStrategyParameter] = Field(default_factory=list) description: I18nObject = Field(..., description="The description of the agent strategy") output_schema: Optional[dict] = None + features: Optional[list[AgentFeature]] = None # pydantic configs model_config = ConfigDict(protected_namespaces=()) diff --git a/api/core/app/app_config/easy_ui_based_app/model_config/converter.py b/api/core/app/app_config/easy_ui_based_app/model_config/converter.py index ecb045a30..5beb09c2a 100644 --- a/api/core/app/app_config/easy_ui_based_app/model_config/converter.py +++ b/api/core/app/app_config/easy_ui_based_app/model_config/converter.py @@ -16,7 +16,6 @@ class ModelConfigConverter: """ Convert app model config dict to entity. 
:param app_config: app config - :param skip_check: skip check :raises ProviderTokenNotInitError: provider token not init error :return: app orchestration config entity """ diff --git a/api/core/app/apps/advanced_chat/app_generator.py b/api/core/app/apps/advanced_chat/app_generator.py index aa35aba26..f59c237f9 100644 --- a/api/core/app/apps/advanced_chat/app_generator.py +++ b/api/core/app/apps/advanced_chat/app_generator.py @@ -91,7 +91,7 @@ class AdvancedChatAppGenerator(MessageBasedAppGenerator): :param user: account or end user :param args: request args :param invoke_from: invoke from source - :param stream: is stream + :param streaming: is stream """ if not args.get("query"): raise ValueError("query is required") @@ -191,10 +191,10 @@ class AdvancedChatAppGenerator(MessageBasedAppGenerator): :param app_model: App :param workflow: Workflow + :param node_id: the node id :param user: account or end user :param args: request args - :param invoke_from: invoke from source - :param stream: is stream + :param streaming: is streamed """ if not node_id: raise ValueError("node_id is required") @@ -248,10 +248,10 @@ class AdvancedChatAppGenerator(MessageBasedAppGenerator): :param app_model: App :param workflow: Workflow + :param node_id: the node id :param user: account or end user :param args: request args - :param invoke_from: invoke from source - :param stream: is stream + :param streaming: is stream """ if not node_id: raise ValueError("node_id is required") diff --git a/api/core/app/apps/agent_chat/app_generator.py b/api/core/app/apps/agent_chat/app_generator.py index 3fd840b62..d54b1ce15 100644 --- a/api/core/app/apps/agent_chat/app_generator.py +++ b/api/core/app/apps/agent_chat/app_generator.py @@ -81,7 +81,7 @@ class AgentChatAppGenerator(MessageBasedAppGenerator): :param user: account or end user :param args: request args :param invoke_from: invoke from source - :param stream: is stream + :param streaming: is stream """ if not streaming: raise ValueError("Agent 
Chat App does not support blocking mode") diff --git a/api/core/app/apps/base_app_runner.py b/api/core/app/apps/base_app_runner.py index 8c6b29731..c813dbb9d 100644 --- a/api/core/app/apps/base_app_runner.py +++ b/api/core/app/apps/base_app_runner.py @@ -157,6 +157,7 @@ class AppRunner: :param files: files :param query: query :param memory: memory + :param image_detail_config: the image quality config :return: """ # get prompt without memory and context diff --git a/api/core/app/apps/chat/app_generator.py b/api/core/app/apps/chat/app_generator.py index 157e24226..f9aca0358 100644 --- a/api/core/app/apps/chat/app_generator.py +++ b/api/core/app/apps/chat/app_generator.py @@ -77,7 +77,7 @@ class ChatAppGenerator(MessageBasedAppGenerator): :param user: account or end user :param args: request args :param invoke_from: invoke from source - :param stream: is stream + :param streaming: is stream """ if not args.get("query"): raise ValueError("query is required") diff --git a/api/core/app/apps/completion/app_generator.py b/api/core/app/apps/completion/app_generator.py index 5404dc0cb..330b87c49 100644 --- a/api/core/app/apps/completion/app_generator.py +++ b/api/core/app/apps/completion/app_generator.py @@ -75,7 +75,7 @@ class CompletionAppGenerator(MessageBasedAppGenerator): :param user: account or end user :param args: request args :param invoke_from: invoke from source - :param stream: is stream + :param streaming: is stream """ query = args["query"] if not isinstance(query, str): diff --git a/api/core/app/apps/message_based_app_generator.py b/api/core/app/apps/message_based_app_generator.py index efaa7b675..64ec6ac0c 100644 --- a/api/core/app/apps/message_based_app_generator.py +++ b/api/core/app/apps/message_based_app_generator.py @@ -148,6 +148,13 @@ class MessageBasedAppGenerator(BaseAppGenerator): # get conversation introduction introduction = self._get_conversation_introduction(application_generate_entity) + # get conversation name + if 
isinstance(application_generate_entity, AdvancedChatAppGenerateEntity): + query = application_generate_entity.query or "New conversation" + else: + query = next(iter(application_generate_entity.inputs.values()), "New conversation") + conversation_name = (query[:20] + "…") if len(query) > 20 else query + if not conversation: conversation = Conversation( app_id=app_config.app_id, @@ -156,7 +163,7 @@ class MessageBasedAppGenerator(BaseAppGenerator): model_id=model_id, override_model_configs=json.dumps(override_model_configs) if override_model_configs else None, mode=app_config.app_mode.value, - name="New conversation", + name=conversation_name, inputs=application_generate_entity.inputs, introduction=introduction, system_instruction="", diff --git a/api/core/app/apps/workflow/app_generator.py b/api/core/app/apps/workflow/app_generator.py index 3b5d34768..e90a8cdec 100644 --- a/api/core/app/apps/workflow/app_generator.py +++ b/api/core/app/apps/workflow/app_generator.py @@ -167,7 +167,7 @@ class WorkflowAppGenerator(BaseAppGenerator): :param user: account or end user :param application_generate_entity: application generate entity :param invoke_from: invoke from source - :param stream: is stream + :param streaming: is stream :param workflow_thread_pool_id: workflow thread pool id """ # init queue manager @@ -217,10 +217,10 @@ class WorkflowAppGenerator(BaseAppGenerator): :param app_model: App :param workflow: Workflow + :param node_id: the node id :param user: account or end user :param args: request args - :param invoke_from: invoke from source - :param stream: is stream + :param streaming: is streamed """ if not node_id: raise ValueError("node_id is required") @@ -273,10 +273,10 @@ class WorkflowAppGenerator(BaseAppGenerator): :param app_model: App :param workflow: Workflow + :param node_id: the node id :param user: account or end user :param args: request args - :param invoke_from: invoke from source - :param stream: is stream + :param streaming: is streamed """ if 
not node_id: raise ValueError("node_id is required") diff --git a/api/core/app/apps/workflow/app_runner.py b/api/core/app/apps/workflow/app_runner.py index 7bbf3612c..b38ee18ac 100644 --- a/api/core/app/apps/workflow/app_runner.py +++ b/api/core/app/apps/workflow/app_runner.py @@ -44,9 +44,6 @@ class WorkflowAppRunner(WorkflowBasedAppRunner): def run(self) -> None: """ Run application - :param application_generate_entity: application generate entity - :param queue_manager: application queue manager - :return: """ app_config = self.application_generate_entity.app_config app_config = cast(WorkflowAppConfig, app_config) diff --git a/api/core/app/task_pipeline/message_cycle_manage.py b/api/core/app/task_pipeline/message_cycle_manage.py index ef3a52442..6223b33b6 100644 --- a/api/core/app/task_pipeline/message_cycle_manage.py +++ b/api/core/app/task_pipeline/message_cycle_manage.py @@ -48,7 +48,7 @@ class MessageCycleManage: def _generate_conversation_name(self, *, conversation_id: str, query: str) -> Optional[Thread]: """ Generate conversation name. 
- :param conversation: conversation + :param conversation_id: conversation id :param query: query :return: thread """ diff --git a/api/core/app/task_pipeline/workflow_cycle_manage.py b/api/core/app/task_pipeline/workflow_cycle_manage.py index d07c7257d..b44b75b7a 100644 --- a/api/core/app/task_pipeline/workflow_cycle_manage.py +++ b/api/core/app/task_pipeline/workflow_cycle_manage.py @@ -44,6 +44,7 @@ from core.app.entities.task_entities import ( WorkflowFinishStreamResponse, WorkflowStartStreamResponse, ) +from core.app.task_pipeline.exc import WorkflowRunNotFoundError from core.file import FILE_MODEL_IDENTITY, File from core.model_runtime.utils.encoders import jsonable_encoder from core.ops.entities.trace_entity import TraceTaskName @@ -69,8 +70,6 @@ from tasks.extend.update_account_money_when_workflow_node_execution_created_exte update_account_money_when_workflow_node_execution_created_extend, # 二开部分End - 密钥额度限制 ) -from .exc import WorkflowRunNotFoundError - class WorkflowCycleManage: def __init__( @@ -157,7 +156,7 @@ class WorkflowCycleManage: ) -> WorkflowRun: """ Workflow run success - :param workflow_run: workflow run + :param workflow_run_id: workflow run id :param start_at: start time :param total_tokens: total tokens :param total_steps: total steps @@ -169,7 +168,7 @@ class WorkflowCycleManage: outputs = WorkflowEntry.handle_special_values(outputs) - workflow_run.status = WorkflowRunStatus.SUCCEEDED.value + workflow_run.status = WorkflowRunStatus.SUCCEEDED workflow_run.outputs = json.dumps(outputs or {}) workflow_run.elapsed_time = time.perf_counter() - start_at workflow_run.total_tokens = total_tokens @@ -204,7 +203,7 @@ class WorkflowCycleManage: workflow_run = self._get_workflow_run(session=session, workflow_run_id=workflow_run_id) outputs = WorkflowEntry.handle_special_values(dict(outputs) if outputs else None) - workflow_run.status = WorkflowRunStatus.PARTIAL_SUCCESSED.value + workflow_run.status = WorkflowRunStatus.PARTIAL_SUCCEEDED.value 
workflow_run.outputs = json.dumps(outputs or {}) workflow_run.elapsed_time = time.perf_counter() - start_at workflow_run.total_tokens = total_tokens @@ -240,7 +239,7 @@ class WorkflowCycleManage: ) -> WorkflowRun: """ Workflow run failed - :param workflow_run: workflow run + :param workflow_run_id: workflow run id :param start_at: start time :param total_tokens: total tokens :param total_steps: total steps diff --git a/api/core/entities/provider_entities.py b/api/core/entities/provider_entities.py index e04e2a42f..2a0751a5e 100644 --- a/api/core/entities/provider_entities.py +++ b/api/core/entities/provider_entities.py @@ -146,6 +146,7 @@ class BasicProviderConfig(BaseModel): BOOLEAN = CommonParameterType.BOOLEAN.value APP_SELECTOR = CommonParameterType.APP_SELECTOR.value MODEL_SELECTOR = CommonParameterType.MODEL_SELECTOR.value + TOOLS_SELECTOR = CommonParameterType.TOOLS_SELECTOR.value @classmethod def value_of(cls, value: str) -> "ProviderConfig.Type": diff --git a/api/core/file/upload_file_parser.py b/api/core/file/upload_file_parser.py index 062a0b6d2..96b288481 100644 --- a/api/core/file/upload_file_parser.py +++ b/api/core/file/upload_file_parser.py @@ -4,12 +4,10 @@ import time from typing import Optional from configs import dify_config +from constants import IMAGE_EXTENSIONS from core.helper.url_signer import UrlSigner from extensions.ext_storage import storage -IMAGE_EXTENSIONS = ["jpg", "jpeg", "png", "webp", "gif", "svg"] -IMAGE_EXTENSIONS.extend([ext.upper() for ext in IMAGE_EXTENSIONS]) - class UploadFileParser: @classmethod @@ -38,7 +36,7 @@ class UploadFileParser: """ get signed url from upload file - :param upload_file: UploadFile object + :param upload_file_id: the id of UploadFile object :return: """ base_url = dify_config.FILES_URL diff --git a/api/core/helper/code_executor/code_executor.py b/api/core/helper/code_executor/code_executor.py index e96d9a075..7c662b64b 100644 --- a/api/core/helper/code_executor/code_executor.py +++ 
b/api/core/helper/code_executor/code_executor.py @@ -61,6 +61,7 @@ class CodeExecutor: Execute code :param purview: bool # Extend global code :param language: code language + :param preload: the preload script :param code: code :return: """ diff --git a/api/core/helper/position_helper.py b/api/core/helper/position_helper.py index 3efdc8aa4..8def6fe4e 100644 --- a/api/core/helper/position_helper.py +++ b/api/core/helper/position_helper.py @@ -53,7 +53,7 @@ def pin_position_map(original_position_map: dict[str, int], pin_list: list[str]) """ Pin the items in the pin list to the beginning of the position map. Overall logic: exclude > include > pin - :param position_map: the position map to be sorted and filtered + :param original_position_map: the position map to be sorted and filtered :param pin_list: the list of pins to be put at the beginning :return: the sorted position map """ diff --git a/api/core/helper/ssrf_proxy.py b/api/core/helper/ssrf_proxy.py index 6367e4563..969cd112e 100644 --- a/api/core/helper/ssrf_proxy.py +++ b/api/core/helper/ssrf_proxy.py @@ -56,8 +56,12 @@ def make_request(method, url, max_retries=SSRF_DEFAULT_MAX_RETRIES, **kwargs): response = client.request(method=method, url=url, **kwargs) elif dify_config.SSRF_PROXY_HTTP_URL and dify_config.SSRF_PROXY_HTTPS_URL: proxy_mounts = { - "http://": httpx.HTTPTransport(proxy=dify_config.SSRF_PROXY_HTTP_URL), - "https://": httpx.HTTPTransport(proxy=dify_config.SSRF_PROXY_HTTPS_URL), + "http://": httpx.HTTPTransport( + proxy=dify_config.SSRF_PROXY_HTTP_URL, verify=HTTP_REQUEST_NODE_SSL_VERIFY + ), + "https://": httpx.HTTPTransport( + proxy=dify_config.SSRF_PROXY_HTTPS_URL, verify=HTTP_REQUEST_NODE_SSL_VERIFY + ), } with httpx.Client(mounts=proxy_mounts, verify=HTTP_REQUEST_NODE_SSL_VERIFY) as client: response = client.request(method=method, url=url, **kwargs) diff --git a/api/core/helper/tool_parameter_cache.py b/api/core/helper/tool_parameter_cache.py index 3b67b3f84..918b3e9ee 100644 --- 
a/api/core/helper/tool_parameter_cache.py +++ b/api/core/helper/tool_parameter_cache.py @@ -38,12 +38,7 @@ class ToolParameterCache: return None def set(self, parameters: dict) -> None: - """ - Cache model provider credentials. - - :param credentials: provider credentials - :return: - """ + """Cache model provider credentials.""" redis_client.setex(self.cache_key, 86400, json.dumps(parameters)) def delete(self) -> None: diff --git a/api/core/indexing_runner.py b/api/core/indexing_runner.py index 8206a8d3e..a75a4c22d 100644 --- a/api/core/indexing_runner.py +++ b/api/core/indexing_runner.py @@ -187,7 +187,7 @@ class IndexingRunner: }, ) if dataset_document.doc_form == IndexType.PARENT_CHILD_INDEX: - child_chunks = document_segment.child_chunks + child_chunks = document_segment.get_child_chunks() if child_chunks: child_documents = [] for child_chunk in child_chunks: diff --git a/api/core/llm_generator/prompts.py b/api/core/llm_generator/prompts.py index f9411e9ec..cf20e60c8 100644 --- a/api/core/llm_generator/prompts.py +++ b/api/core/llm_generator/prompts.py @@ -1,6 +1,6 @@ # Written by YORKI MINAKO🤡, Edited by Xiaoyi CONVERSATION_TITLE_PROMPT = """You need to decompose the user's input into "subject" and "intention" in order to accurately figure out what the user's input language actually is. -Notice: the language type user use could be diverse, which can be English, Chinese, Español, Arabic, Japanese, French, and etc. +Notice: the language type user use could be diverse, which can be English, Chinese, Italian, Español, Arabic, Japanese, French, and etc. MAKE SURE your output is the SAME language as the user's input! Your output is restricted only to: (Input language) Intention + Subject(short as possible) Your output MUST be a valid JSON. 
diff --git a/api/core/model_runtime/model_providers/__base/tts_model.py b/api/core/model_runtime/model_providers/__base/tts_model.py index 4feaa6f04..1f248d11a 100644 --- a/api/core/model_runtime/model_providers/__base/tts_model.py +++ b/api/core/model_runtime/model_providers/__base/tts_model.py @@ -38,7 +38,6 @@ class TTSModel(AIModel): :param credentials: model credentials :param voice: model timbre :param content_text: text content to be translated - :param streaming: output is streaming :param user: unique user id :return: translated audio file """ diff --git a/api/core/model_runtime/model_providers/openai/moderation/moderation.py b/api/core/model_runtime/model_providers/openai/moderation/moderation.py deleted file mode 100644 index 9bf055ce6..000000000 --- a/api/core/model_runtime/model_providers/openai/moderation/moderation.py +++ /dev/null @@ -1,170 +0,0 @@ -from collections.abc import Mapping -from typing import Optional - -import openai -from httpx import Timeout -from openai import OpenAI -from openai.types import ModerationCreateResponse - -from core.model_runtime.entities.model_entities import ModelPropertyKey -from core.model_runtime.errors.invoke import ( - InvokeAuthorizationError, - InvokeBadRequestError, - InvokeConnectionError, - InvokeError, - InvokeRateLimitError, - InvokeServerUnavailableError, -) -from core.model_runtime.errors.validate import CredentialsValidateFailedError -from core.model_runtime.model_providers.__base.moderation_model import ModerationModel - - -class OpenAIModerationModel(ModerationModel): - """ - Model class for OpenAI text moderation model. 
- """ - - def _invoke(self, model: str, credentials: dict, text: str, user: Optional[str] = None) -> bool: - """ - Invoke moderation model - - :param model: model name - :param credentials: model credentials - :param text: text to moderate - :param user: unique user id - :return: false if text is safe, true otherwise - """ - # transform credentials to kwargs for model instance - credentials_kwargs = self._to_credential_kwargs(credentials) - - # init model client - client = OpenAI(**credentials_kwargs) - - # chars per chunk - length = self._get_max_characters_per_chunk(model, credentials) - text_chunks = [text[i : i + length] for i in range(0, len(text), length)] - - max_text_chunks = self._get_max_chunks(model, credentials) - chunks = [text_chunks[i : i + max_text_chunks] for i in range(0, len(text_chunks), max_text_chunks)] - - for text_chunk in chunks: - moderation_result = self._moderation_invoke(model=model, client=client, texts=text_chunk) - - for result in moderation_result.results: - if result.flagged is True: - return True - - return False - - def validate_credentials(self, model: str, credentials: dict) -> None: - """ - Validate model credentials - - :param model: model name - :param credentials: model credentials - :return: - """ - try: - # transform credentials to kwargs for model instance - credentials_kwargs = self._to_credential_kwargs(credentials) - client = OpenAI(**credentials_kwargs) - - # call moderation model - self._moderation_invoke( - model=model, - client=client, - texts=["ping"], - ) - except Exception as ex: - raise CredentialsValidateFailedError(str(ex)) - - def _moderation_invoke(self, model: str, client: OpenAI, texts: list[str]) -> ModerationCreateResponse: - """ - Invoke moderation model - - :param model: model name - :param client: model client - :param texts: texts to moderate - :return: false if text is safe, true otherwise - """ - # call moderation model - moderation_result = client.moderations.create(model=model, input=texts) - - 
return moderation_result - - def _get_max_characters_per_chunk(self, model: str, credentials: dict) -> int: - """ - Get max characters per chunk - - :param model: model name - :param credentials: model credentials - :return: max characters per chunk - """ - model_schema = self.get_model_schema(model, credentials) - - if model_schema and ModelPropertyKey.MAX_CHARACTERS_PER_CHUNK in model_schema.model_properties: - max_characters_per_chunk: int = model_schema.model_properties[ModelPropertyKey.MAX_CHARACTERS_PER_CHUNK] - return max_characters_per_chunk - - return 2000 - - def _get_max_chunks(self, model: str, credentials: dict) -> int: - """ - Get max chunks for given embedding model - - :param model: model name - :param credentials: model credentials - :return: max chunks - """ - model_schema = self.get_model_schema(model, credentials) - - if model_schema and ModelPropertyKey.MAX_CHUNKS in model_schema.model_properties: - max_chunks: int = model_schema.model_properties[ModelPropertyKey.MAX_CHUNKS] - return max_chunks - - return 1 - - def _to_credential_kwargs(self, credentials: Mapping) -> dict: - """ - Transform credentials to kwargs for model instance - - :param credentials: - :return: - """ - credentials_kwargs = { - "api_key": credentials["openai_api_key"], - "timeout": Timeout(315.0, read=300.0, write=10.0, connect=5.0), - "max_retries": 1, - } - - if credentials.get("openai_api_base"): - openai_api_base = credentials["openai_api_base"].rstrip("/") - credentials_kwargs["base_url"] = openai_api_base + "/v1" - - if "openai_organization" in credentials: - credentials_kwargs["organization"] = credentials["openai_organization"] - - return credentials_kwargs - - @property - def _invoke_error_mapping(self) -> dict[type[InvokeError], list[type[Exception]]]: - """ - Map model invoke error to unified error - The key is the error type thrown to the caller - The value is the error type thrown by the model, - which needs to be converted into a unified error type for the 
caller. - - :return: Invoke error mapping - """ - return { - InvokeConnectionError: [openai.APIConnectionError, openai.APITimeoutError], - InvokeServerUnavailableError: [openai.InternalServerError], - InvokeRateLimitError: [openai.RateLimitError], - InvokeAuthorizationError: [openai.AuthenticationError, openai.PermissionDeniedError], - InvokeBadRequestError: [ - openai.BadRequestError, - openai.NotFoundError, - openai.UnprocessableEntityError, - openai.APIError, - ], - } diff --git a/api/core/model_runtime/model_providers/vertex_ai/llm/_position.yaml b/api/core/model_runtime/model_providers/vertex_ai/llm/_position.yaml deleted file mode 100644 index 321a49232..000000000 --- a/api/core/model_runtime/model_providers/vertex_ai/llm/_position.yaml +++ /dev/null @@ -1,22 +0,0 @@ -- claude-3-haiku@20240307 -- claude-3-opus@20240229 -- claude-3-sonnet@20240229 -- claude-3-5-sonnet-v2@20241022 -- claude-3-5-sonnet@20240620 -- gemini-1.0-pro-vision-001 -- gemini-1.0-pro-002 -- gemini-1.5-flash-001 -- gemini-1.5-flash-002 -- gemini-1.5-pro-001 -- gemini-1.5-pro-002 -- gemini-2.0-flash-001 -- gemini-2.0-flash-exp -- gemini-2.0-flash-lite-preview-02-05 -- gemini-2.0-flash-thinking-exp-01-21 -- gemini-2.0-flash-thinking-exp-1219 -- gemini-2.0-pro-exp-02-05 -- gemini-exp-1114 -- gemini-exp-1121 -- gemini-exp-1206 -- gemini-flash-experimental -- gemini-pro-experimental diff --git a/api/core/model_runtime/model_providers/vertex_ai/llm/gemini-2.0-flash-001.yaml b/api/core/model_runtime/model_providers/vertex_ai/llm/gemini-2.0-flash-001.yaml deleted file mode 100644 index bef7ca5ee..000000000 --- a/api/core/model_runtime/model_providers/vertex_ai/llm/gemini-2.0-flash-001.yaml +++ /dev/null @@ -1,41 +0,0 @@ -model: gemini-2.0-flash-001 -label: - en_US: Gemini 2.0 Flash 001 -model_type: llm -features: - - agent-thought - - vision - - tool-call - - stream-tool-call - - document - - video - - audio -model_properties: - mode: chat - context_size: 1048576 -parameter_rules: - - name: 
temperature - use_template: temperature - - name: top_p - use_template: top_p - - name: top_k - label: - zh_Hans: 取样数量 - en_US: Top k - type: int - help: - zh_Hans: 仅从每个后续标记的前 K 个选项中采样。 - en_US: Only sample from the top K options for each subsequent token. - required: false - - name: max_output_tokens - use_template: max_tokens - default: 8192 - min: 1 - max: 8192 - - name: json_schema - use_template: json_schema -pricing: - input: '0.00' - output: '0.00' - unit: '0.000001' - currency: USD diff --git a/api/core/model_runtime/model_providers/vertex_ai/llm/gemini-2.0-flash-lite-preview-02-05.yaml b/api/core/model_runtime/model_providers/vertex_ai/llm/gemini-2.0-flash-lite-preview-02-05.yaml deleted file mode 100644 index 9c0a1e062..000000000 --- a/api/core/model_runtime/model_providers/vertex_ai/llm/gemini-2.0-flash-lite-preview-02-05.yaml +++ /dev/null @@ -1,41 +0,0 @@ -model: gemini-2.0-flash-lite-preview-02-05 -label: - en_US: Gemini 2.0 Flash Lite Preview 0205 -model_type: llm -features: - - agent-thought - - vision - - tool-call - - stream-tool-call - - document - - video - - audio -model_properties: - mode: chat - context_size: 1048576 -parameter_rules: - - name: temperature - use_template: temperature - - name: top_p - use_template: top_p - - name: top_k - label: - zh_Hans: 取样数量 - en_US: Top k - type: int - help: - zh_Hans: 仅从每个后续标记的前 K 个选项中采样。 - en_US: Only sample from the top K options for each subsequent token. 
- required: false - - name: max_output_tokens - use_template: max_tokens - default: 8192 - min: 1 - max: 8192 - - name: json_schema - use_template: json_schema -pricing: - input: '0.00' - output: '0.00' - unit: '0.000001' - currency: USD diff --git a/api/core/model_runtime/model_providers/vertex_ai/llm/gemini-2.0-flash-thinking-exp-01-21.yaml b/api/core/model_runtime/model_providers/vertex_ai/llm/gemini-2.0-flash-thinking-exp-01-21.yaml deleted file mode 100644 index 6e2fc7678..000000000 --- a/api/core/model_runtime/model_providers/vertex_ai/llm/gemini-2.0-flash-thinking-exp-01-21.yaml +++ /dev/null @@ -1,39 +0,0 @@ -model: gemini-2.0-flash-thinking-exp-01-21 -label: - en_US: Gemini 2.0 Flash Thinking Exp 0121 -model_type: llm -features: - - agent-thought - - vision - - document - - video - - audio -model_properties: - mode: chat - context_size: 32767 -parameter_rules: - - name: temperature - use_template: temperature - - name: top_p - use_template: top_p - - name: top_k - label: - zh_Hans: 取样数量 - en_US: Top k - type: int - help: - zh_Hans: 仅从每个后续标记的前 K 个选项中采样。 - en_US: Only sample from the top K options for each subsequent token. 
- required: false - - name: max_output_tokens - use_template: max_tokens - default: 8192 - min: 1 - max: 8192 - - name: json_schema - use_template: json_schema -pricing: - input: '0.00' - output: '0.00' - unit: '0.000001' - currency: USD diff --git a/api/core/model_runtime/model_providers/vertex_ai/llm/gemini-2.0-flash-thinking-exp-1219.yaml b/api/core/model_runtime/model_providers/vertex_ai/llm/gemini-2.0-flash-thinking-exp-1219.yaml deleted file mode 100644 index dfcf8fd05..000000000 --- a/api/core/model_runtime/model_providers/vertex_ai/llm/gemini-2.0-flash-thinking-exp-1219.yaml +++ /dev/null @@ -1,39 +0,0 @@ -model: gemini-2.0-flash-thinking-exp-1219 -label: - en_US: Gemini 2.0 Flash Thinking Exp 1219 -model_type: llm -features: - - agent-thought - - vision - - document - - video - - audio -model_properties: - mode: chat - context_size: 32767 -parameter_rules: - - name: temperature - use_template: temperature - - name: top_p - use_template: top_p - - name: top_k - label: - zh_Hans: 取样数量 - en_US: Top k - type: int - help: - zh_Hans: 仅从每个后续标记的前 K 个选项中采样。 - en_US: Only sample from the top K options for each subsequent token. 
- required: false - - name: max_output_tokens - use_template: max_tokens - default: 8192 - min: 1 - max: 8192 - - name: json_schema - use_template: json_schema -pricing: - input: '0.00' - output: '0.00' - unit: '0.000001' - currency: USD diff --git a/api/core/model_runtime/model_providers/vertex_ai/llm/gemini-2.0-pro-exp-02-05.yaml b/api/core/model_runtime/model_providers/vertex_ai/llm/gemini-2.0-pro-exp-02-05.yaml deleted file mode 100644 index 96926a175..000000000 --- a/api/core/model_runtime/model_providers/vertex_ai/llm/gemini-2.0-pro-exp-02-05.yaml +++ /dev/null @@ -1,37 +0,0 @@ -model: gemini-2.0-pro-exp-02-05 -label: - en_US: Gemini 2.0 Pro Exp 0205 -model_type: llm -features: - - agent-thought - - document -model_properties: - mode: chat - context_size: 2000000 -parameter_rules: - - name: temperature - use_template: temperature - - name: top_p - use_template: top_p - - name: top_k - label: - en_US: Top k - type: int - help: - en_US: Only sample from the top K options for each subsequent token. 
- required: false - - name: presence_penalty - use_template: presence_penalty - - name: frequency_penalty - use_template: frequency_penalty - - name: max_output_tokens - use_template: max_tokens - required: true - default: 8192 - min: 1 - max: 8192 -pricing: - input: '0.00' - output: '0.00' - unit: '0.000001' - currency: USD diff --git a/api/core/model_runtime/model_providers/vertex_ai/llm/gemini-exp-1114.yaml b/api/core/model_runtime/model_providers/vertex_ai/llm/gemini-exp-1114.yaml deleted file mode 100644 index bd49b4769..000000000 --- a/api/core/model_runtime/model_providers/vertex_ai/llm/gemini-exp-1114.yaml +++ /dev/null @@ -1,41 +0,0 @@ -model: gemini-exp-1114 -label: - en_US: Gemini exp 1114 -model_type: llm -features: - - agent-thought - - vision - - tool-call - - stream-tool-call - - document - - video - - audio -model_properties: - mode: chat - context_size: 32767 -parameter_rules: - - name: temperature - use_template: temperature - - name: top_p - use_template: top_p - - name: top_k - label: - zh_Hans: 取样数量 - en_US: Top k - type: int - help: - zh_Hans: 仅从每个后续标记的前 K 个选项中采样。 - en_US: Only sample from the top K options for each subsequent token. 
- required: false - - name: max_output_tokens - use_template: max_tokens - default: 8192 - min: 1 - max: 8192 - - name: json_schema - use_template: json_schema -pricing: - input: '0.00' - output: '0.00' - unit: '0.000001' - currency: USD diff --git a/api/core/model_runtime/model_providers/vertex_ai/llm/gemini-exp-1121.yaml b/api/core/model_runtime/model_providers/vertex_ai/llm/gemini-exp-1121.yaml deleted file mode 100644 index 8e3f218df..000000000 --- a/api/core/model_runtime/model_providers/vertex_ai/llm/gemini-exp-1121.yaml +++ /dev/null @@ -1,41 +0,0 @@ -model: gemini-exp-1121 -label: - en_US: Gemini exp 1121 -model_type: llm -features: - - agent-thought - - vision - - tool-call - - stream-tool-call - - document - - video - - audio -model_properties: - mode: chat - context_size: 32767 -parameter_rules: - - name: temperature - use_template: temperature - - name: top_p - use_template: top_p - - name: top_k - label: - zh_Hans: 取样数量 - en_US: Top k - type: int - help: - zh_Hans: 仅从每个后续标记的前 K 个选项中采样。 - en_US: Only sample from the top K options for each subsequent token. 
- required: false - - name: max_output_tokens - use_template: max_tokens - default: 8192 - min: 1 - max: 8192 - - name: json_schema - use_template: json_schema -pricing: - input: '0.00' - output: '0.00' - unit: '0.000001' - currency: USD diff --git a/api/core/model_runtime/model_providers/vertex_ai/llm/gemini-exp-1206.yaml b/api/core/model_runtime/model_providers/vertex_ai/llm/gemini-exp-1206.yaml deleted file mode 100644 index 7a7c361c4..000000000 --- a/api/core/model_runtime/model_providers/vertex_ai/llm/gemini-exp-1206.yaml +++ /dev/null @@ -1,41 +0,0 @@ -model: gemini-exp-1206 -label: - en_US: Gemini exp 1206 -model_type: llm -features: - - agent-thought - - vision - - tool-call - - stream-tool-call - - document - - video - - audio -model_properties: - mode: chat - context_size: 2097152 -parameter_rules: - - name: temperature - use_template: temperature - - name: top_p - use_template: top_p - - name: top_k - label: - zh_Hans: 取样数量 - en_US: Top k - type: int - help: - zh_Hans: 仅从每个后续标记的前 K 个选项中采样。 - en_US: Only sample from the top K options for each subsequent token. 
- required: false - - name: max_output_tokens - use_template: max_tokens - default: 8192 - min: 1 - max: 8192 - - name: json_schema - use_template: json_schema -pricing: - input: '0.00' - output: '0.00' - unit: '0.000001' - currency: USD diff --git a/api/core/model_runtime/model_providers/zhipuai/llm/glm-4-air-0111.yaml b/api/core/model_runtime/model_providers/zhipuai/llm/glm-4-air-0111.yaml deleted file mode 100644 index 8d301fc69..000000000 --- a/api/core/model_runtime/model_providers/zhipuai/llm/glm-4-air-0111.yaml +++ /dev/null @@ -1,66 +0,0 @@ -model: glm-4-air-0111 -label: - en_US: glm-4-air-0111 -model_type: llm -features: - - multi-tool-call - - agent-thought - - stream-tool-call -model_properties: - mode: chat - context_size: 131072 -parameter_rules: - - name: temperature - use_template: temperature - default: 0.95 - min: 0.0 - max: 1.0 - help: - zh_Hans: 采样温度,控制输出的随机性,必须为正数取值范围是:(0.0,1.0],不能等于 0,默认值为 0.95 值越大,会使输出更随机,更具创造性;值越小,输出会更加稳定或确定建议您根据应用场景调整 top_p 或 temperature 参数,但不要同时调整两个参数。 - en_US: Sampling temperature, controls the randomness of the output, must be a positive number. The value range is (0.0,1.0], which cannot be equal to 0. The default value is 0.95. The larger the value, the more random and creative the output will be; the smaller the value, The output will be more stable or certain. It is recommended that you adjust the top_p or temperature parameters according to the application scenario, but do not adjust both parameters at the same time. - - name: top_p - use_template: top_p - default: 0.7 - help: - zh_Hans: 用温度取样的另一种方法,称为核取样取值范围是:(0.0, 1.0) 开区间,不能等于 0 或 1,默认值为 0.7 模型考虑具有 top_p 概率质量tokens的结果例如:0.1 意味着模型解码器只考虑从前 10% 的概率的候选集中取 tokens 建议您根据应用场景调整 top_p 或 temperature 参数,但不要同时调整两个参数。 - en_US: Another method of temperature sampling is called kernel sampling. The value range is (0.0, 1.0) open interval, which cannot be equal to 0 or 1. The default value is 0.7. The model considers the results with top_p probability mass tokens. 
For example 0.1 means The model decoder only considers tokens from the candidate set with the top 10% probability. It is recommended that you adjust the top_p or temperature parameters according to the application scenario, but do not adjust both parameters at the same time. - - name: do_sample - label: - zh_Hans: 采样策略 - en_US: Sampling strategy - type: boolean - help: - zh_Hans: do_sample 为 true 时启用采样策略,do_sample 为 false 时采样策略 temperature、top_p 将不生效。默认值为 true。 - en_US: When `do_sample` is set to true, the sampling strategy is enabled. When `do_sample` is set to false, the sampling strategies such as `temperature` and `top_p` will not take effect. The default value is true. - default: true - - name: max_tokens - use_template: max_tokens - default: 1024 - min: 1 - max: 4095 - - name: web_search - type: boolean - label: - zh_Hans: 联网搜索 - en_US: Web Search - default: false - help: - zh_Hans: 模型内置了互联网搜索服务,该参数控制模型在生成文本时是否参考使用互联网搜索结果。启用互联网搜索,模型会将搜索结果作为文本生成过程中的参考信息,但模型会基于其内部逻辑“自行判断”是否使用互联网搜索结果。 - en_US: The model has a built-in Internet search service. This parameter controls whether the model refers to Internet search results when generating text. When Internet search is enabled, the model will use the search results as reference information in the text generation process, but the model will "judge" whether to use Internet search results based on its internal logic. 
- - name: response_format - label: - zh_Hans: 回复格式 - en_US: Response Format - type: string - help: - zh_Hans: 指定模型必须输出的格式 - en_US: specifying the format that the model must output - required: false - options: - - text - - json_object -pricing: - input: '0.0005' - output: '0.0005' - unit: '0.001' - currency: RMB diff --git a/api/core/ops/langfuse_trace/langfuse_trace.py b/api/core/ops/langfuse_trace/langfuse_trace.py index b9ba068b1..f67e270ab 100644 --- a/api/core/ops/langfuse_trace/langfuse_trace.py +++ b/api/core/ops/langfuse_trace/langfuse_trace.py @@ -213,9 +213,24 @@ class LangFuseDataTrace(BaseTraceInstance): if process_data and process_data.get("model_mode") == "chat": total_token = metadata.get("total_tokens", 0) + prompt_tokens = 0 + completion_tokens = 0 + try: + if outputs.get("usage"): + prompt_tokens = outputs.get("usage", {}).get("prompt_tokens", 0) + completion_tokens = outputs.get("usage", {}).get("completion_tokens", 0) + else: + prompt_tokens = process_data.get("usage", {}).get("prompt_tokens", 0) + completion_tokens = process_data.get("usage", {}).get("completion_tokens", 0) + except Exception: + logger.error("Failed to extract usage", exc_info=True) + # add generation generation_usage = GenerationUsage( + input=prompt_tokens, + output=completion_tokens, total=total_token, + unit=UnitEnum.TOKENS, ) node_generation_data = LangfuseGeneration( diff --git a/api/core/ops/langsmith_trace/langsmith_trace.py b/api/core/ops/langsmith_trace/langsmith_trace.py index 4ffd888bd..e3494e2f2 100644 --- a/api/core/ops/langsmith_trace/langsmith_trace.py +++ b/api/core/ops/langsmith_trace/langsmith_trace.py @@ -199,6 +199,7 @@ class LangSmithDataTrace(BaseTraceInstance): ) process_data = json.loads(node_execution.process_data) if node_execution.process_data else {} + if process_data and process_data.get("model_mode") == "chat": run_type = LangSmithRunType.llm metadata.update( @@ -212,9 +213,23 @@ class LangSmithDataTrace(BaseTraceInstance): else: run_type = 
LangSmithRunType.tool + prompt_tokens = 0 + completion_tokens = 0 + try: + if outputs.get("usage"): + prompt_tokens = outputs.get("usage", {}).get("prompt_tokens", 0) + completion_tokens = outputs.get("usage", {}).get("completion_tokens", 0) + else: + prompt_tokens = process_data.get("usage", {}).get("prompt_tokens", 0) + completion_tokens = process_data.get("usage", {}).get("completion_tokens", 0) + except Exception: + logger.error("Failed to extract usage", exc_info=True) + node_dotted_order = generate_dotted_order(node_execution_id, created_at, workflow_dotted_order) langsmith_run = LangSmithRunModel( total_tokens=node_total_tokens, + input_tokens=prompt_tokens, + output_tokens=completion_tokens, name=node_type, inputs=inputs, run_type=run_type, diff --git a/api/core/ops/ops_trace_manager.py b/api/core/ops/ops_trace_manager.py index 916509cd9..f388225cc 100644 --- a/api/core/ops/ops_trace_manager.py +++ b/api/core/ops/ops_trace_manager.py @@ -8,6 +8,7 @@ from datetime import timedelta from typing import Any, Optional, Union from uuid import UUID, uuid4 +from cachetools import LRUCache from flask import current_app from sqlalchemy import select from sqlalchemy.orm import Session @@ -70,6 +71,8 @@ provider_config_map: dict[str, dict[str, Any]] = { class OpsTraceManager: + ops_trace_instances_cache: LRUCache = LRUCache(maxsize=128) + @classmethod def encrypt_tracing_config( cls, tenant_id: str, tracing_provider: str, tracing_config: dict, current_trace_config=None @@ -204,28 +207,32 @@ class OpsTraceManager: return None app_ops_trace_config = json.loads(app.tracing) if app.tracing else None - if app_ops_trace_config is None: return None + if not app_ops_trace_config.get("enabled"): + return None tracing_provider = app_ops_trace_config.get("tracing_provider") - if tracing_provider is None or tracing_provider not in provider_config_map: return None # decrypt_token decrypt_trace_config = cls.get_decrypted_tracing_config(app_id, tracing_provider) - if 
app_ops_trace_config.get("enabled"): - trace_instance, config_class = ( - provider_config_map[tracing_provider]["trace_instance"], - provider_config_map[tracing_provider]["config_class"], - ) - if not decrypt_trace_config: - return None - tracing_instance = trace_instance(config_class(**decrypt_trace_config)) - return tracing_instance + if not decrypt_trace_config: + return None - return None + trace_instance, config_class = ( + provider_config_map[tracing_provider]["trace_instance"], + provider_config_map[tracing_provider]["config_class"], + ) + decrypt_trace_config_key = str(decrypt_trace_config) + tracing_instance = cls.ops_trace_instances_cache.get(decrypt_trace_config_key) + if tracing_instance is None: + # create new tracing_instance and update the cache if it absent + tracing_instance = trace_instance(config_class(**decrypt_trace_config)) + cls.ops_trace_instances_cache[decrypt_trace_config_key] = tracing_instance + logging.info(f"new tracing_instance for app_id: {app_id}") + return tracing_instance @classmethod def get_app_config_through_message_id(cls, message_id: str): diff --git a/api/core/rag/cleaner/clean_processor.py b/api/core/rag/cleaner/clean_processor.py index 754b0d18b..9cb009035 100644 --- a/api/core/rag/cleaner/clean_processor.py +++ b/api/core/rag/cleaner/clean_processor.py @@ -27,9 +27,26 @@ class CleanProcessor: pattern = r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)" text = re.sub(pattern, "", text) - # Remove URL - pattern = r"https?://[^\s]+" - text = re.sub(pattern, "", text) + # Remove URL but keep Markdown image URLs + # First, temporarily replace Markdown image URLs with a placeholder + markdown_image_pattern = r"!\[.*?\]\((https?://[^\s)]+)\)" + placeholders: list[str] = [] + + def replace_with_placeholder(match, placeholders=placeholders): + url = match.group(1) + placeholder = f"__MARKDOWN_IMAGE_URL_{len(placeholders)}__" + placeholders.append(url) + return f"![image]({placeholder})" + + text = re.sub(markdown_image_pattern, 
replace_with_placeholder, text) + + # Now remove all remaining URLs + url_pattern = r"https?://[^\s)]+" + text = re.sub(url_pattern, "", text) + + # Finally, restore the Markdown image URLs + for i, url in enumerate(placeholders): + text = text.replace(f"__MARKDOWN_IMAGE_URL_{i}__", url) return text def filter_string(self, text): diff --git a/api/core/rag/datasource/retrieval_service.py b/api/core/rag/datasource/retrieval_service.py index fea4d0edf..c4a1e9f05 100644 --- a/api/core/rag/datasource/retrieval_service.py +++ b/api/core/rag/datasource/retrieval_service.py @@ -1,4 +1,6 @@ import concurrent.futures +import logging +import time from concurrent.futures import ThreadPoolExecutor from typing import Optional @@ -46,7 +48,7 @@ class RetrievalService: if not query: return [] dataset = cls._get_dataset(dataset_id) - if not dataset or dataset.available_document_count == 0 or dataset.available_segment_count == 0: + if not dataset: return [] all_documents: list[Document] = [] @@ -178,6 +180,7 @@ class RetrievalService: if not dataset: raise ValueError("dataset not found") + start = time.time() vector = Vector(dataset=dataset) documents = vector.search_by_vector( query, @@ -187,6 +190,7 @@ class RetrievalService: filter={"group_id": [dataset.id]}, document_ids_filter=document_ids_filter, ) + logging.debug(f"embedding_search ends at {time.time() - start:.2f} seconds") if documents: if ( @@ -270,7 +274,8 @@ class RetrievalService: return [] try: - # Collect document IDs + start_time = time.time() + # Collect document IDs with existence check document_ids = {doc.metadata.get("document_id") for doc in documents if "document_id" in doc.metadata} if not document_ids: return [] @@ -288,43 +293,102 @@ class RetrievalService: include_segment_ids = set() segment_child_map = {} - # Process documents + # Precompute doc_forms to avoid redundant checks + doc_forms = {} + for doc in documents: + document_id = doc.metadata.get("document_id") + dataset_doc = 
dataset_documents.get(document_id) + if dataset_doc: + doc_forms[document_id] = dataset_doc.doc_form + + # Batch collect index node IDs with type safety + child_index_node_ids = [] + index_node_ids = [] + for doc in documents: + document_id = doc.metadata.get("document_id") + if doc_forms.get(document_id) == IndexType.PARENT_CHILD_INDEX: + child_index_node_ids.append(doc.metadata.get("doc_id")) + else: + index_node_ids.append(doc.metadata.get("doc_id")) + + # Batch query ChildChunk + child_chunks = db.session.query(ChildChunk).filter(ChildChunk.index_node_id.in_(child_index_node_ids)).all() + child_chunk_map = {chunk.index_node_id: chunk for chunk in child_chunks} + + # Batch query DocumentSegment with unified conditions + segment_map = { + segment.id: segment + for segment in db.session.query(DocumentSegment) + .filter( + ( + DocumentSegment.index_node_id.in_(index_node_ids) + | DocumentSegment.id.in_([chunk.segment_id for chunk in child_chunks]) + ), + DocumentSegment.enabled == True, + DocumentSegment.status == "completed", + ) + .options( + load_only( + DocumentSegment.id, + DocumentSegment.content, + DocumentSegment.answer, + ) + ) + .all() + } + for document in documents: document_id = document.metadata.get("document_id") - if document_id not in dataset_documents: - continue - - dataset_document = dataset_documents[document_id] + dataset_document = dataset_documents.get(document_id) if not dataset_document: continue - if dataset_document.doc_form == IndexType.PARENT_CHILD_INDEX: - # Handle parent-child documents + doc_form = doc_forms.get(document_id) + if doc_form == IndexType.PARENT_CHILD_INDEX: + # Handle parent-child documents using preloaded data child_index_node_id = document.metadata.get("doc_id") + if not child_index_node_id: + continue - child_chunk = ( - db.session.query(ChildChunk).filter(ChildChunk.index_node_id == child_index_node_id).first() - ) - + child_chunk = child_chunk_map.get(child_index_node_id) if not child_chunk: continue - segment = ( 
- db.session.query(DocumentSegment) - .filter( - DocumentSegment.dataset_id == dataset_document.dataset_id, - DocumentSegment.enabled == True, - DocumentSegment.status == "completed", - DocumentSegment.id == child_chunk.segment_id, - ) - .options( - load_only( - DocumentSegment.id, - DocumentSegment.content, - DocumentSegment.answer, - ) - ) - .first() + segment = segment_map.get(child_chunk.segment_id) + if not segment: + continue + + if segment.id not in include_segment_ids: + include_segment_ids.add(segment.id) + map_detail = {"max_score": document.metadata.get("score", 0.0), "child_chunks": []} + segment_child_map[segment.id] = map_detail + records.append({"segment": segment}) + + # Append child chunk details + child_chunk_detail = { + "id": child_chunk.id, + "content": child_chunk.content, + "position": child_chunk.position, + "score": document.metadata.get("score", 0.0), + } + segment_child_map[segment.id]["child_chunks"].append(child_chunk_detail) + segment_child_map[segment.id]["max_score"] = max( + segment_child_map[segment.id]["max_score"], document.metadata.get("score", 0.0) + ) + + else: + # Handle normal documents + index_node_id = document.metadata.get("doc_id") + if not index_node_id: + continue + + segment = next( + ( + s + for s in segment_map.values() + if s.index_node_id == index_node_id and s.dataset_id == dataset_document.dataset_id + ), + None, ) if not segment: @@ -332,66 +396,23 @@ class RetrievalService: if segment.id not in include_segment_ids: include_segment_ids.add(segment.id) - child_chunk_detail = { - "id": child_chunk.id, - "content": child_chunk.content, - "position": child_chunk.position, - "score": document.metadata.get("score", 0.0), - } - map_detail = { - "max_score": document.metadata.get("score", 0.0), - "child_chunks": [child_chunk_detail], - } - segment_child_map[segment.id] = map_detail - record = { - "segment": segment, - } - records.append(record) - else: - child_chunk_detail = { - "id": child_chunk.id, - "content": 
child_chunk.content, - "position": child_chunk.position, - "score": document.metadata.get("score", 0.0), - } - segment_child_map[segment.id]["child_chunks"].append(child_chunk_detail) - segment_child_map[segment.id]["max_score"] = max( - segment_child_map[segment.id]["max_score"], document.metadata.get("score", 0.0) + records.append( + { + "segment": segment, + "score": document.metadata.get("score", 0.0), + } ) - else: - # Handle normal documents - index_node_id = document.metadata.get("doc_id") - if not index_node_id: - continue - segment = ( - db.session.query(DocumentSegment) - .filter( - DocumentSegment.dataset_id == dataset_document.dataset_id, - DocumentSegment.enabled == True, - DocumentSegment.status == "completed", - DocumentSegment.index_node_id == index_node_id, - ) - .first() - ) - - if not segment: - continue - - include_segment_ids.add(segment.id) - record = { - "segment": segment, - "score": document.metadata.get("score"), # type: ignore - } - records.append(record) - - # Add child chunks information to records + # Merge child chunks information for record in records: - if record["segment"].id in segment_child_map: - record["child_chunks"] = segment_child_map[record["segment"].id].get("child_chunks") # type: ignore - record["score"] = segment_child_map[record["segment"].id]["max_score"] + segment_id = record["segment"].id + if segment_id in segment_child_map: + record["child_chunks"] = segment_child_map[segment_id]["child_chunks"] + record["score"] = segment_child_map[segment_id]["max_score"] + logging.debug(f"Formatting retrieval documents took {time.time() - start_time:.2f} seconds") return [RetrievalSegments(**record) for record in records] except Exception as e: + # Only rollback if there were write operations db.session.rollback() raise e diff --git a/api/core/rag/datasource/vdb/lindorm/lindorm_vector.py b/api/core/rag/datasource/vdb/lindorm/lindorm_vector.py index d3f528303..643ac2df4 100644 --- 
a/api/core/rag/datasource/vdb/lindorm/lindorm_vector.py +++ b/api/core/rag/datasource/vdb/lindorm/lindorm_vector.py @@ -1,10 +1,13 @@ import copy import json import logging +import time from typing import Any, Optional -from opensearchpy import OpenSearch +from opensearchpy import OpenSearch, helpers +from opensearchpy.helpers import BulkIndexError from pydantic import BaseModel, model_validator +from tenacity import retry, stop_after_attempt, wait_exponential from configs import dify_config from core.rag.datasource.vdb.field import Field @@ -77,33 +80,74 @@ class LindormVectorStore(BaseVector): def refresh(self): self._client.indices.refresh(index=self._collection_name) - def add_texts(self, documents: list[Document], embeddings: list[list[float]], **kwargs): - actions = [] + def add_texts( + self, + documents: list[Document], + embeddings: list[list[float]], + batch_size: int = 64, + timeout: int = 60, + **kwargs, + ): + logger.info(f"Total documents to add: {len(documents)}") uuids = self._get_uuids(documents) - for i in range(len(documents)): - action_header = { - "index": { - "_index": self.collection_name.lower(), - "_id": uuids[i], + + total_docs = len(documents) + num_batches = (total_docs + batch_size - 1) // batch_size + + @retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=4, max=10), + ) + def _bulk_with_retry(actions): + try: + response = self._client.bulk(actions, timeout=timeout) + if response["errors"]: + error_items = [item for item in response["items"] if "error" in item["index"]] + error_msg = f"Bulk indexing had {len(error_items)} errors" + logger.exception(error_msg) + raise Exception(error_msg) + return response + except Exception: + logger.exception("Bulk indexing error") + raise + + for batch_num in range(num_batches): + start_idx = batch_num * batch_size + end_idx = min((batch_num + 1) * batch_size, total_docs) + + actions = [] + for i in range(start_idx, end_idx): + action_header = { + "index": { + "_index": 
self.collection_name.lower(), + "_id": uuids[i], + } } - } - action_values: dict[str, Any] = { - Field.CONTENT_KEY.value: documents[i].page_content, - Field.VECTOR.value: embeddings[i], # Make sure you pass an array here - Field.METADATA_KEY.value: documents[i].metadata, - } - if self._using_ugc: - action_header["index"]["routing"] = self._routing - if self._routing_field is not None: - action_values[self._routing_field] = self._routing - actions.append(action_header) - actions.append(action_values) - response = self._client.bulk(actions) - if response["errors"]: - for item in response["items"]: - print(f"{item['index']['status']}: {item['index']['error']['type']}") - else: - self.refresh() + action_values: dict[str, Any] = { + Field.CONTENT_KEY.value: documents[i].page_content, + Field.VECTOR.value: embeddings[i], + Field.METADATA_KEY.value: documents[i].metadata, + } + if self._using_ugc: + action_header["index"]["routing"] = self._routing + if self._routing_field is not None: + action_values[self._routing_field] = self._routing + + actions.append(action_header) + actions.append(action_values) + + # logger.info(f"Processing batch {batch_num + 1}/{num_batches} (documents {start_idx + 1} to {end_idx})") + + try: + _bulk_with_retry(actions) + # logger.info(f"Successfully processed batch {batch_num + 1}") + # simple latency to avoid too many requests in a short time + if batch_num < num_batches - 1: + time.sleep(0.5) + + except Exception: + logger.exception(f"Failed to process batch {batch_num + 1}") + raise def get_ids_by_metadata_field(self, key: str, value: str): query: dict[str, Any] = { @@ -123,19 +167,51 @@ class LindormVectorStore(BaseVector): self.delete_by_ids(ids) def delete_by_ids(self, ids: list[str]) -> None: - params = {} - if self._using_ugc: - params["routing"] = self._routing + """Delete documents by their IDs in batch. 
+ + Args: + ids: List of document IDs to delete + """ + if not ids: + return + + params = {"routing": self._routing} if self._using_ugc else {} + + # 1. First check if collection exists + if not self._client.indices.exists(index=self._collection_name): + logger.warning(f"Collection {self._collection_name} does not exist") + return + + # 2. Batch process deletions + actions = [] for id in ids: if self._client.exists(index=self._collection_name, id=id, params=params): - params = {} - if self._using_ugc: - params["routing"] = self._routing - self._client.delete(index=self._collection_name, id=id, params=params) - self.refresh() + actions.append( + { + "_op_type": "delete", + "_index": self._collection_name, + "_id": id, + **params, # Include routing if using UGC + } + ) else: logger.warning(f"DELETE BY ID: ID {id} does not exist in the index.") + # 3. Perform bulk deletion if there are valid documents to delete + if actions: + try: + helpers.bulk(self._client, actions) + except BulkIndexError as e: + for error in e.errors: + delete_error = error.get("delete", {}) + status = delete_error.get("status") + doc_id = delete_error.get("_id") + + if status == 404: + logger.warning(f"Document not found for deletion: {doc_id}") + else: + logger.exception(f"Error deleting document: {error}") + def delete(self) -> None: if self._using_ugc: routing_filter_query = { @@ -167,11 +243,11 @@ class LindormVectorStore(BaseVector): if not all(isinstance(x, float) for x in query_vector): raise ValueError("All elements in query_vector should be floats") - top_k = kwargs.get("top_k", 10) + top_k = kwargs.get("top_k", 3) document_ids_filter = kwargs.get("document_ids_filter") filters = [] if document_ids_filter: - filters.append({"terms": {"metadata.document_id": document_ids_filter}}) + filters.append({"terms": {"metadata.document_id.keyword": document_ids_filter}}) query = default_vector_search_query(query_vector=query_vector, k=top_k, filters=filters, **kwargs) try: @@ -210,11 +286,11 @@ 
class LindormVectorStore(BaseVector): must_not = kwargs.get("must_not") should = kwargs.get("should") minimum_should_match = kwargs.get("minimum_should_match", 0) - top_k = kwargs.get("top_k", 10) + top_k = kwargs.get("top_k", 3) filters = kwargs.get("filter", []) document_ids_filter = kwargs.get("document_ids_filter") if document_ids_filter: - filters.append({"terms": {"metadata.document_id": document_ids_filter}}) + filters.append({"terms": {"metadata.document_id.keyword": document_ids_filter}}) routing = self._routing full_text_query = default_text_search_query( query_text=query, @@ -228,6 +304,7 @@ class LindormVectorStore(BaseVector): routing=routing, routing_field=self._routing_field, ) + response = self._client.search(index=self._collection_name, body=full_text_query) docs = [] for hit in response["hits"]["hits"]: @@ -295,7 +372,7 @@ class LindormVectorStore(BaseVector): def default_text_mapping(dimension: int, method_name: str, **kwargs: Any) -> dict: - excludes_from_source = kwargs.get("excludes_from_source") + excludes_from_source = kwargs.get("excludes_from_source", False) analyzer = kwargs.get("analyzer", "ik_max_word") text_field = kwargs.get("text_field", Field.CONTENT_KEY.value) engine = kwargs["engine"] @@ -356,12 +433,12 @@ def default_text_mapping(dimension: int, method_name: str, **kwargs: Any) -> dic if excludes_from_source: # e.g. 
{"excludes": ["vector_field"]} - mapping["mappings"]["_source"] = {"excludes": excludes_from_source} + mapping["mappings"]["_source"] = {"excludes": [vector_field]} if using_ugc and method_name == "ivfpq": mapping["settings"]["index"]["knn_routing"] = True mapping["settings"]["index"]["knn.offline.construction"] = True - elif using_ugc and method_name == "hnsw" or using_ugc and method_name == "flat": + elif (using_ugc and method_name == "hnsw") or (using_ugc and method_name == "flat"): mapping["settings"]["index"]["knn_routing"] = True return mapping @@ -437,7 +514,7 @@ def default_vector_search_query( **kwargs, ) -> dict: if filters is not None: - filter_type = "post_filter" if filter_type is None else filter_type + filter_type = "pre_filter" if filter_type is None else filter_type if not isinstance(filters, list): raise RuntimeError(f"unexpected filter with {type(filters)}") final_ext: dict[str, Any] = {"lvector": {}} @@ -458,7 +535,7 @@ def default_vector_search_query( "query": {"knn": {vector_field: {"vector": query_vector, "k": k}}}, } - if filters is not None: + if filters is not None and len(filters) > 0: # when using filter, transform filter from List[Dict] to Dict as valid format filter_dict = {"bool": {"must": filters}} if len(filters) > 1 else filters[0] search_query["query"]["knn"][vector_field]["filter"] = filter_dict # filter should be Dict diff --git a/api/core/rag/datasource/vdb/milvus/milvus_vector.py b/api/core/rag/datasource/vdb/milvus/milvus_vector.py index a1180a650..7a3319f4a 100644 --- a/api/core/rag/datasource/vdb/milvus/milvus_vector.py +++ b/api/core/rag/datasource/vdb/milvus/milvus_vector.py @@ -231,8 +231,8 @@ class MilvusVector(BaseVector): document_ids_filter = kwargs.get("document_ids_filter") filter = "" if document_ids_filter: - document_ids = ", ".join(f"'{id}'" for id in document_ids_filter) - filter = f'metadata["document_id"] in ({document_ids})' + document_ids = ", ".join(f'"{id}"' for id in document_ids_filter) + filter = 
f'metadata["document_id"] in [{document_ids}]' results = self._client.search( collection_name=self._collection_name, data=[query_vector], @@ -259,7 +259,7 @@ class MilvusVector(BaseVector): filter = "" if document_ids_filter: document_ids = ", ".join(f"'{id}'" for id in document_ids_filter) - filter = f'metadata["document_id"] in ({document_ids})' + filter = f'metadata["document_id"] in [{document_ids}]' results = self._client.search( collection_name=self._collection_name, diff --git a/api/core/rag/datasource/vdb/oceanbase/oceanbase_vector.py b/api/core/rag/datasource/vdb/oceanbase/oceanbase_vector.py index 8ff97f2f2..ae6b0c51a 100644 --- a/api/core/rag/datasource/vdb/oceanbase/oceanbase_vector.py +++ b/api/core/rag/datasource/vdb/oceanbase/oceanbase_vector.py @@ -31,6 +31,7 @@ class OceanBaseVectorConfig(BaseModel): user: str password: str database: str + enable_hybrid_search: bool = False @model_validator(mode="before") @classmethod @@ -57,6 +58,7 @@ class OceanBaseVector(BaseVector): password=self._config.password, db_name=self._config.database, ) + self._hybrid_search_enabled = self._check_hybrid_search_support() # Check if hybrid search is supported def get_type(self) -> str: return VectorType.OCEANBASE @@ -98,6 +100,16 @@ class OceanBaseVector(BaseVector): columns=cols, vidxs=vidx_params, ) + try: + if self._hybrid_search_enabled: + self._client.perform_raw_text_sql(f"""ALTER TABLE {self._collection_name} + ADD FULLTEXT INDEX fulltext_index_for_col_text (text) WITH PARSER ik""") + except Exception as e: + raise Exception( + "Failed to add fulltext index to the target table, your OceanBase version must be 4.3.5.1 or above " + + "to support fulltext index and vector index in the same table", + e, + ) vals = [] params = self._client.perform_raw_text_sql("SHOW PARAMETERS LIKE '%ob_vector_memory_limit_percentage%'") for row in params: @@ -116,6 +128,27 @@ class OceanBaseVector(BaseVector): ) redis_client.set(collection_exist_cache_key, 1, ex=3600) + def 
_check_hybrid_search_support(self) -> bool: + """ + Check if the current OceanBase version supports hybrid search. + Returns True if the version is >= 4.3.5.1, otherwise False. + """ + if not self._config.enable_hybrid_search: + return False + + try: + from packaging import version + + # return OceanBase_CE 4.3.5.1 (r101000042025031818-bxxxx) (Built Mar 18 2025 18:13:36) + result = self._client.perform_raw_text_sql("SELECT @@version_comment AS version") + ob_full_version = result.fetchone()[0] + ob_version = ob_full_version.split()[1] + logger.debug("Current OceanBase version is %s", ob_version) + return version.parse(ob_version).base_version >= version.parse("4.3.5.1").base_version + except Exception as e: + logger.warning(f"Failed to check OceanBase version: {str(e)}. Disabling hybrid search.") + return False + def add_texts(self, documents: list[Document], embeddings: list[list[float]], **kwargs): ids = self._get_uuids(documents) for id, doc, emb in zip(ids, documents, embeddings): @@ -130,7 +163,7 @@ class OceanBaseVector(BaseVector): ) def text_exists(self, id: str) -> bool: - cur = self._client.get(table_name=self._collection_name, id=id) + cur = self._client.get(table_name=self._collection_name, ids=id) return bool(cur.rowcount != 0) def delete_by_ids(self, ids: list[str]) -> None: @@ -139,9 +172,12 @@ class OceanBaseVector(BaseVector): self._client.delete(table_name=self._collection_name, ids=ids) def get_ids_by_metadata_field(self, key: str, value: str) -> list[str]: + from sqlalchemy import text + cur = self._client.get( table_name=self._collection_name, - where_clause=f"metadata->>'$.{key}' = '{value}'", + ids=None, + where_clause=[text(f"metadata->>'$.{key}' = '{value}'")], output_column_name=["id"], ) return [row[0] for row in cur] @@ -151,36 +187,84 @@ class OceanBaseVector(BaseVector): self.delete_by_ids(ids) def search_by_full_text(self, query: str, **kwargs: Any) -> list[Document]: - return [] + if not self._hybrid_search_enabled: + return [] + + 
try: + top_k = kwargs.get("top_k", 5) + if not isinstance(top_k, int) or top_k <= 0: + raise ValueError("top_k must be a positive integer") + + document_ids_filter = kwargs.get("document_ids_filter") + where_clause = "" + if document_ids_filter: + document_ids = ", ".join(f"'{id}'" for id in document_ids_filter) + where_clause = f" AND metadata->>'$.document_id' IN ({document_ids})" + + full_sql = f"""SELECT metadata, text, MATCH (text) AGAINST (:query) AS score + FROM {self._collection_name} + WHERE MATCH (text) AGAINST (:query) > 0 + {where_clause} + ORDER BY score DESC + LIMIT {top_k}""" + + with self._client.engine.connect() as conn: + with conn.begin(): + from sqlalchemy import text + + result = conn.execute(text(full_sql), {"query": query}) + rows = result.fetchall() + + docs = [] + for row in rows: + metadata_str, _text, score = row + try: + metadata = json.loads(metadata_str) + except json.JSONDecodeError: + print(f"Invalid JSON metadata: {metadata_str}") + metadata = {} + metadata["score"] = score + docs.append(Document(page_content=_text, metadata=metadata)) + + return docs + except Exception as e: + logger.warning(f"Failed to fulltext search: {str(e)}.") + return [] def search_by_vector(self, query_vector: list[float], **kwargs: Any) -> list[Document]: document_ids_filter = kwargs.get("document_ids_filter") - where_clause = None + _where_clause = None if document_ids_filter: document_ids = ", ".join(f"'{id}'" for id in document_ids_filter) where_clause = f"metadata->>'$.document_id' in ({document_ids})" + from sqlalchemy import text + + _where_clause = [text(where_clause)] ef_search = kwargs.get("ef_search", self._hnsw_ef_search) if ef_search != self._hnsw_ef_search: self._client.set_ob_hnsw_ef_search(ef_search) self._hnsw_ef_search = ef_search topk = kwargs.get("top_k", 10) - cur = self._client.ann_search( - table_name=self._collection_name, - vec_column_name="vector", - vec_data=query_vector, - topk=topk, - distance_func=func.l2_distance, - 
output_column_names=["text", "metadata"], - with_dist=True, - where_clause=where_clause, - ) + try: + cur = self._client.ann_search( + table_name=self._collection_name, + vec_column_name="vector", + vec_data=query_vector, + topk=topk, + distance_func=func.l2_distance, + output_column_names=["text", "metadata"], + with_dist=True, + where_clause=_where_clause, + ) + except Exception as e: + raise Exception("Failed to search by vector. ", e) docs = [] - for text, metadata, distance in cur: + for _text, metadata, distance in cur: metadata = json.loads(metadata) metadata["score"] = 1 - distance / math.sqrt(2) docs.append( Document( - page_content=text, + page_content=_text, metadata=metadata, ) ) @@ -212,5 +296,6 @@ class OceanBaseVectorFactory(AbstractVectorFactory): user=dify_config.OCEANBASE_VECTOR_USER or "", password=(dify_config.OCEANBASE_VECTOR_PASSWORD or ""), database=dify_config.OCEANBASE_VECTOR_DATABASE or "", + enable_hybrid_search=dify_config.OCEANBASE_ENABLE_HYBRID_SEARCH or False, ), ) diff --git a/api/core/rag/datasource/vdb/opengauss/opengauss.py b/api/core/rag/datasource/vdb/opengauss/opengauss.py index 4d57a651d..dae908f67 100644 --- a/api/core/rag/datasource/vdb/opengauss/opengauss.py +++ b/api/core/rag/datasource/vdb/opengauss/opengauss.py @@ -177,7 +177,6 @@ class OpenGauss(BaseVector): Search the nearest neighbors to a vector. :param query_vector: The input vector to search for similar items. - :param top_k: The number of nearest neighbors to return, default is 5. :return: List of Documents that are nearest to the query vector. """ top_k = kwargs.get("top_k", 4) diff --git a/api/core/rag/datasource/vdb/oracle/oraclevector.py b/api/core/rag/datasource/vdb/oracle/oraclevector.py index 5888e04c7..143dfb325 100644 --- a/api/core/rag/datasource/vdb/oracle/oraclevector.py +++ b/api/core/rag/datasource/vdb/oracle/oraclevector.py @@ -197,7 +197,6 @@ class OracleVector(BaseVector): Search the nearest neighbors to a vector. 
:param query_vector: The input vector to search for similar items. - :param top_k: The number of nearest neighbors to return, default is 5. :return: List of Documents that are nearest to the query vector. """ top_k = kwargs.get("top_k", 4) diff --git a/api/core/rag/datasource/vdb/pgvector/pgvector.py b/api/core/rag/datasource/vdb/pgvector/pgvector.py index 783ad93b0..eab51ab01 100644 --- a/api/core/rag/datasource/vdb/pgvector/pgvector.py +++ b/api/core/rag/datasource/vdb/pgvector/pgvector.py @@ -167,7 +167,6 @@ class PGVector(BaseVector): Search the nearest neighbors to a vector. :param query_vector: The input vector to search for similar items. - :param top_k: The number of nearest neighbors to return, default is 5. :return: List of Documents that are nearest to the query vector. """ top_k = kwargs.get("top_k", 4) @@ -177,7 +176,7 @@ class PGVector(BaseVector): where_clause = "" if document_ids_filter: document_ids = ", ".join(f"'{id}'" for id in document_ids_filter) - where_clause = f" WHERE metadata->>'document_id' in ({document_ids}) " + where_clause = f" WHERE meta->>'document_id' in ({document_ids}) " with self._get_cursor() as cur: cur.execute( @@ -205,7 +204,7 @@ class PGVector(BaseVector): where_clause = "" if document_ids_filter: document_ids = ", ".join(f"'{id}'" for id in document_ids_filter) - where_clause = f" AND metadata->>'document_id' in ({document_ids}) " + where_clause = f" AND meta->>'document_id' in ({document_ids}) " if self.pg_bigm: cur.execute("SET pg_bigm.similarity_limit TO 0.000001") cur.execute( diff --git a/api/core/rag/datasource/vdb/tablestore/__init__.py b/api/core/rag/datasource/vdb/tablestore/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/api/core/rag/datasource/vdb/tablestore/tablestore_vector.py b/api/core/rag/datasource/vdb/tablestore/tablestore_vector.py new file mode 100644 index 000000000..a124faa50 --- /dev/null +++ b/api/core/rag/datasource/vdb/tablestore/tablestore_vector.py @@ -0,0 +1,295 @@ 
+import json +import logging +from typing import Any, Optional + +import tablestore # type: ignore +from pydantic import BaseModel, model_validator + +from configs import dify_config +from core.rag.datasource.vdb.field import Field +from core.rag.datasource.vdb.vector_base import BaseVector +from core.rag.datasource.vdb.vector_factory import AbstractVectorFactory +from core.rag.datasource.vdb.vector_type import VectorType +from core.rag.embedding.embedding_base import Embeddings +from core.rag.models.document import Document +from extensions.ext_redis import redis_client +from models import Dataset + + +class TableStoreConfig(BaseModel): + access_key_id: Optional[str] = None + access_key_secret: Optional[str] = None + instance_name: Optional[str] = None + endpoint: Optional[str] = None + + @model_validator(mode="before") + @classmethod + def validate_config(cls, values: dict) -> dict: + if not values["access_key_id"]: + raise ValueError("config ACCESS_KEY_ID is required") + if not values["access_key_secret"]: + raise ValueError("config ACCESS_KEY_SECRET is required") + if not values["instance_name"]: + raise ValueError("config INSTANCE_NAME is required") + if not values["endpoint"]: + raise ValueError("config ENDPOINT is required") + return values + + +class TableStoreVector(BaseVector): + def __init__(self, collection_name: str, config: TableStoreConfig): + super().__init__(collection_name) + self._config = config + self._tablestore_client = tablestore.OTSClient( + config.endpoint, + config.access_key_id, + config.access_key_secret, + config.instance_name, + ) + self._table_name = f"{collection_name}" + self._index_name = f"{collection_name}_idx" + self._tags_field = f"{Field.METADATA_KEY.value}_tags" + + def get_type(self) -> str: + return VectorType.TABLESTORE + + def create(self, texts: list[Document], embeddings: list[list[float]], **kwargs): + dimension = len(embeddings[0]) + self._create_collection(dimension) + self.add_texts(documents=texts, 
embeddings=embeddings, **kwargs) + + def add_texts(self, documents: list[Document], embeddings: list[list[float]], **kwargs): + uuids = self._get_uuids(documents) + + for i in range(len(documents)): + self._write_row( + primary_key=uuids[i], + attributes={ + Field.CONTENT_KEY.value: documents[i].page_content, + Field.VECTOR.value: embeddings[i], + Field.METADATA_KEY.value: documents[i].metadata, + }, + ) + return uuids + + def text_exists(self, id: str) -> bool: + _, return_row, _ = self._tablestore_client.get_row( + table_name=self._table_name, primary_key=[("id", id)], columns_to_get=["id"] + ) + + return return_row is not None + + def delete_by_ids(self, ids: list[str]) -> None: + if not ids: + return + for id in ids: + self._delete_row(id=id) + + def get_ids_by_metadata_field(self, key: str, value: str): + return self._search_by_metadata(key, value) + + def delete_by_metadata_field(self, key: str, value: str) -> None: + ids = self.get_ids_by_metadata_field(key, value) + self.delete_by_ids(ids) + + def search_by_vector(self, query_vector: list[float], **kwargs: Any) -> list[Document]: + top_k = kwargs.get("top_k", 4) + return self._search_by_vector(query_vector, top_k) + + def search_by_full_text(self, query: str, **kwargs: Any) -> list[Document]: + return self._search_by_full_text(query) + + def delete(self) -> None: + self._delete_table_if_exist() + + def _create_collection(self, dimension: int): + lock_name = f"vector_indexing_lock_{self._collection_name}" + with redis_client.lock(lock_name, timeout=20): + collection_exist_cache_key = f"vector_indexing_{self._collection_name}" + if redis_client.get(collection_exist_cache_key): + logging.info(f"Collection {self._collection_name} already exists.") + return + + self._create_table_if_not_exist() + self._create_search_index_if_not_exist(dimension) + redis_client.set(collection_exist_cache_key, 1, ex=3600) + + def _create_table_if_not_exist(self) -> None: + table_list = self._tablestore_client.list_table() + if 
self._table_name in table_list: + logging.info("Tablestore system table[%s] already exists", self._table_name) + return None + + schema_of_primary_key = [("id", "STRING")] + table_meta = tablestore.TableMeta(self._table_name, schema_of_primary_key) + table_options = tablestore.TableOptions() + reserved_throughput = tablestore.ReservedThroughput(tablestore.CapacityUnit(0, 0)) + self._tablestore_client.create_table(table_meta, table_options, reserved_throughput) + logging.info("Tablestore create table[%s] successfully.", self._table_name) + + def _create_search_index_if_not_exist(self, dimension: int) -> None: + search_index_list = self._tablestore_client.list_search_index(table_name=self._table_name) + if self._index_name in [t[1] for t in search_index_list]: + logging.info("Tablestore system index[%s] already exists", self._index_name) + return None + + field_schemas = [ + tablestore.FieldSchema( + Field.CONTENT_KEY.value, + tablestore.FieldType.TEXT, + analyzer=tablestore.AnalyzerType.MAXWORD, + index=True, + enable_sort_and_agg=False, + store=False, + ), + tablestore.FieldSchema( + Field.VECTOR.value, + tablestore.FieldType.VECTOR, + vector_options=tablestore.VectorOptions( + data_type=tablestore.VectorDataType.VD_FLOAT_32, + dimension=dimension, + metric_type=tablestore.VectorMetricType.VM_COSINE, + ), + ), + tablestore.FieldSchema( + Field.METADATA_KEY.value, + tablestore.FieldType.KEYWORD, + index=True, + store=False, + ), + tablestore.FieldSchema( + self._tags_field, + tablestore.FieldType.KEYWORD, + index=True, + store=False, + is_array=True, + ), + ] + + index_meta = tablestore.SearchIndexMeta(field_schemas) + self._tablestore_client.create_search_index(self._table_name, self._index_name, index_meta) + logging.info("Tablestore create system index[%s] successfully.", self._index_name) + + def _delete_table_if_exist(self): + search_index_list = self._tablestore_client.list_search_index(table_name=self._table_name) + for resp_tuple in search_index_list: + 
self._tablestore_client.delete_search_index(resp_tuple[0], resp_tuple[1]) + logging.info("Tablestore delete index[%s] successfully.", self._index_name) + + self._tablestore_client.delete_table(self._table_name) + logging.info("Tablestore delete system table[%s] successfully.", self._index_name) + + def _delete_search_index(self) -> None: + self._tablestore_client.delete_search_index(self._table_name, self._index_name) + logging.info("Tablestore delete index[%s] successfully.", self._index_name) + + def _write_row(self, primary_key: str, attributes: dict[str, Any]) -> None: + pk = [("id", primary_key)] + + tags = [] + for key, value in attributes[Field.METADATA_KEY.value].items(): + tags.append(str(key) + "=" + str(value)) + + attribute_columns = [ + (Field.CONTENT_KEY.value, attributes[Field.CONTENT_KEY.value]), + (Field.VECTOR.value, json.dumps(attributes[Field.VECTOR.value])), + ( + Field.METADATA_KEY.value, + json.dumps(attributes[Field.METADATA_KEY.value]), + ), + (self._tags_field, json.dumps(tags)), + ] + row = tablestore.Row(pk, attribute_columns) + self._tablestore_client.put_row(self._table_name, row) + + def _delete_row(self, id: str) -> None: + primary_key = [("id", id)] + row = tablestore.Row(primary_key) + self._tablestore_client.delete_row(self._table_name, row, None) + logging.info("Tablestore delete row successfully. 
id:%s", id) + + def _search_by_metadata(self, key: str, value: str) -> list[str]: + query = tablestore.SearchQuery( + tablestore.TermQuery(self._tags_field, str(key) + "=" + str(value)), + limit=100, + get_total_count=False, + ) + + search_response = self._tablestore_client.search( + table_name=self._table_name, + index_name=self._index_name, + search_query=query, + columns_to_get=tablestore.ColumnsToGet(return_type=tablestore.ColumnReturnType.ALL_FROM_INDEX), + ) + + return [row[0][0][1] for row in search_response.rows] + + def _search_by_vector(self, query_vector: list[float], top_k: int) -> list[Document]: + ots_query = tablestore.KnnVectorQuery( + field_name=Field.VECTOR.value, + top_k=top_k, + float32_query_vector=query_vector, + ) + sort = tablestore.Sort(sorters=[tablestore.ScoreSort(sort_order=tablestore.SortOrder.DESC)]) + search_query = tablestore.SearchQuery(ots_query, limit=top_k, get_total_count=False, sort=sort) + + search_response = self._tablestore_client.search( + table_name=self._table_name, + index_name=self._index_name, + search_query=search_query, + columns_to_get=tablestore.ColumnsToGet(return_type=tablestore.ColumnReturnType.ALL_FROM_INDEX), + ) + logging.info( + "Tablestore search successfully. 
request_id:%s", + search_response.request_id, + ) + return self._to_query_result(search_response) + + def _to_query_result(self, search_response: tablestore.SearchResponse) -> list[Document]: + documents = [] + for row in search_response.rows: + documents.append( + Document( + page_content=row[1][2][1], + vector=json.loads(row[1][3][1]), + metadata=json.loads(row[1][0][1]), + ) + ) + + return documents + + def _search_by_full_text(self, query: str) -> list[Document]: + search_query = tablestore.SearchQuery( + query=tablestore.MatchQuery(text=query, field_name=Field.CONTENT_KEY.value), + sort=tablestore.Sort(sorters=[tablestore.ScoreSort(sort_order=tablestore.SortOrder.DESC)]), + limit=100, + ) + search_response = self._tablestore_client.search( + table_name=self._table_name, + index_name=self._index_name, + search_query=search_query, + columns_to_get=tablestore.ColumnsToGet(return_type=tablestore.ColumnReturnType.ALL_FROM_INDEX), + ) + + return self._to_query_result(search_response) + + +class TableStoreVectorFactory(AbstractVectorFactory): + def init_vector(self, dataset: Dataset, attributes: list, embeddings: Embeddings) -> TableStoreVector: + if dataset.index_struct_dict: + class_prefix: str = dataset.index_struct_dict["vector_store"]["class_prefix"] + collection_name = class_prefix + else: + dataset_id = dataset.id + collection_name = Dataset.gen_collection_name_by_id(dataset_id) + dataset.index_struct = json.dumps(self.gen_index_struct_dict(VectorType.TABLESTORE, collection_name)) + + return TableStoreVector( + collection_name=collection_name, + config=TableStoreConfig( + endpoint=dify_config.TABLESTORE_ENDPOINT, + instance_name=dify_config.TABLESTORE_INSTANCE_NAME, + access_key_id=dify_config.TABLESTORE_ACCESS_KEY_ID, + access_key_secret=dify_config.TABLESTORE_ACCESS_KEY_SECRET, + ), + ) diff --git a/api/core/rag/datasource/vdb/tencent/tencent_vector.py b/api/core/rag/datasource/vdb/tencent/tencent_vector.py index 304d9538a..540d71bb8 100644 --- 
a/api/core/rag/datasource/vdb/tencent/tencent_vector.py +++ b/api/core/rag/datasource/vdb/tencent/tencent_vector.py @@ -1,11 +1,14 @@ import json +import logging +import math from typing import Any, Optional from pydantic import BaseModel -from tcvectordb import VectorDBClient # type: ignore +from tcvdb_text.encoder import BM25Encoder # type: ignore +from tcvectordb import RPCVectorDBClient, VectorDBException # type: ignore from tcvectordb.model import document, enum # type: ignore from tcvectordb.model import index as vdb_index # type: ignore -from tcvectordb.model.document import Filter # type: ignore +from tcvectordb.model.document import AnnSearch, Filter, KeywordSearch, WeightedRerank # type: ignore from configs import dify_config from core.rag.datasource.vdb.vector_base import BaseVector @@ -16,6 +19,8 @@ from core.rag.models.document import Document from extensions.ext_redis import redis_client from models.dataset import Dataset +logger = logging.getLogger(__name__) + class TencentConfig(BaseModel): url: str @@ -24,9 +29,11 @@ class TencentConfig(BaseModel): username: Optional[str] database: Optional[str] index_type: str = "HNSW" - metric_type: str = "L2" + metric_type: str = "IP" shard: int = 1 replicas: int = 2 + max_upsert_batch_size: int = 128 + enable_hybrid_search: bool = False # Flag to enable hybrid search def to_tencent_params(self): return {"url": self.url, "username": self.username, "key": self.api_key, "timeout": self.timeout} @@ -41,19 +48,33 @@ class TencentVector(BaseVector): def __init__(self, collection_name: str, config: TencentConfig): super().__init__(collection_name) self._client_config = config - self._client = VectorDBClient(**self._client_config.to_tencent_params()) - self._db = self._init_database() + self._client = RPCVectorDBClient(**self._client_config.to_tencent_params()) + self._enable_hybrid_search = False + self._dimension = 1024 + self._load_collection() + self._bm25 = BM25Encoder.default("zh") + + def _load_collection(self): 
+ """ + Check if the collection supports hybrid search. + """ + if self._client_config.enable_hybrid_search: + self._enable_hybrid_search = True + if self._has_collection(): + coll = self._client.describe_collection( + database_name=self._client_config.database, collection_name=self.collection_name + ) + has_hybrid_search = False + for idx in coll.indexes: + if idx.name == "sparse_vector": + has_hybrid_search = True + elif idx.name == "vector": + self._dimension = idx.dimension + if not has_hybrid_search: + self._enable_hybrid_search = False def _init_database(self): - exists = False - for db in self._client.list_databases(): - if db.database_name == self._client_config.database: - exists = True - break - if exists: - return self._client.database(self._client_config.database) - else: - return self._client.create_database(database_name=self._client_config.database) + return self._client.create_database_if_not_exists(database_name=self._client_config.database) def get_type(self) -> str: return VectorType.TENCENT @@ -62,10 +83,14 @@ class TencentVector(BaseVector): return {"type": self.get_type(), "vector_store": {"class_prefix": self._collection_name}} def _has_collection(self) -> bool: - collections = self._db.list_collections() - return any(collection.collection_name == self._collection_name for collection in collections) + return bool( + self._client.exists_collection( + database_name=self._client_config.database, collection_name=self.collection_name + ) + ) def _create_collection(self, dimension: int) -> None: + self._dimension = dimension lock_name = "vector_indexing_lock_{}".format(self._collection_name) with redis_client.lock(lock_name, timeout=20): collection_exist_cache_key = "vector_indexing_{}".format(self._collection_name) @@ -75,7 +100,6 @@ class TencentVector(BaseVector): if self._has_collection(): return - self.delete() index_type = None for k, v in enum.IndexType.__members__.items(): if k == self._client_config.index_type: @@ -89,26 +113,52 @@ class 
TencentVector(BaseVector): if metric_type is None: raise ValueError("unsupported metric_type") params = vdb_index.HNSWParams(m=16, efconstruction=200) - index = vdb_index.Index( - vdb_index.FilterIndex(self.field_id, enum.FieldType.String, enum.IndexType.PRIMARY_KEY), - vdb_index.VectorIndex( - self.field_vector, - dimension, - index_type, - metric_type, - params, - ), - vdb_index.FilterIndex(self.field_text, enum.FieldType.String, enum.IndexType.FILTER), - vdb_index.FilterIndex(self.field_metadata, enum.FieldType.String, enum.IndexType.FILTER), + index_id = vdb_index.FilterIndex(self.field_id, enum.FieldType.String, enum.IndexType.PRIMARY_KEY) + index_vector = vdb_index.VectorIndex( + self.field_vector, + dimension, + index_type, + metric_type, + params, ) - - self._db.create_collection( - name=self._collection_name, - shard=self._client_config.shard, - replicas=self._client_config.replicas, - description="Collection for Dify", - index=index, + index_text = vdb_index.FilterIndex(self.field_text, enum.FieldType.String, enum.IndexType.FILTER) + index_metadate = vdb_index.FilterIndex(self.field_metadata, enum.FieldType.Json, enum.IndexType.FILTER) + index_sparse_vector = vdb_index.SparseIndex( + name="sparse_vector", + field_type=enum.FieldType.SparseVector, + index_type=enum.IndexType.SPARSE_INVERTED, + metric_type=enum.MetricType.IP, ) + indexes = [index_id, index_vector, index_text, index_metadate] + if self._enable_hybrid_search: + indexes.append(index_sparse_vector) + try: + self._client.create_collection( + database_name=self._client_config.database, + collection_name=self._collection_name, + shard=self._client_config.shard, + replicas=self._client_config.replicas, + description="Collection for Dify", + indexes=indexes, + ) + except VectorDBException as e: + if "fieldType:json" not in e.message: + raise e + # vdb version not support json, use string + index_metadate = vdb_index.FilterIndex( + self.field_metadata, enum.FieldType.String, enum.IndexType.FILTER + ) 
+ indexes = [index_id, index_vector, index_text, index_metadate] + if self._enable_hybrid_search: + indexes.append(index_sparse_vector) + self._client.create_collection( + database_name=self._client_config.database, + collection_name=self._collection_name, + shard=self._client_config.shard, + replicas=self._client_config.replicas, + description="Collection for Dify", + indexes=indexes, + ) redis_client.set(collection_exist_cache_key, 1, ex=3600) def create(self, texts: list[Document], embeddings: list[list[float]], **kwargs): @@ -119,22 +169,36 @@ class TencentVector(BaseVector): texts = [doc.page_content for doc in documents] metadatas = [doc.metadata for doc in documents] total_count = len(embeddings) - docs = [] - for i in range(0, total_count): - if metadatas is None: - continue - metadata = metadatas[i] or {} - doc = document.Document( - id=metadata.get("doc_id"), - vector=embeddings[i], - text=texts[i], - metadata=json.dumps(metadata), + batch_size = self._client_config.max_upsert_batch_size + batch = math.ceil(total_count / batch_size) + for j in range(batch): + docs = [] + start_idx = j * batch_size + end_idx = min(total_count, (j + 1) * batch_size) + for i in range(start_idx, end_idx): + if metadatas is None: + continue + metadata = metadatas[i] or {} + doc = document.Document( + id=metadata.get("doc_id"), + vector=embeddings[i], + text=texts[i], + metadata=metadata, + ) + if self._enable_hybrid_search: + doc.__dict__["sparse_vector"] = self._bm25.encode_texts(texts[i]) + docs.append(doc) + self._client.upsert( + database_name=self._client_config.database, + collection_name=self.collection_name, + documents=docs, + timeout=self._client_config.timeout, ) - docs.append(doc) - self._db.collection(self._collection_name).upsert(docs, self._client_config.timeout) def text_exists(self, id: str) -> bool: - docs = self._db.collection(self._collection_name).query(document_ids=[id]) + docs = self._client.query( + database_name=self._client_config.database, 
collection_name=self.collection_name, document_ids=[id] + ) if docs and len(docs) > 0: return True return False @@ -142,17 +206,25 @@ class TencentVector(BaseVector): def delete_by_ids(self, ids: list[str]) -> None: if not ids: return - self._db.collection(self._collection_name).delete(document_ids=ids) + self._client.delete( + database_name=self._client_config.database, collection_name=self.collection_name, document_ids=ids + ) def delete_by_metadata_field(self, key: str, value: str) -> None: - self._db.collection(self._collection_name).delete(filter=Filter(Filter.In(f"metadata.{key}", [value]))) + self._client.delete( + database_name=self._client_config.database, + collection_name=self.collection_name, + filter=Filter(Filter.In(f"metadata.{key}", [value])), + ) def search_by_vector(self, query_vector: list[float], **kwargs: Any) -> list[Document]: document_ids_filter = kwargs.get("document_ids_filter") filter = None if document_ids_filter: filter = Filter(Filter.In("metadata.document_id", document_ids_filter)) - res = self._db.collection(self._collection_name).search( + res = self._client.search( + database_name=self._client_config.database, + collection_name=self.collection_name, vectors=[query_vector], filter=filter, params=document.HNSWSearchParams(ef=kwargs.get("ef", 10)), @@ -164,7 +236,32 @@ class TencentVector(BaseVector): return self._get_search_res(res, score_threshold) def search_by_full_text(self, query: str, **kwargs: Any) -> list[Document]: - return [] + if not self._enable_hybrid_search: + return [] + res = self._client.hybrid_search( + database_name=self._client_config.database, + collection_name=self.collection_name, + ann=[ + AnnSearch( + field_name="vector", + data=[0.0] * self._dimension, + ) + ], + match=[ + KeywordSearch( + field_name="sparse_vector", + data=self._bm25.encode_queries(query), + ), + ], + rerank=WeightedRerank( + field_list=["vector", "sparse_vector"], + weight=[0, 1], + ), + retrieve_vector=False, + limit=kwargs.get("top_k", 
4), + ) + score_threshold = float(kwargs.get("score_threshold") or 0.0) + return self._get_search_res(res, score_threshold) def _get_search_res(self, res: list | None, score_threshold: float) -> list[Document]: docs: list[Document] = [] @@ -173,9 +270,7 @@ class TencentVector(BaseVector): for result in res[0]: meta = result.get(self.field_metadata) - if meta is not None: - meta = json.loads(meta) - score = 1 - result.get("score", 0.0) + score = result.get("score", 0.0) if score > score_threshold: meta["score"] = score doc = Document(page_content=result.get(self.field_text), metadata=meta) @@ -184,7 +279,7 @@ class TencentVector(BaseVector): return docs def delete(self) -> None: - self._db.drop_collection(name=self._collection_name) + self._client.drop_collection(database_name=self._client_config.database, collection_name=self.collection_name) class TencentVectorFactory(AbstractVectorFactory): @@ -207,5 +302,6 @@ class TencentVectorFactory(AbstractVectorFactory): database=dify_config.TENCENT_VECTOR_DB_DATABASE, shard=dify_config.TENCENT_VECTOR_DB_SHARD, replicas=dify_config.TENCENT_VECTOR_DB_REPLICAS, + enable_hybrid_search=dify_config.TENCENT_VECTOR_DB_ENABLE_HYBRID_SEARCH or False, ), ) diff --git a/api/core/rag/datasource/vdb/tidb_on_qdrant/tidb_service.py b/api/core/rag/datasource/vdb/tidb_on_qdrant/tidb_service.py index 0a48c7951..3958280bd 100644 --- a/api/core/rag/datasource/vdb/tidb_on_qdrant/tidb_service.py +++ b/api/core/rag/datasource/vdb/tidb_on_qdrant/tidb_service.py @@ -22,7 +22,6 @@ class TidbService: :param iam_url: The URL of the TiDB Cloud IAM API (required). :param public_key: The public key for the API (required). :param private_key: The private key for the API (required). - :param display_name: The user-friendly display name of the cluster (required). :param region: The region where the cluster will be created (required). :return: The response from the API. 
@@ -149,13 +148,12 @@ class TidbService: ): """ Update the status of a new TiDB Serverless cluster. + :param tidb_serverless_list: The TiDB serverless list (required). :param project_id: The project ID of the TiDB Cloud project (required). :param api_url: The URL of the TiDB Cloud API (required). :param iam_url: The URL of the TiDB Cloud IAM API (required). :param public_key: The public key for the API (required). :param private_key: The private key for the API (required). - :param display_name: The user-friendly display name of the cluster (required). - :param region: The region where the cluster will be created (required). :return: The response from the API. """ @@ -186,12 +184,12 @@ class TidbService: ) -> list[dict]: """ Creates a new TiDB Serverless cluster. + :param batch_size: The batch size (required). :param project_id: The project ID of the TiDB Cloud project (required). :param api_url: The URL of the TiDB Cloud API (required). :param iam_url: The URL of the TiDB Cloud IAM API (required). :param public_key: The public key for the API (required). :param private_key: The private key for the API (required). - :param display_name: The user-friendly display name of the cluster (required). :param region: The region where the cluster will be created (required). :return: The response from the API. 
diff --git a/api/core/rag/datasource/vdb/vector_factory.py b/api/core/rag/datasource/vdb/vector_factory.py index 6b4d7b042..00601c38a 100644 --- a/api/core/rag/datasource/vdb/vector_factory.py +++ b/api/core/rag/datasource/vdb/vector_factory.py @@ -152,6 +152,10 @@ class Vector: from core.rag.datasource.vdb.opengauss.opengauss import OpenGaussFactory return OpenGaussFactory + case VectorType.TABLESTORE: + from core.rag.datasource.vdb.tablestore.tablestore_vector import TableStoreVectorFactory + + return TableStoreVectorFactory case _: raise ValueError(f"Vector store {vector_type} is not supported.") diff --git a/api/core/rag/datasource/vdb/vector_type.py b/api/core/rag/datasource/vdb/vector_type.py index e06988bfc..940f12cae 100644 --- a/api/core/rag/datasource/vdb/vector_type.py +++ b/api/core/rag/datasource/vdb/vector_type.py @@ -25,3 +25,4 @@ class VectorType(StrEnum): TIDB_ON_QDRANT = "tidb_on_qdrant" OCEANBASE = "oceanbase" OPENGAUSS = "opengauss" + TABLESTORE = "tablestore" diff --git a/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py b/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py index 01eaf947f..8fe619951 100644 --- a/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py +++ b/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py @@ -226,7 +226,6 @@ class WeaviateVector(BaseVector): Args: query: Text to look up documents similar to. - k: Number of Documents to return. Defaults to 4. Returns: List of Documents most similar to the query. 
diff --git a/api/core/rag/extractor/extract_processor.py b/api/core/rag/extractor/extract_processor.py index f9fd7f92a..bc19899ea 100644 --- a/api/core/rag/extractor/extract_processor.py +++ b/api/core/rag/extractor/extract_processor.py @@ -18,6 +18,7 @@ from core.rag.extractor.markdown_extractor import MarkdownExtractor from core.rag.extractor.notion_extractor import NotionExtractor from core.rag.extractor.pdf_extractor import PdfExtractor from core.rag.extractor.text_extractor import TextExtractor +from core.rag.extractor.unstructured.unstructured_doc_extractor import UnstructuredWordExtractor from core.rag.extractor.unstructured.unstructured_eml_extractor import UnstructuredEmailExtractor from core.rag.extractor.unstructured.unstructured_epub_extractor import UnstructuredEpubExtractor from core.rag.extractor.unstructured.unstructured_markdown_extractor import UnstructuredMarkdownExtractor @@ -25,6 +26,7 @@ from core.rag.extractor.unstructured.unstructured_msg_extractor import Unstructu from core.rag.extractor.unstructured.unstructured_ppt_extractor import UnstructuredPPTExtractor from core.rag.extractor.unstructured.unstructured_pptx_extractor import UnstructuredPPTXExtractor from core.rag.extractor.unstructured.unstructured_xml_extractor import UnstructuredXmlExtractor +from core.rag.extractor.watercrawl.extractor import WaterCrawlWebExtractor from core.rag.extractor.word_extractor import WordExtractor from core.rag.models.document import Document from extensions.ext_storage import storage @@ -104,7 +106,7 @@ class ExtractProcessor: etl_type = dify_config.ETL_TYPE extractor: Optional[BaseExtractor] = None if etl_type == "Unstructured": - unstructured_api_url = dify_config.UNSTRUCTURED_API_URL + unstructured_api_url = dify_config.UNSTRUCTURED_API_URL or "" unstructured_api_key = dify_config.UNSTRUCTURED_API_KEY or "" if file_extension in {".xlsx", ".xls"}: @@ -121,6 +123,8 @@ class ExtractProcessor: extractor = HtmlExtractor(file_path) elif file_extension == 
".docx": extractor = WordExtractor(file_path, upload_file.tenant_id, upload_file.created_by) + elif file_extension == ".doc": + extractor = UnstructuredWordExtractor(file_path, unstructured_api_url, unstructured_api_key) elif file_extension == ".csv": extractor = CSVExtractor(file_path, autodetect_encoding=True) elif file_extension == ".msg": @@ -180,6 +184,15 @@ class ExtractProcessor: only_main_content=extract_setting.website_info.only_main_content, ) return extractor.extract() + elif extract_setting.website_info.provider == "watercrawl": + extractor = WaterCrawlWebExtractor( + url=extract_setting.website_info.url, + job_id=extract_setting.website_info.job_id, + tenant_id=extract_setting.website_info.tenant_id, + mode=extract_setting.website_info.mode, + only_main_content=extract_setting.website_info.only_main_content, + ) + return extractor.extract() elif extract_setting.website_info.provider == "jinareader": extractor = JinaReaderWebExtractor( url=extract_setting.website_info.url, diff --git a/api/core/rag/extractor/firecrawl/firecrawl_web_extractor.py b/api/core/rag/extractor/firecrawl/firecrawl_web_extractor.py index 355a2fb20..4de831888 100644 --- a/api/core/rag/extractor/firecrawl/firecrawl_web_extractor.py +++ b/api/core/rag/extractor/firecrawl/firecrawl_web_extractor.py @@ -7,11 +7,10 @@ class FirecrawlWebExtractor(BaseExtractor): """ Crawl and scrape websites and return content in clean llm-ready markdown. - Args: url: The URL to scrape. - api_key: The API key for Firecrawl. - base_url: The base URL for the Firecrawl API. Defaults to 'https://api.firecrawl.dev'. + job_id: The crawl job id. + tenant_id: The tenant id. mode: The mode of operation. Defaults to 'scrape'. Options are 'crawl', 'scrape' and 'crawl_return_urls'. only_main_content: Only return the main content of the page excluding headers, navs, footers, etc. 
""" diff --git a/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py b/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py index a525c9e9e..5199208f7 100644 --- a/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py @@ -10,14 +10,11 @@ logger = logging.getLogger(__name__) class UnstructuredWordExtractor(BaseExtractor): """Loader that uses unstructured to load word documents.""" - def __init__( - self, - file_path: str, - api_url: str, - ): + def __init__(self, file_path: str, api_url: str, api_key: str = ""): """Initialize with file path.""" self._file_path = file_path self._api_url = api_url + self._api_key = api_key def extract(self) -> list[Document]: from unstructured.__version__ import __version__ as __unstructured_version__ @@ -41,9 +38,10 @@ class UnstructuredWordExtractor(BaseExtractor): ) if is_doc: - from unstructured.partition.doc import partition_doc + from unstructured.partition.api import partition_via_api + + elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key) - elements = partition_doc(filename=self._file_path) else: from unstructured.partition.docx import partition_docx diff --git a/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py b/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py index 35ca686f6..fa91f7dd0 100644 --- a/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py @@ -1,6 +1,8 @@ import logging from typing import Optional +import pypandoc # type: ignore + from core.rag.extractor.extractor_base import BaseExtractor from core.rag.models.document import Document @@ -34,6 +36,7 @@ class UnstructuredEpubExtractor(BaseExtractor): else: from unstructured.partition.epub import partition_epub + pypandoc.download_pandoc() elements = 
partition_epub(filename=self._file_path, xml_keep_tags=True) from unstructured.chunking.title import chunk_by_title diff --git a/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py b/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py index d5418e612..0a0c8d3a1 100644 --- a/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py @@ -14,15 +14,6 @@ class UnstructuredMarkdownExtractor(BaseExtractor): Args: file_path: Path to the file to load. - remove_hyperlinks: Whether to remove hyperlinks from the text. - - remove_images: Whether to remove images from the text. - - encoding: File encoding to use. If `None`, the file will be loaded - with the default system encoding. - - autodetect_encoding: Whether to try to autodetect the file encoding - if the specified encoding fails. """ def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""): diff --git a/api/core/rag/extractor/watercrawl/client.py b/api/core/rag/extractor/watercrawl/client.py new file mode 100644 index 000000000..6eaede7db --- /dev/null +++ b/api/core/rag/extractor/watercrawl/client.py @@ -0,0 +1,161 @@ +import json +from collections.abc import Generator +from typing import Union +from urllib.parse import urljoin + +import requests +from requests import Response + + +class BaseAPIClient: + def __init__(self, api_key, base_url): + self.api_key = api_key + self.base_url = base_url + self.session = self.init_session() + + def init_session(self): + session = requests.Session() + session.headers.update({"X-API-Key": self.api_key}) + session.headers.update({"Content-Type": "application/json"}) + session.headers.update({"Accept": "application/json"}) + session.headers.update({"User-Agent": "WaterCrawl-Plugin"}) + session.headers.update({"Accept-Language": "en-US"}) + return session + + def _get(self, endpoint: str, query_params: dict | None = None, **kwargs): 
+ return self.session.get(urljoin(self.base_url, endpoint), params=query_params, **kwargs) + + def _post(self, endpoint: str, query_params: dict | None = None, data: dict | None = None, **kwargs): + return self.session.post(urljoin(self.base_url, endpoint), params=query_params, json=data, **kwargs) + + def _put(self, endpoint: str, query_params: dict | None = None, data: dict | None = None, **kwargs): + return self.session.put(urljoin(self.base_url, endpoint), params=query_params, json=data, **kwargs) + + def _delete(self, endpoint: str, query_params: dict | None = None, **kwargs): + return self.session.delete(urljoin(self.base_url, endpoint), params=query_params, **kwargs) + + def _patch(self, endpoint: str, query_params: dict | None = None, data: dict | None = None, **kwargs): + return self.session.patch(urljoin(self.base_url, endpoint), params=query_params, json=data, **kwargs) + + +class WaterCrawlAPIClient(BaseAPIClient): + def __init__(self, api_key, base_url: str | None = "https://app.watercrawl.dev/"): + super().__init__(api_key, base_url) + + def process_eventstream(self, response: Response, download: bool = False) -> Generator: + for line in response.iter_lines(): + line = line.decode("utf-8") + if line.startswith("data:"): + line = line[5:].strip() + data = json.loads(line) + if data["type"] == "result" and download: + data["data"] = self.download_result(data["data"]) + yield data + + def process_response(self, response: Response) -> dict | bytes | list | None | Generator: + response.raise_for_status() + if response.status_code == 204: + return None + if response.headers.get("Content-Type") == "application/json": + return response.json() or {} + + if response.headers.get("Content-Type") == "application/octet-stream": + return response.content + + if response.headers.get("Content-Type") == "text/event-stream": + return self.process_eventstream(response) + + raise Exception(f"Unknown response type: {response.headers.get('Content-Type')}") + + def 
get_crawl_requests_list(self, page: int | None = None, page_size: int | None = None): + query_params = {"page": page or 1, "page_size": page_size or 10} + return self.process_response( + self._get( + "/api/v1/core/crawl-requests/", + query_params=query_params, + ) + ) + + def get_crawl_request(self, item_id: str): + return self.process_response( + self._get( + f"/api/v1/core/crawl-requests/{item_id}/", + ) + ) + + def create_crawl_request( + self, + url: Union[list, str] | None = None, + spider_options: dict | None = None, + page_options: dict | None = None, + plugin_options: dict | None = None, + ): + data = { + # 'urls': url if isinstance(url, list) else [url], + "url": url, + "options": { + "spider_options": spider_options or {}, + "page_options": page_options or {}, + "plugin_options": plugin_options or {}, + }, + } + return self.process_response( + self._post( + "/api/v1/core/crawl-requests/", + data=data, + ) + ) + + def stop_crawl_request(self, item_id: str): + return self.process_response( + self._delete( + f"/api/v1/core/crawl-requests/{item_id}/", + ) + ) + + def download_crawl_request(self, item_id: str): + return self.process_response( + self._get( + f"/api/v1/core/crawl-requests/{item_id}/download/", + ) + ) + + def monitor_crawl_request(self, item_id: str, prefetched=False) -> Generator: + query_params = {"prefetched": str(prefetched).lower()} + generator = self.process_response( + self._get(f"/api/v1/core/crawl-requests/{item_id}/status/", stream=True, query_params=query_params), + ) + if not isinstance(generator, Generator): + raise ValueError("Generator expected") + yield from generator + + def get_crawl_request_results( + self, item_id: str, page: int = 1, page_size: int = 25, query_params: dict | None = None + ): + query_params = query_params or {} + query_params.update({"page": page or 1, "page_size": page_size or 25}) + return self.process_response( + self._get(f"/api/v1/core/crawl-requests/{item_id}/results/", query_params=query_params) + ) + 
+ def scrape_url( + self, + url: str, + page_options: dict | None = None, + plugin_options: dict | None = None, + sync: bool = True, + prefetched: bool = True, + ): + response_result = self.create_crawl_request(url=url, page_options=page_options, plugin_options=plugin_options) + if not sync: + return response_result + + for event_data in self.monitor_crawl_request(response_result["uuid"], prefetched): + if event_data["type"] == "result": + return event_data["data"] + + def download_result(self, result_object: dict): + response = requests.get(result_object["result"]) + response.raise_for_status() + result_object["result"] = response.json() + return result_object diff --git a/api/core/rag/extractor/watercrawl/extractor.py b/api/core/rag/extractor/watercrawl/extractor.py new file mode 100644 index 000000000..40d174096 --- /dev/null +++ b/api/core/rag/extractor/watercrawl/extractor.py @@ -0,0 +1,57 @@ +from core.rag.extractor.extractor_base import BaseExtractor +from core.rag.models.document import Document +from services.website_service import WebsiteService + + +class WaterCrawlWebExtractor(BaseExtractor): + """ + Crawl and scrape websites and return content in clean llm-ready markdown. + + + Args: + url: The URL to scrape. + api_key: The API key for WaterCrawl. + base_url: The base URL for the Firecrawl API. Defaults to 'https://app.firecrawl.dev'. + mode: The mode of operation. Defaults to 'scrape'. Options are 'crawl', 'scrape' and 'crawl_return_urls'. + only_main_content: Only return the main content of the page excluding headers, navs, footers, etc. 
+ """ + + def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = True): + """Initialize with url, api_key, base_url and mode.""" + self._url = url + self.job_id = job_id + self.tenant_id = tenant_id + self.mode = mode + self.only_main_content = only_main_content + + def extract(self) -> list[Document]: + """Extract content from the URL.""" + documents = [] + if self.mode == "crawl": + crawl_data = WebsiteService.get_crawl_url_data(self.job_id, "watercrawl", self._url, self.tenant_id) + if crawl_data is None: + return [] + document = Document( + page_content=crawl_data.get("markdown", ""), + metadata={ + "source_url": crawl_data.get("source_url"), + "description": crawl_data.get("description"), + "title": crawl_data.get("title"), + }, + ) + documents.append(document) + elif self.mode == "scrape": + scrape_data = WebsiteService.get_scrape_url_data( + "watercrawl", self._url, self.tenant_id, self.only_main_content + ) + + document = Document( + page_content=scrape_data.get("markdown", ""), + metadata={ + "source_url": scrape_data.get("source_url"), + "description": scrape_data.get("description"), + "title": scrape_data.get("title"), + }, + ) + documents.append(document) + return documents diff --git a/api/core/rag/extractor/watercrawl/provider.py b/api/core/rag/extractor/watercrawl/provider.py new file mode 100644 index 000000000..b8003b386 --- /dev/null +++ b/api/core/rag/extractor/watercrawl/provider.py @@ -0,0 +1,117 @@ +from collections.abc import Generator +from datetime import datetime +from typing import Any + +from core.rag.extractor.watercrawl.client import WaterCrawlAPIClient + + +class WaterCrawlProvider: + def __init__(self, api_key, base_url: str | None = None): + self.client = WaterCrawlAPIClient(api_key, base_url) + + def crawl_url(self, url, options: dict | Any = None) -> dict: + options = options or {} + spider_options = { + "max_depth": 1, + "page_limit": 1, + "allowed_domains": [], + "exclude_paths": 
[], + "include_paths": [], + } + if options.get("crawl_sub_pages", True): + spider_options["page_limit"] = options.get("limit", 1) + spider_options["max_depth"] = options.get("depth", 1) + spider_options["include_paths"] = options.get("includes", "").split(",") if options.get("includes") else [] + spider_options["exclude_paths"] = options.get("excludes", "").split(",") if options.get("excludes") else [] + + wait_time = options.get("wait_time", 1000) + page_options = { + "exclude_tags": options.get("exclude_tags", "").split(",") if options.get("exclude_tags") else [], + "include_tags": options.get("include_tags", "").split(",") if options.get("include_tags") else [], + "wait_time": max(1000, wait_time), # minimum wait time is 1 second + "include_html": False, + "only_main_content": options.get("only_main_content", True), + "include_links": False, + "timeout": 15000, + "accept_cookies_selector": "#cookies-accept", + "locale": "en-US", + "actions": [], + } + result = self.client.create_crawl_request(url=url, spider_options=spider_options, page_options=page_options) + + return {"status": "active", "job_id": result.get("uuid")} + + def get_crawl_status(self, crawl_request_id) -> dict: + response = self.client.get_crawl_request(crawl_request_id) + data = [] + if response["status"] in ["new", "running"]: + status = "active" + else: + status = "completed" + data = list(self._get_results(crawl_request_id)) + + time_str = response.get("duration") + time_consuming: float = 0 + if time_str: + time_obj = datetime.strptime(time_str, "%H:%M:%S.%f") + time_consuming = ( + time_obj.hour * 3600 + time_obj.minute * 60 + time_obj.second + time_obj.microsecond / 1_000_000 + ) + + return { + "status": status, + "job_id": response.get("uuid"), + "total": response.get("options", {}).get("spider_options", {}).get("page_limit", 1), + "current": response.get("number_of_documents", 0), + "data": data, + "time_consuming": time_consuming, + } + + def get_crawl_url_data(self, job_id, url) -> 
dict | None: + if not job_id: + return self.scrape_url(url) + + for result in self._get_results( + job_id, + { + # filter by url + "url": url + }, + ): + return result + + return None + + def scrape_url(self, url: str) -> dict: + response = self.client.scrape_url(url=url, sync=True, prefetched=True) + return self._structure_data(response) + + def _structure_data(self, result_object: dict) -> dict: + if isinstance(result_object.get("result", {}), str): + raise ValueError("Invalid result object. Expected a dictionary.") + + metadata = result_object.get("result", {}).get("metadata", {}) + return { + "title": metadata.get("og:title") or metadata.get("title"), + "description": metadata.get("description"), + "source_url": result_object.get("url"), + "markdown": result_object.get("result", {}).get("markdown"), + } + + def _get_results(self, crawl_request_id: str, query_params: dict | None = None) -> Generator[dict, None, None]: + page = 0 + page_size = 100 + + query_params = query_params or {} + query_params.update({"prefetched": "true"}) + while True: + page += 1 + response = self.client.get_crawl_request_results(crawl_request_id, page, page_size, query_params) + if not response["results"]: + break + + for result in response["results"]: + yield self._structure_data(result) + + if response["next"] is None: + break diff --git a/api/core/rag/extractor/word_extractor.py b/api/core/rag/extractor/word_extractor.py index 0a6ffaa1d..70c618a63 100644 --- a/api/core/rag/extractor/word_extractor.py +++ b/api/core/rag/extractor/word_extractor.py @@ -85,7 +85,7 @@ class WordExtractor(BaseExtractor): if "image" in rel.target_ref: image_count += 1 if rel.is_external: - url = rel.reltype + url = rel.target_ref response = ssrf_proxy.get(url) if response.status_code == 200: image_ext = mimetypes.guess_extension(response.headers["Content-Type"]) diff --git a/api/core/rag/index_processor/constant/built_in_field.py b/api/core/rag/index_processor/constant/built_in_field.py index 
09c5e949e..c8ad53e3d 100644 --- a/api/core/rag/index_processor/constant/built_in_field.py +++ b/api/core/rag/index_processor/constant/built_in_field.py @@ -1,7 +1,7 @@ -from enum import Enum +from enum import Enum, StrEnum -class BuiltInField(str, Enum): +class BuiltInField(StrEnum): document_name = "document_name" uploader = "uploader" upload_date = "upload_date" diff --git a/api/core/rag/index_processor/constant/index_type.py b/api/core/rag/index_processor/constant/index_type.py index 0845b58e2..659086e80 100644 --- a/api/core/rag/index_processor/constant/index_type.py +++ b/api/core/rag/index_processor/constant/index_type.py @@ -1,7 +1,7 @@ -from enum import Enum +from enum import StrEnum -class IndexType(str, Enum): +class IndexType(StrEnum): PARAGRAPH_INDEX = "text_model" QA_INDEX = "qa_model" PARENT_CHILD_INDEX = "hierarchical_model" diff --git a/api/core/rag/index_processor/processor/parent_child_index_processor.py b/api/core/rag/index_processor/processor/parent_child_index_processor.py index 894b85339..1cde5e1c8 100644 --- a/api/core/rag/index_processor/processor/parent_child_index_processor.py +++ b/api/core/rag/index_processor/processor/parent_child_index_processor.py @@ -39,6 +39,8 @@ class ParentChildIndexProcessor(BaseIndexProcessor): all_documents = [] # type: ignore if rules.parent_mode == ParentMode.PARAGRAPH: # Split the text documents into nodes. 
+ if not rules.segmentation: + raise ValueError("No segmentation found in rules.") splitter = self._get_splitter( processing_rule_mode=process_rule.get("mode"), max_tokens=rules.segmentation.max_tokens, diff --git a/api/core/rag/retrieval/dataset_retrieval.py b/api/core/rag/retrieval/dataset_retrieval.py index 2fe46197f..e00c989c9 100644 --- a/api/core/rag/retrieval/dataset_retrieval.py +++ b/api/core/rag/retrieval/dataset_retrieval.py @@ -100,6 +100,7 @@ class DatasetRetrieval: :param hit_callback: hit callback :param message_id: message id :param memory: memory + :param inputs: inputs :return: """ dataset_ids = config.dataset_ids @@ -734,6 +735,7 @@ class DatasetRetrieval: Calculate keywords scores :param query: search query :param documents: documents for reranking + :param top_k: top k :return: """ @@ -850,8 +852,9 @@ class DatasetRetrieval: ) if automatic_metadata_filters: conditions = [] - for filter in automatic_metadata_filters: + for sequence, filter in enumerate(automatic_metadata_filters): self._process_metadata_filter_func( + sequence, filter.get("condition"), # type: ignore filter.get("metadata_name"), # type: ignore filter.get("value"), @@ -871,14 +874,18 @@ class DatasetRetrieval: elif metadata_filtering_mode == "manual": if metadata_filtering_conditions: metadata_condition = MetadataCondition(**metadata_filtering_conditions.model_dump()) - for condition in metadata_filtering_conditions.conditions: # type: ignore + for sequence, condition in enumerate(metadata_filtering_conditions.conditions): # type: ignore metadata_name = condition.name expected_value = condition.value if expected_value is not None or condition.comparison_operator in ("empty", "not empty"): if isinstance(expected_value, str): expected_value = self._replace_metadata_filter_value(expected_value, inputs) filters = self._process_metadata_filter_func( - condition.comparison_operator, metadata_name, expected_value, filters + sequence, + condition.comparison_operator, + metadata_name, + 
expected_value, + filters, ) else: raise ValueError("Invalid metadata filtering mode") @@ -900,7 +907,10 @@ class DatasetRetrieval: return str(inputs.get(key, f"{{{{{key}}}}}")) pattern = re.compile(r"\{\{(\w+)\}\}") - return pattern.sub(replacer, text) + output = pattern.sub(replacer, text) + if isinstance(output, str): + output = re.sub(r"[\r\n\t]+", " ", output).strip() + return output def _automatic_metadata_filter_func( self, dataset_ids: list, query: str, tenant_id: str, user_id: str, metadata_model_config: ModelConfig @@ -957,26 +967,36 @@ class DatasetRetrieval: return None return automatic_metadata_filters - def _process_metadata_filter_func(self, condition: str, metadata_name: str, value: Optional[Any], filters: list): + def _process_metadata_filter_func( + self, sequence: int, condition: str, metadata_name: str, value: Optional[Any], filters: list + ): + key = f"{metadata_name}_{sequence}" + key_value = f"{metadata_name}_{sequence}_value" match condition: case "contains": filters.append( - (text("documents.doc_metadata ->> :key LIKE :value")).params(key=metadata_name, value=f"%{value}%") + (text(f"documents.doc_metadata ->> :{key} LIKE :{key_value}")).params( + **{key: metadata_name, key_value: f"%{value}%"} + ) ) case "not contains": filters.append( - (text("documents.doc_metadata ->> :key NOT LIKE :value")).params( - key=metadata_name, value=f"%{value}%" + (text(f"documents.doc_metadata ->> :{key} NOT LIKE :{key_value}")).params( + **{key: metadata_name, key_value: f"%{value}%"} ) ) case "start with": filters.append( - (text("documents.doc_metadata ->> :key LIKE :value")).params(key=metadata_name, value=f"{value}%") + (text(f"documents.doc_metadata ->> :{key} LIKE :{key_value}")).params( + **{key: metadata_name, key_value: f"{value}%"} + ) ) case "end with": filters.append( - (text("documents.doc_metadata ->> :key LIKE :value")).params(key=metadata_name, value=f"%{value}") + (text(f"documents.doc_metadata ->> :{key} LIKE :{key_value}")).params( + 
**{key: metadata_name, key_value: f"%{value}"} + ) ) case "is" | "=": if isinstance(value, str): @@ -1000,7 +1020,7 @@ class DatasetRetrieval: filters.append(sqlalchemy_cast(DatasetDocument.doc_metadata[metadata_name].astext, Integer) < value) case "after" | ">": filters.append(sqlalchemy_cast(DatasetDocument.doc_metadata[metadata_name].astext, Integer) > value) - case "≤" | ">=": + case "≤" | "<=": filters.append(sqlalchemy_cast(DatasetDocument.doc_metadata[metadata_name].astext, Integer) <= value) case "≥" | ">=": filters.append(sqlalchemy_cast(DatasetDocument.doc_metadata[metadata_name].astext, Integer) >= value) @@ -1013,8 +1033,6 @@ class DatasetRetrieval: ) -> tuple[ModelInstance, ModelConfigWithCredentialsEntity]: """ Fetch model config - :param node_data: node data - :return: """ if model is None: raise ValueError("single_retrieval_config is required") diff --git a/api/core/rag/retrieval/router/multi_dataset_react_route.py b/api/core/rag/retrieval/router/multi_dataset_react_route.py index 05e8d043d..f0426ace1 100644 --- a/api/core/rag/retrieval/router/multi_dataset_react_route.py +++ b/api/core/rag/retrieval/router/multi_dataset_react_route.py @@ -235,6 +235,7 @@ class ReactMultiDatasetRouter: tools: List of tools the agent will have access to, used to format the prompt. prefix: String to put before the list of tools. + format_instructions: The format instruction prompt. Returns: A PromptTemplate with the template assembled from the pieces here. 
""" diff --git a/api/core/tools/__base/tool.py b/api/core/tools/__base/tool.py index 63937f5f7..35e16b5c8 100644 --- a/api/core/tools/__base/tool.py +++ b/api/core/tools/__base/tool.py @@ -29,9 +29,7 @@ class Tool(ABC): def fork_tool_runtime(self, runtime: ToolRuntime) -> "Tool": """ - fork a new tool with meta data - - :param meta: the meta data of a tool call processing, tenant_id is required + fork a new tool with metadata :return: the new tool """ return self.__class__( @@ -206,6 +204,7 @@ class Tool(ABC): create a blob message :param blob: the blob + :param meta: the meta info of blob object :return: the blob message """ return ToolInvokeMessage( diff --git a/api/core/tools/builtin_tool/provider.py b/api/core/tools/builtin_tool/provider.py index e77625852..4f733f0ea 100644 --- a/api/core/tools/builtin_tool/provider.py +++ b/api/core/tools/builtin_tool/provider.py @@ -35,7 +35,7 @@ class BuiltinToolProviderController(ToolProviderController): provider_yaml["credentials_for_provider"][credential_name]["name"] = credential_name credentials_schema = [] - for credential in provider_yaml.get("credentials_for_provider", {}): + for credential in provider_yaml.get("credentials_for_provider", {}).values(): credentials_schema.append(credential) super().__init__( @@ -153,7 +153,7 @@ class BuiltinToolProviderController(ToolProviderController): """ validate the credentials of the provider - :param tool_name: the name of the tool, defined in `get_tools` + :param user_id: use id :param credentials: the credentials of the tool """ # validate credentials format @@ -167,7 +167,7 @@ class BuiltinToolProviderController(ToolProviderController): """ validate the credentials of the provider - :param tool_name: the name of the tool, defined in `get_tools` + :param user_id: use id :param credentials: the credentials of the tool """ pass diff --git a/api/core/tools/builtin_tool/providers/webscraper/webscraper.yaml b/api/core/tools/builtin_tool/providers/webscraper/webscraper.yaml index 
d6d0a0d61..96edcf42f 100644 --- a/api/core/tools/builtin_tool/providers/webscraper/webscraper.yaml +++ b/api/core/tools/builtin_tool/providers/webscraper/webscraper.yaml @@ -12,4 +12,4 @@ identity: icon: icon.svg tags: - productivity -credentials_for_provider: [] +credentials_for_provider: {} diff --git a/api/core/tools/builtin_tool/tool.py b/api/core/tools/builtin_tool/tool.py index e61cda5de..7f37f98d0 100644 --- a/api/core/tools/builtin_tool/tool.py +++ b/api/core/tools/builtin_tool/tool.py @@ -28,9 +28,7 @@ class BuiltinTool(Tool): def fork_tool_runtime(self, runtime: ToolRuntime) -> "BuiltinTool": """ - fork a new tool with meta data - - :param meta: the meta data of a tool call processing, tenant_id is required + fork a new tool with metadata :return: the new tool """ return self.__class__( @@ -43,7 +41,7 @@ class BuiltinTool(Tool): """ invoke model - :param model_config: the model config + :param user_id: the user id :param prompt_messages: the prompt messages :param stop: the stop words :return: the model result @@ -64,7 +62,6 @@ class BuiltinTool(Tool): """ get max tokens - :param model_config: the model config :return: the max tokens """ if self.runtime is None: diff --git a/api/core/tools/custom_tool/provider.py b/api/core/tools/custom_tool/provider.py index 713353531..bfceb6679 100644 --- a/api/core/tools/custom_tool/provider.py +++ b/api/core/tools/custom_tool/provider.py @@ -145,7 +145,6 @@ class ApiToolProviderController(ToolProviderController): """ fetch tools from database - :param user_id: the user id :param tenant_id: the tenant id :return: the tools """ diff --git a/api/core/tools/custom_tool/tool.py b/api/core/tools/custom_tool/tool.py index 5003c1dd5..2f2f1ebbd 100644 --- a/api/core/tools/custom_tool/tool.py +++ b/api/core/tools/custom_tool/tool.py @@ -35,9 +35,7 @@ class ApiTool(Tool): def fork_tool_runtime(self, runtime: ToolRuntime): """ - fork a new tool with meta data - - :param meta: the meta data of a tool call processing, tenant_id is 
required + fork a new tool with metadata :return: the new tool """ if self.api_bundle is None: @@ -195,7 +193,12 @@ class ApiTool(Tool): properties = body_schema.get("properties", {}) for name, property in properties.items(): if name in parameters: - if property.get("format") == "binary": + # multiple file upload: if the type is array and the items have format as binary + if property.get("type") == "array" and property.get("items", {}).get("format") == "binary": + # parameters[name] should be a list of file objects. + for f in parameters[name]: + files.append((name, (f.filename, download(f), f.mime_type))) + elif property.get("format") == "binary": f = parameters[name] files.append((name, (f.filename, download(f), f.mime_type))) elif "$ref" in property: @@ -226,6 +229,13 @@ class ApiTool(Tool): else: body = body + # if there is a file upload, remove the Content-Type header + # so that httpx can automatically generate the boundary header required for multipart/form-data. + # issue: https://github.com/langgenius/dify/issues/13684 + # reference: https://stackoverflow.com/questions/39280438/fetch-missing-boundary-in-multipart-form-data-post + if files: + headers.pop("Content-Type", None) + if method in { "get", "head", diff --git a/api/core/tools/entities/tool_entities.py b/api/core/tools/entities/tool_entities.py index 83e69d063..d75676313 100644 --- a/api/core/tools/entities/tool_entities.py +++ b/api/core/tools/entities/tool_entities.py @@ -264,7 +264,7 @@ class ToolParameter(PluginParameter): :param name: the name of the parameter :param llm_description: the description presented to the LLM - :param type: the type of the parameter + :param typ: the type of the parameter :param required: if the parameter is required :param options: the options of the parameter """ diff --git a/api/core/tools/tool_engine.py b/api/core/tools/tool_engine.py index cf5411112..997917f31 100644 --- a/api/core/tools/tool_engine.py +++ b/api/core/tools/tool_engine.py @@ -313,7 +313,6 @@ 
class ToolEngine: """ Create message file - :param messages: messages :return: message file ids """ result = [] diff --git a/api/core/tools/tool_manager.py b/api/core/tools/tool_manager.py index 1caf02192..f2d0b74f7 100644 --- a/api/core/tools/tool_manager.py +++ b/api/core/tools/tool_manager.py @@ -161,8 +161,11 @@ class ToolManager: get the tool runtime :param provider_type: the type of the provider - :param provider_name: the name of the provider + :param provider_id: the id of the provider :param tool_name: the name of the tool + :param tenant_id: the tenant id + :param invoke_from: invoke from + :param tool_invoke_from: the tool invoke from :return: the tool """ @@ -427,8 +430,6 @@ class ToolManager: get the absolute path of the icon of the hardcoded provider :param provider: the name of the provider - :param tenant_id: the id of the tenant - :return: the absolute path of the icon, the mime type of the icon """ # get provider @@ -672,7 +673,8 @@ class ToolManager: """ get the api provider - :param provider_name: the name of the provider + :param tenant_id: the id of the tenant + :param provider_id: the id of the provider :return: the provider controller, the credentials """ diff --git a/api/core/tools/utils/model_invocation_utils.py b/api/core/tools/utils/model_invocation_utils.py index 245470ea4..3f59b3f47 100644 --- a/api/core/tools/utils/model_invocation_utils.py +++ b/api/core/tools/utils/model_invocation_utils.py @@ -84,12 +84,8 @@ class ModelInvocationUtils: :param user_id: user id :param tenant_id: tenant id, the tenant id of the creator of the tool - :param tool_provider: tool provider - :param tool_id: tool id + :param tool_type: tool type :param tool_name: tool name - :param provider: model provider - :param model: model name - :param model_parameters: model parameters :param prompt_messages: prompt messages :return: AssistantPromptMessage """ diff --git a/api/core/tools/utils/parser.py b/api/core/tools/utils/parser.py index c69fc1f82..f72291783 
100644 --- a/api/core/tools/utils/parser.py +++ b/api/core/tools/utils/parser.py @@ -186,6 +186,9 @@ class ApiBasedToolSchemaParser: return ToolParameter.ToolParameterType.BOOLEAN elif typ == "string": return ToolParameter.ToolParameterType.STRING + elif typ == "array": + items = parameter.get("items") or parameter.get("schema", {}).get("items") + return ToolParameter.ToolParameterType.FILES if items and items.get("format") == "binary" else None else: return None @@ -197,6 +200,8 @@ class ApiBasedToolSchemaParser: parse openapi yaml to tool bundle :param yaml: the yaml string + :param extra_info: the extra info + :param warning: the warning message :return: the tool bundle """ warning = warning if warning is not None else {} @@ -278,6 +283,8 @@ class ApiBasedToolSchemaParser: parse openapi plugin yaml to tool bundle :param json: the json string + :param extra_info: the extra info + :param warning: the warning message :return: the tool bundle """ warning = warning if warning is not None else {} @@ -312,6 +319,8 @@ class ApiBasedToolSchemaParser: auto parse to tool bundle :param content: the content + :param extra_info: the extra info + :param warning: the warning message :return: tools bundle, schema_type """ warning = warning if warning is not None else {} diff --git a/api/core/tools/workflow_as_tool/provider.py b/api/core/tools/workflow_as_tool/provider.py index 4777a019e..7661e1e6a 100644 --- a/api/core/tools/workflow_as_tool/provider.py +++ b/api/core/tools/workflow_as_tool/provider.py @@ -182,7 +182,6 @@ class WorkflowToolProviderController(ToolProviderController): """ fetch tools from database - :param user_id: the user id :param tenant_id: the tenant id :return: the tools """ diff --git a/api/core/tools/workflow_as_tool/tool.py b/api/core/tools/workflow_as_tool/tool.py index cf840880b..241b4a94d 100644 --- a/api/core/tools/workflow_as_tool/tool.py +++ b/api/core/tools/workflow_as_tool/tool.py @@ -127,9 +127,8 @@ class WorkflowTool(Tool): def 
fork_tool_runtime(self, runtime: ToolRuntime) -> "WorkflowTool": """ - fork a new tool with meta data + fork a new tool with metadata - :param meta: the meta data of a tool call processing, tenant_id is required :return: the new tool """ return self.__class__( @@ -212,7 +211,6 @@ class WorkflowTool(Tool): """ extract files from the result - :param result: the result :return: the result, files """ files: list[File] = [] diff --git a/api/core/workflow/entities/node_entities.py b/api/core/workflow/entities/node_entities.py index 70d40d87e..82fd6cdc3 100644 --- a/api/core/workflow/entities/node_entities.py +++ b/api/core/workflow/entities/node_entities.py @@ -30,6 +30,7 @@ class NodeRunMetadataKey(StrEnum): ITERATION_DURATION_MAP = "iteration_duration_map" # single iteration duration if iteration node runs LOOP_DURATION_MAP = "loop_duration_map" # single loop duration if loop node runs ERROR_STRATEGY = "error_strategy" # node in continue on error mode return the field + LOOP_VARIABLE_MAP = "loop_variable_map" # single loop variable output class NodeRunResult(BaseModel): diff --git a/api/core/workflow/graph_engine/graph_engine.py b/api/core/workflow/graph_engine/graph_engine.py index d0f3041d5..36273d8ec 100644 --- a/api/core/workflow/graph_engine/graph_engine.py +++ b/api/core/workflow/graph_engine/graph_engine.py @@ -641,6 +641,8 @@ class GraphEngine: try: # run node retry_start_at = datetime.now(UTC).replace(tzinfo=None) + # yield control to other threads + time.sleep(0.001) generator = node_instance.run() for item in generator: if isinstance(item, GraphEngineEvent): diff --git a/api/core/workflow/nodes/agent/agent_node.py b/api/core/workflow/nodes/agent/agent_node.py index be6db6677..7c8960fe4 100644 --- a/api/core/workflow/nodes/agent/agent_node.py +++ b/api/core/workflow/nodes/agent/agent_node.py @@ -1,15 +1,18 @@ import json from collections.abc import Generator, Mapping, Sequence -from typing import Any, cast +from typing import Any, Optional, cast from 
core.agent.entities import AgentToolEntity from core.agent.plugin_entities import AgentStrategyParameter -from core.model_manager import ModelManager -from core.model_runtime.entities.model_entities import ModelType +from core.memory.token_buffer_memory import TokenBufferMemory +from core.model_manager import ModelInstance, ModelManager +from core.model_runtime.entities.model_entities import AIModelEntity, ModelType from core.plugin.manager.exc import PluginDaemonClientSideError from core.plugin.manager.plugin import PluginInstallationManager +from core.provider_manager import ProviderManager from core.tools.entities.tool_entities import ToolParameter, ToolProviderType from core.tools.tool_manager import ToolManager +from core.variables.segments import StringSegment from core.workflow.entities.node_entities import NodeRunResult from core.workflow.entities.variable_pool import VariablePool from core.workflow.enums import SystemVariableKey @@ -19,7 +22,9 @@ from core.workflow.nodes.enums import NodeType from core.workflow.nodes.event.event import RunCompletedEvent from core.workflow.nodes.tool.tool_node import ToolNode from core.workflow.utils.variable_template_parser import VariableTemplateParser +from extensions.ext_database import db from factories.agent_factory import get_plugin_agent_strategy +from models.model import Conversation from models.workflow import WorkflowNodeExecutionStatus @@ -233,17 +238,20 @@ class AgentNode(ToolNode): value = tool_value if parameter.type == "model-selector": value = cast(dict[str, Any], value) - model_instance = ModelManager().get_model_instance( - tenant_id=self.tenant_id, - provider=value.get("provider", ""), - model_type=ModelType(value.get("model_type", "")), - model=value.get("model", ""), - ) - models = model_instance.model_type_instance.plugin_model_provider.declaration.models - finded_model = next((model for model in models if model.model == value.get("model", "")), None) - - value["entity"] = 
finded_model.model_dump(mode="json") if finded_model else None - + model_instance, model_schema = self._fetch_model(value) + # memory config + history_prompt_messages = [] + if node_data.memory: + memory = self._fetch_memory(model_instance) + if memory: + prompt_messages = memory.get_history_prompt_messages( + message_limit=node_data.memory.window.size if node_data.memory.window.size else None + ) + history_prompt_messages = [ + prompt_message.model_dump(mode="json") for prompt_message in prompt_messages + ] + value["history_prompt_messages"] = history_prompt_messages + value["entity"] = model_schema.model_dump(mode="json") if model_schema else None result[parameter_name] = value return result @@ -297,3 +305,46 @@ class AgentNode(ToolNode): except StopIteration: icon = None return icon + + def _fetch_memory(self, model_instance: ModelInstance) -> Optional[TokenBufferMemory]: + # get conversation id + conversation_id_variable = self.graph_runtime_state.variable_pool.get( + ["sys", SystemVariableKey.CONVERSATION_ID.value] + ) + if not isinstance(conversation_id_variable, StringSegment): + return None + conversation_id = conversation_id_variable.value + + # get conversation + conversation = ( + db.session.query(Conversation) + .filter(Conversation.app_id == self.app_id, Conversation.id == conversation_id) + .first() + ) + + if not conversation: + return None + + memory = TokenBufferMemory(conversation=conversation, model_instance=model_instance) + + return memory + + def _fetch_model(self, value: dict[str, Any]) -> tuple[ModelInstance, AIModelEntity | None]: + provider_manager = ProviderManager() + provider_model_bundle = provider_manager.get_provider_model_bundle( + tenant_id=self.tenant_id, provider=value.get("provider", ""), model_type=ModelType.LLM + ) + model_name = value.get("model", "") + model_credentials = provider_model_bundle.configuration.get_current_credentials( + model_type=ModelType.LLM, model=model_name + ) + provider_name = 
provider_model_bundle.configuration.provider.provider + model_type_instance = provider_model_bundle.model_type_instance + model_instance = ModelManager().get_model_instance( + tenant_id=self.tenant_id, + provider=provider_name, + model_type=ModelType(value.get("model_type", "")), + model=model_name, + ) + model_schema = model_type_instance.get_model_schema(model_name, model_credentials) + return model_instance, model_schema diff --git a/api/core/workflow/nodes/agent/entities.py b/api/core/workflow/nodes/agent/entities.py index a10cee69b..87cc7e982 100644 --- a/api/core/workflow/nodes/agent/entities.py +++ b/api/core/workflow/nodes/agent/entities.py @@ -3,6 +3,7 @@ from typing import Any, Literal, Union from pydantic import BaseModel +from core.prompt.entities.advanced_prompt_entities import MemoryConfig from core.tools.entities.tool_entities import ToolSelector from core.workflow.nodes.base.entities import BaseNodeData @@ -11,6 +12,7 @@ class AgentNodeData(BaseNodeData): agent_strategy_provider_name: str # redundancy agent_strategy_name: str agent_strategy_label: str # redundancy + memory: MemoryConfig | None = None class AgentInput(BaseModel): value: Union[list[str], list[ToolSelector], Any] diff --git a/api/core/workflow/nodes/code/code_node.py b/api/core/workflow/nodes/code/code_node.py index 9b3be3dea..212442dee 100644 --- a/api/core/workflow/nodes/code/code_node.py +++ b/api/core/workflow/nodes/code/code_node.py @@ -6,6 +6,7 @@ from core.helper.code_executor.code_executor import CodeExecutionError, CodeExec from core.helper.code_executor.code_node_provider import CodeNodeProvider from core.helper.code_executor.javascript.javascript_code_provider import JavascriptCodeProvider from core.helper.code_executor.python3.python3_code_provider import Python3CodeProvider +from core.variables.segments import ArrayFileSegment from core.workflow.entities.node_entities import NodeRunResult from core.workflow.nodes.base import BaseNode from 
core.workflow.nodes.code.control_extend import ExecutionControl # Extend: Adding execution control logic @@ -50,7 +51,10 @@ class CodeNode(BaseNode[CodeNodeData]): for variable_selector in self.node_data.variables: variable_name = variable_selector.variable variable = self.graph_runtime_state.variable_pool.get(variable_selector.value_selector) - variables[variable_name] = variable.to_object() if variable else None + if isinstance(variable, ArrayFileSegment): + variables[variable_name] = [v.to_dict() for v in variable.value] if variable.value else None + else: + variables[variable_name] = variable.to_object() if variable else None # Run code try: # Extend: Start Adding execution control logic diff --git a/api/core/workflow/nodes/document_extractor/node.py b/api/core/workflow/nodes/document_extractor/node.py index 0964b8b71..960d0c396 100644 --- a/api/core/workflow/nodes/document_extractor/node.py +++ b/api/core/workflow/nodes/document_extractor/node.py @@ -9,6 +9,7 @@ from typing import Any, cast import docx import pandas as pd +import pypandoc # type: ignore import pypdfium2 # type: ignore import yaml # type: ignore from docx.document import Document @@ -369,7 +370,7 @@ def _extract_text_from_ppt(file_content: bytes) -> str: from unstructured.partition.ppt import partition_ppt try: - if dify_config.UNSTRUCTURED_API_URL and dify_config.UNSTRUCTURED_API_KEY: + if dify_config.UNSTRUCTURED_API_URL: with tempfile.NamedTemporaryFile(suffix=".ppt", delete=False) as temp_file: temp_file.write(file_content) temp_file.flush() @@ -378,7 +379,7 @@ def _extract_text_from_ppt(file_content: bytes) -> str: file=file, metadata_filename=temp_file.name, api_url=dify_config.UNSTRUCTURED_API_URL, - api_key=dify_config.UNSTRUCTURED_API_KEY, + api_key=dify_config.UNSTRUCTURED_API_KEY, # type: ignore ) os.unlink(temp_file.name) else: @@ -395,7 +396,7 @@ def _extract_text_from_pptx(file_content: bytes) -> str: from unstructured.partition.pptx import partition_pptx try: - if 
dify_config.UNSTRUCTURED_API_URL and dify_config.UNSTRUCTURED_API_KEY: + if dify_config.UNSTRUCTURED_API_URL: with tempfile.NamedTemporaryFile(suffix=".pptx", delete=False) as temp_file: temp_file.write(file_content) temp_file.flush() @@ -404,7 +405,7 @@ def _extract_text_from_pptx(file_content: bytes) -> str: file=file, metadata_filename=temp_file.name, api_url=dify_config.UNSTRUCTURED_API_URL, - api_key=dify_config.UNSTRUCTURED_API_KEY, + api_key=dify_config.UNSTRUCTURED_API_KEY, # type: ignore ) os.unlink(temp_file.name) else: @@ -416,11 +417,26 @@ def _extract_text_from_pptx(file_content: bytes) -> str: def _extract_text_from_epub(file_content: bytes) -> str: + from unstructured.partition.api import partition_via_api from unstructured.partition.epub import partition_epub try: - with io.BytesIO(file_content) as file: - elements = partition_epub(file=file) + if dify_config.UNSTRUCTURED_API_URL: + with tempfile.NamedTemporaryFile(suffix=".epub", delete=False) as temp_file: + temp_file.write(file_content) + temp_file.flush() + with open(temp_file.name, "rb") as file: + elements = partition_via_api( + file=file, + metadata_filename=temp_file.name, + api_url=dify_config.UNSTRUCTURED_API_URL, + api_key=dify_config.UNSTRUCTURED_API_KEY, # type: ignore + ) + os.unlink(temp_file.name) + else: + pypandoc.download_pandoc() + with io.BytesIO(file_content) as file: + elements = partition_epub(file=file) return "\n".join([str(element) for element in elements]) except Exception as e: raise TextExtractionError(f"Failed to extract text from EPUB: {str(e)}") from e diff --git a/api/core/workflow/nodes/enums.py b/api/core/workflow/nodes/enums.py index d9a2c2d8a..73b43eeaf 100644 --- a/api/core/workflow/nodes/enums.py +++ b/api/core/workflow/nodes/enums.py @@ -17,6 +17,7 @@ class NodeType(StrEnum): LEGACY_VARIABLE_AGGREGATOR = "variable-assigner" # TODO: Merge this into VARIABLE_AGGREGATOR in the database. 
LOOP = "loop" LOOP_START = "loop-start" + LOOP_END = "loop-end" ITERATION = "iteration" ITERATION_START = "iteration-start" # Fake start node for iteration. PARAMETER_EXTRACTOR = "parameter-extractor" diff --git a/api/core/workflow/nodes/if_else/entities.py b/api/core/workflow/nodes/if_else/entities.py index 23f5d2cc3..67d6d6a88 100644 --- a/api/core/workflow/nodes/if_else/entities.py +++ b/api/core/workflow/nodes/if_else/entities.py @@ -8,7 +8,7 @@ from core.workflow.utils.condition.entities import Condition class IfElseNodeData(BaseNodeData): """ - Answer Node Data. + If Else Node Data. """ class Case(BaseModel): diff --git a/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py b/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py index 5643e37ce..860373948 100644 --- a/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py +++ b/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py @@ -1,5 +1,6 @@ import json import logging +import re import time from collections import defaultdict from collections.abc import Mapping, Sequence @@ -331,8 +332,9 @@ class KnowledgeRetrievalNode(LLMNode): automatic_metadata_filters = self._automatic_metadata_filter_func(dataset_ids, query, node_data) if automatic_metadata_filters: conditions = [] - for filter in automatic_metadata_filters: + for sequence, filter in enumerate(automatic_metadata_filters): self._process_metadata_filter_func( + sequence, filter.get("condition", ""), filter.get("metadata_name", ""), filter.get("value"), @@ -353,17 +355,26 @@ class KnowledgeRetrievalNode(LLMNode): if node_data.metadata_filtering_conditions: metadata_condition = MetadataCondition(**node_data.metadata_filtering_conditions.model_dump()) if node_data.metadata_filtering_conditions: - for condition in node_data.metadata_filtering_conditions.conditions: # type: ignore + for sequence, condition in enumerate(node_data.metadata_filtering_conditions.conditions): # type: ignore 
metadata_name = condition.name expected_value = condition.value if expected_value is not None or condition.comparison_operator in ("empty", "not empty"): if isinstance(expected_value, str): expected_value = self.graph_runtime_state.variable_pool.convert_template( expected_value - ).text - + ).value[0] + if expected_value.value_type == "number": # type: ignore + expected_value = expected_value.value # type: ignore + elif expected_value.value_type == "string": # type: ignore + expected_value = re.sub(r"[\r\n\t]+", " ", expected_value.text).strip() # type: ignore + else: + raise ValueError("Invalid expected metadata value type") filters = self._process_metadata_filter_func( - condition.comparison_operator, metadata_name, expected_value, filters + sequence, + condition.comparison_operator, + metadata_name, + expected_value, + filters, ) else: raise ValueError("Invalid metadata filtering mode") @@ -442,25 +453,35 @@ class KnowledgeRetrievalNode(LLMNode): return [] return automatic_metadata_filters - def _process_metadata_filter_func(self, condition: str, metadata_name: str, value: Optional[str], filters: list): + def _process_metadata_filter_func( + self, sequence: int, condition: str, metadata_name: str, value: Optional[Any], filters: list + ): + key = f"{metadata_name}_{sequence}" + key_value = f"{metadata_name}_{sequence}_value" match condition: case "contains": filters.append( - (text("documents.doc_metadata ->> :key LIKE :value")).params(key=metadata_name, value=f"%{value}%") + (text(f"documents.doc_metadata ->> :{key} LIKE :{key_value}")).params( + **{key: metadata_name, key_value: f"%{value}%"} + ) ) case "not contains": filters.append( - (text("documents.doc_metadata ->> :key NOT LIKE :value")).params( - key=metadata_name, value=f"%{value}%" + (text(f"documents.doc_metadata ->> :{key} NOT LIKE :{key_value}")).params( + **{key: metadata_name, key_value: f"%{value}%"} ) ) case "start with": filters.append( - (text("documents.doc_metadata ->> :key LIKE 
:value")).params(key=metadata_name, value=f"{value}%") + (text(f"documents.doc_metadata ->> :{key} LIKE :{key_value}")).params( + **{key: metadata_name, key_value: f"{value}%"} + ) ) case "end with": filters.append( - (text("documents.doc_metadata ->> :key LIKE :value")).params(key=metadata_name, value=f"%{value}") + (text(f"documents.doc_metadata ->> :{key} LIKE :{key_value}")).params( + **{key: metadata_name, key_value: f"%{value}"} + ) ) case "=" | "is": if isinstance(value, str): diff --git a/api/core/workflow/nodes/loop/__init__.py b/api/core/workflow/nodes/loop/__init__.py index 9dd33be0a..9fe695607 100644 --- a/api/core/workflow/nodes/loop/__init__.py +++ b/api/core/workflow/nodes/loop/__init__.py @@ -1,5 +1,6 @@ from .entities import LoopNodeData +from .loop_end_node import LoopEndNode from .loop_node import LoopNode from .loop_start_node import LoopStartNode -__all__ = ["LoopNode", "LoopNodeData", "LoopStartNode"] +__all__ = ["LoopEndNode", "LoopNode", "LoopNodeData", "LoopStartNode"] diff --git a/api/core/workflow/nodes/loop/entities.py b/api/core/workflow/nodes/loop/entities.py index 4f9c149bd..16802311d 100644 --- a/api/core/workflow/nodes/loop/entities.py +++ b/api/core/workflow/nodes/loop/entities.py @@ -1,11 +1,23 @@ +from collections.abc import Mapping from typing import Any, Literal, Optional -from pydantic import Field +from pydantic import BaseModel, Field from core.workflow.nodes.base import BaseLoopNodeData, BaseLoopState, BaseNodeData from core.workflow.utils.condition.entities import Condition +class LoopVariableData(BaseModel): + """ + Loop Variable Data. + """ + + label: str + var_type: Literal["string", "number", "object", "array[string]", "array[number]", "array[object]"] + value_type: Literal["variable", "constant"] + value: Optional[Any | list[str]] = None + + class LoopNodeData(BaseLoopNodeData): """ Loop Node Data. 
@@ -14,6 +26,8 @@ class LoopNodeData(BaseLoopNodeData): loop_count: int # Maximum number of loops break_conditions: list[Condition] # Conditions to break the loop logical_operator: Literal["and", "or"] + loop_variables: Optional[list[LoopVariableData]] = Field(default_factory=list) + outputs: Optional[Mapping[str, Any]] = None class LoopStartNodeData(BaseNodeData): @@ -24,6 +38,14 @@ class LoopStartNodeData(BaseNodeData): pass +class LoopEndNodeData(BaseNodeData): + """ + Loop End Node Data. + """ + + pass + + class LoopState(BaseLoopState): """ Loop State. diff --git a/api/core/workflow/nodes/loop/loop_end_node.py b/api/core/workflow/nodes/loop/loop_end_node.py new file mode 100644 index 000000000..5d4ce0ccb --- /dev/null +++ b/api/core/workflow/nodes/loop/loop_end_node.py @@ -0,0 +1,20 @@ +from core.workflow.entities.node_entities import NodeRunResult +from core.workflow.nodes.base import BaseNode +from core.workflow.nodes.enums import NodeType +from core.workflow.nodes.loop.entities import LoopEndNodeData +from models.workflow import WorkflowNodeExecutionStatus + + +class LoopEndNode(BaseNode[LoopEndNodeData]): + """ + Loop End Node. + """ + + _node_data_cls = LoopEndNodeData + _node_type = NodeType.LOOP_END + + def _run(self) -> NodeRunResult: + """ + Run the node. 
+ """ + return NodeRunResult(status=WorkflowNodeExecutionStatus.SUCCEEDED) diff --git a/api/core/workflow/nodes/loop/loop_node.py b/api/core/workflow/nodes/loop/loop_node.py index 65acf1211..eae33c0a9 100644 --- a/api/core/workflow/nodes/loop/loop_node.py +++ b/api/core/workflow/nodes/loop/loop_node.py @@ -1,10 +1,20 @@ +import json import logging from collections.abc import Generator, Mapping, Sequence from datetime import UTC, datetime -from typing import Any, cast +from typing import TYPE_CHECKING, Any, Literal, cast from configs import dify_config -from core.variables import IntegerSegment +from core.variables import ( + ArrayNumberSegment, + ArrayObjectSegment, + ArrayStringSegment, + IntegerSegment, + ObjectSegment, + Segment, + SegmentType, + StringSegment, +) from core.workflow.entities.node_entities import NodeRunMetadataKey, NodeRunResult from core.workflow.graph_engine.entities.event import ( BaseGraphEvent, @@ -29,6 +39,10 @@ from core.workflow.nodes.loop.entities import LoopNodeData from core.workflow.utils.condition.processor import ConditionProcessor from models.workflow import WorkflowNodeExecutionStatus +if TYPE_CHECKING: + from core.workflow.entities.variable_pool import VariablePool + from core.workflow.graph_engine.graph_engine import GraphEngine + logger = logging.getLogger(__name__) @@ -61,6 +75,28 @@ class LoopNode(BaseNode[LoopNodeData]): variable_pool = self.graph_runtime_state.variable_pool variable_pool.add([self.node_id, "index"], 0) + # Initialize loop variables + loop_variable_selectors = {} + if self.node_data.loop_variables: + for loop_variable in self.node_data.loop_variables: + value_processor = { + "constant": lambda var=loop_variable: self._get_segment_for_constant(var.var_type, var.value), + "variable": lambda var=loop_variable: variable_pool.get(var.value), + } + + if loop_variable.value_type not in value_processor: + raise ValueError( + f"Invalid value type '{loop_variable.value_type}' for loop variable {loop_variable.label}" 
+ ) + + processed_segment = value_processor[loop_variable.value_type]() + if not processed_segment: + raise ValueError(f"Invalid value for loop variable {loop_variable.label}") + variable_selector = [self.node_id, loop_variable.label] + variable_pool.add(variable_selector, processed_segment.value) + loop_variable_selectors[loop_variable.label] = variable_selector + inputs[loop_variable.label] = processed_segment.value + from core.workflow.graph_engine.graph_engine import GraphEngine graph_engine = GraphEngine( @@ -95,135 +131,51 @@ class LoopNode(BaseNode[LoopNodeData]): predecessor_node_id=self.previous_node_id, ) - yield LoopRunNextEvent( - loop_id=self.id, - loop_node_id=self.node_id, - loop_node_type=self.node_type, - loop_node_data=self.node_data, - index=0, - pre_loop_output=None, - ) - + # yield LoopRunNextEvent( + # loop_id=self.id, + # loop_node_id=self.node_id, + # loop_node_type=self.node_type, + # loop_node_data=self.node_data, + # index=0, + # pre_loop_output=None, + # ) + loop_duration_map = {} + single_loop_variable_map = {} # single loop variable output try: check_break_result = False for i in range(loop_count): - # Run workflow - rst = graph_engine.run() - current_index_variable = variable_pool.get([self.node_id, "index"]) - if not isinstance(current_index_variable, IntegerSegment): - raise ValueError(f"loop {self.node_id} current index not found") - current_index = current_index_variable.value + loop_start_time = datetime.now(UTC).replace(tzinfo=None) + # run single loop + loop_result = yield from self._run_single_loop( + graph_engine=graph_engine, + loop_graph=loop_graph, + variable_pool=variable_pool, + loop_variable_selectors=loop_variable_selectors, + break_conditions=break_conditions, + logical_operator=logical_operator, + condition_processor=condition_processor, + current_index=i, + start_at=start_at, + inputs=inputs, + ) + loop_end_time = datetime.now(UTC).replace(tzinfo=None) - check_break_result = False - - for event in rst: - if 
isinstance(event, (BaseNodeEvent | BaseParallelBranchEvent)) and not event.in_loop_id: - event.in_loop_id = self.node_id - - if ( - isinstance(event, BaseNodeEvent) - and event.node_type == NodeType.LOOP_START - and not isinstance(event, NodeRunStreamChunkEvent) - ): - continue - - if isinstance(event, NodeRunSucceededEvent): - yield self._handle_event_metadata(event=event, iter_run_index=current_index) - - # Check if all variables in break conditions exist - exists_variable = False - for condition in break_conditions: - if not self.graph_runtime_state.variable_pool.get(condition.variable_selector): - exists_variable = False - break - else: - exists_variable = True - if exists_variable: - input_conditions, group_result, check_break_result = condition_processor.process_conditions( - variable_pool=self.graph_runtime_state.variable_pool, - conditions=break_conditions, - operator=logical_operator, - ) - if check_break_result: - break - - elif isinstance(event, BaseGraphEvent): - if isinstance(event, GraphRunFailedEvent): - # Loop run failed - yield LoopRunFailedEvent( - loop_id=self.id, - loop_node_id=self.node_id, - loop_node_type=self.node_type, - loop_node_data=self.node_data, - start_at=start_at, - inputs=inputs, - steps=i, - metadata={ - NodeRunMetadataKey.TOTAL_TOKENS: graph_engine.graph_runtime_state.total_tokens, - "completed_reason": "error", - }, - error=event.error, - ) - yield RunCompletedEvent( - run_result=NodeRunResult( - status=WorkflowNodeExecutionStatus.FAILED, - error=event.error, - metadata={ - NodeRunMetadataKey.TOTAL_TOKENS: graph_engine.graph_runtime_state.total_tokens - }, - ) - ) - return - elif isinstance(event, NodeRunFailedEvent): - # Loop run failed - yield event - yield LoopRunFailedEvent( - loop_id=self.id, - loop_node_id=self.node_id, - loop_node_type=self.node_type, - loop_node_data=self.node_data, - start_at=start_at, - inputs=inputs, - steps=i, - metadata={ - NodeRunMetadataKey.TOTAL_TOKENS: 
graph_engine.graph_runtime_state.total_tokens, - "completed_reason": "error", - }, - error=event.error, - ) - yield RunCompletedEvent( - run_result=NodeRunResult( - status=WorkflowNodeExecutionStatus.FAILED, - error=event.error, - metadata={ - NodeRunMetadataKey.TOTAL_TOKENS: graph_engine.graph_runtime_state.total_tokens - }, - ) - ) - return + single_loop_variable = {} + for key, selector in loop_variable_selectors.items(): + item = variable_pool.get(selector) + if item: + single_loop_variable[key] = item.value else: - yield self._handle_event_metadata(event=cast(InNodeEvent, event), iter_run_index=current_index) + single_loop_variable[key] = None - # Remove all nodes outputs from variable pool - for node_id in loop_graph.node_ids: - variable_pool.remove([node_id]) + loop_duration_map[str(i)] = (loop_end_time - loop_start_time).total_seconds() + single_loop_variable_map[str(i)] = single_loop_variable + + check_break_result = loop_result.get("check_break_result", False) if check_break_result: break - # Move to next loop - next_index = current_index + 1 - variable_pool.add([self.node_id, "index"], next_index) - - yield LoopRunNextEvent( - loop_id=self.id, - loop_node_id=self.node_id, - loop_node_type=self.node_type, - loop_node_data=self.node_data, - index=next_index, - pre_loop_output=None, - ) - # Loop completed successfully yield LoopRunSucceededEvent( loop_id=self.id, @@ -232,17 +184,26 @@ class LoopNode(BaseNode[LoopNodeData]): loop_node_data=self.node_data, start_at=start_at, inputs=inputs, + outputs=self.node_data.outputs, steps=loop_count, metadata={ NodeRunMetadataKey.TOTAL_TOKENS: graph_engine.graph_runtime_state.total_tokens, "completed_reason": "loop_break" if check_break_result else "loop_completed", + NodeRunMetadataKey.LOOP_DURATION_MAP: loop_duration_map, + NodeRunMetadataKey.LOOP_VARIABLE_MAP: single_loop_variable_map, }, ) yield RunCompletedEvent( run_result=NodeRunResult( status=WorkflowNodeExecutionStatus.SUCCEEDED, - 
metadata={NodeRunMetadataKey.TOTAL_TOKENS: graph_engine.graph_runtime_state.total_tokens}, + metadata={ + NodeRunMetadataKey.TOTAL_TOKENS: graph_engine.graph_runtime_state.total_tokens, + NodeRunMetadataKey.LOOP_DURATION_MAP: loop_duration_map, + NodeRunMetadataKey.LOOP_VARIABLE_MAP: single_loop_variable_map, + }, + outputs=self.node_data.outputs, + inputs=inputs, ) ) @@ -260,6 +221,8 @@ class LoopNode(BaseNode[LoopNodeData]): metadata={ "total_tokens": graph_engine.graph_runtime_state.total_tokens, "completed_reason": "error", + NodeRunMetadataKey.LOOP_DURATION_MAP: loop_duration_map, + NodeRunMetadataKey.LOOP_VARIABLE_MAP: single_loop_variable_map, }, error=str(e), ) @@ -268,7 +231,11 @@ class LoopNode(BaseNode[LoopNodeData]): run_result=NodeRunResult( status=WorkflowNodeExecutionStatus.FAILED, error=str(e), - metadata={NodeRunMetadataKey.TOTAL_TOKENS: graph_engine.graph_runtime_state.total_tokens}, + metadata={ + NodeRunMetadataKey.TOTAL_TOKENS: graph_engine.graph_runtime_state.total_tokens, + NodeRunMetadataKey.LOOP_DURATION_MAP: loop_duration_map, + NodeRunMetadataKey.LOOP_VARIABLE_MAP: single_loop_variable_map, + }, ) ) @@ -276,6 +243,159 @@ class LoopNode(BaseNode[LoopNodeData]): # Clean up variable_pool.remove([self.node_id, "index"]) + def _run_single_loop( + self, + *, + graph_engine: "GraphEngine", + loop_graph: Graph, + variable_pool: "VariablePool", + loop_variable_selectors: dict, + break_conditions: list, + logical_operator: Literal["and", "or"], + condition_processor: ConditionProcessor, + current_index: int, + start_at: datetime, + inputs: dict, + ) -> Generator[NodeEvent | InNodeEvent, None, dict]: + """Run a single loop iteration. 
+ Returns: + dict: {'check_break_result': bool} + """ + # Run workflow + rst = graph_engine.run() + current_index_variable = variable_pool.get([self.node_id, "index"]) + if not isinstance(current_index_variable, IntegerSegment): + raise ValueError(f"loop {self.node_id} current index not found") + current_index = current_index_variable.value + + check_break_result = False + + for event in rst: + if isinstance(event, (BaseNodeEvent | BaseParallelBranchEvent)) and not event.in_loop_id: + event.in_loop_id = self.node_id + + if ( + isinstance(event, BaseNodeEvent) + and event.node_type == NodeType.LOOP_START + and not isinstance(event, NodeRunStreamChunkEvent) + ): + continue + + if ( + isinstance(event, NodeRunSucceededEvent) + and event.node_type == NodeType.LOOP_END + and not isinstance(event, NodeRunStreamChunkEvent) + ): + check_break_result = True + yield self._handle_event_metadata(event=event, iter_run_index=current_index) + break + + if isinstance(event, NodeRunSucceededEvent): + yield self._handle_event_metadata(event=event, iter_run_index=current_index) + + # Check if all variables in break conditions exist + exists_variable = False + for condition in break_conditions: + if not self.graph_runtime_state.variable_pool.get(condition.variable_selector): + exists_variable = False + break + else: + exists_variable = True + if exists_variable: + input_conditions, group_result, check_break_result = condition_processor.process_conditions( + variable_pool=self.graph_runtime_state.variable_pool, + conditions=break_conditions, + operator=logical_operator, + ) + if check_break_result: + break + + elif isinstance(event, BaseGraphEvent): + if isinstance(event, GraphRunFailedEvent): + # Loop run failed + yield LoopRunFailedEvent( + loop_id=self.id, + loop_node_id=self.node_id, + loop_node_type=self.node_type, + loop_node_data=self.node_data, + start_at=start_at, + inputs=inputs, + steps=current_index, + metadata={ + NodeRunMetadataKey.TOTAL_TOKENS: 
graph_engine.graph_runtime_state.total_tokens, + "completed_reason": "error", + }, + error=event.error, + ) + yield RunCompletedEvent( + run_result=NodeRunResult( + status=WorkflowNodeExecutionStatus.FAILED, + error=event.error, + metadata={NodeRunMetadataKey.TOTAL_TOKENS: graph_engine.graph_runtime_state.total_tokens}, + ) + ) + return {"check_break_result": True} + elif isinstance(event, NodeRunFailedEvent): + # Loop run failed + yield event + yield LoopRunFailedEvent( + loop_id=self.id, + loop_node_id=self.node_id, + loop_node_type=self.node_type, + loop_node_data=self.node_data, + start_at=start_at, + inputs=inputs, + steps=current_index, + metadata={ + NodeRunMetadataKey.TOTAL_TOKENS: graph_engine.graph_runtime_state.total_tokens, + "completed_reason": "error", + }, + error=event.error, + ) + yield RunCompletedEvent( + run_result=NodeRunResult( + status=WorkflowNodeExecutionStatus.FAILED, + error=event.error, + metadata={NodeRunMetadataKey.TOTAL_TOKENS: graph_engine.graph_runtime_state.total_tokens}, + ) + ) + return {"check_break_result": True} + else: + yield self._handle_event_metadata(event=cast(InNodeEvent, event), iter_run_index=current_index) + + # Remove all nodes outputs from variable pool + for node_id in loop_graph.node_ids: + variable_pool.remove([node_id]) + + _outputs = {} + for loop_variable_key, loop_variable_selector in loop_variable_selectors.items(): + _loop_variable_segment = variable_pool.get(loop_variable_selector) + if _loop_variable_segment: + _outputs[loop_variable_key] = _loop_variable_segment.value + else: + _outputs[loop_variable_key] = None + + _outputs["loop_round"] = current_index + 1 + self.node_data.outputs = _outputs + + if check_break_result: + return {"check_break_result": True} + + # Move to next loop + next_index = current_index + 1 + variable_pool.add([self.node_id, "index"], next_index) + + yield LoopRunNextEvent( + loop_id=self.id, + loop_node_id=self.node_id, + loop_node_type=self.node_type, + 
loop_node_data=self.node_data, + index=next_index, + pre_loop_output=self.node_data.outputs, + ) + + return {"check_break_result": False} + def _handle_event_metadata( self, *, @@ -360,3 +480,25 @@ class LoopNode(BaseNode[LoopNodeData]): } return variable_mapping + + @staticmethod + def _get_segment_for_constant(var_type: str, value: Any) -> Segment: + """Get the appropriate segment type for a constant value.""" + segment_mapping: dict[str, tuple[type[Segment], SegmentType]] = { + "string": (StringSegment, SegmentType.STRING), + "number": (IntegerSegment, SegmentType.NUMBER), + "object": (ObjectSegment, SegmentType.OBJECT), + "array[string]": (ArrayStringSegment, SegmentType.ARRAY_STRING), + "array[number]": (ArrayNumberSegment, SegmentType.ARRAY_NUMBER), + "array[object]": (ArrayObjectSegment, SegmentType.ARRAY_OBJECT), + } + if var_type in ["array[string]", "array[number]", "array[object]"]: + if value: + value = json.loads(value) + else: + value = [] + segment_info = segment_mapping.get(var_type) + if not segment_info: + raise ValueError(f"Invalid variable type: {var_type}") + segment_class, value_type = segment_info + return segment_class(value=value, value_type=value_type) diff --git a/api/core/workflow/nodes/node_mapping.py b/api/core/workflow/nodes/node_mapping.py index 63cd28976..1f1be5954 100644 --- a/api/core/workflow/nodes/node_mapping.py +++ b/api/core/workflow/nodes/node_mapping.py @@ -13,7 +13,7 @@ from core.workflow.nodes.iteration import IterationNode, IterationStartNode from core.workflow.nodes.knowledge_retrieval import KnowledgeRetrievalNode from core.workflow.nodes.list_operator import ListOperatorNode from core.workflow.nodes.llm import LLMNode -from core.workflow.nodes.loop import LoopNode, LoopStartNode +from core.workflow.nodes.loop import LoopEndNode, LoopNode, LoopStartNode from core.workflow.nodes.parameter_extractor import ParameterExtractorNode from core.workflow.nodes.question_classifier import QuestionClassifierNode from 
core.workflow.nodes.start import StartNode @@ -94,6 +94,10 @@ NODE_TYPE_CLASSES_MAPPING: Mapping[NodeType, Mapping[str, type[BaseNode]]] = { LATEST_VERSION: LoopStartNode, "1": LoopStartNode, }, + NodeType.LOOP_END: { + LATEST_VERSION: LoopEndNode, + "1": LoopEndNode, + }, NodeType.PARAMETER_EXTRACTOR: { LATEST_VERSION: ParameterExtractorNode, "1": ParameterExtractorNode, diff --git a/api/core/workflow/nodes/template_transform/entities.py b/api/core/workflow/nodes/template_transform/entities.py index 96adff6ff..ecff438cf 100644 --- a/api/core/workflow/nodes/template_transform/entities.py +++ b/api/core/workflow/nodes/template_transform/entities.py @@ -4,7 +4,7 @@ from core.workflow.nodes.base import BaseNodeData class TemplateTransformNodeData(BaseNodeData): """ - Code Node Data. + Template Transform Node Data. """ variables: list[VariableSelector] diff --git a/api/core/workflow/nodes/variable_aggregator/entities.py b/api/core/workflow/nodes/variable_aggregator/entities.py index 71a930e6b..9e58f5e94 100644 --- a/api/core/workflow/nodes/variable_aggregator/entities.py +++ b/api/core/workflow/nodes/variable_aggregator/entities.py @@ -26,7 +26,7 @@ class AdvancedSettings(BaseModel): class VariableAssignerNodeData(BaseNodeData): """ - Knowledge retrieval Node Data. + Variable Assigner Node Data. 
""" type: str = "variable-assigner" diff --git a/api/core/workflow/nodes/variable_assigner/v2/node.py b/api/core/workflow/nodes/variable_assigner/v2/node.py index afa5656f4..0305eb7f4 100644 --- a/api/core/workflow/nodes/variable_assigner/v2/node.py +++ b/api/core/workflow/nodes/variable_assigner/v2/node.py @@ -2,6 +2,7 @@ import json from collections.abc import Sequence from typing import Any, cast +from core.app.entities.app_invoke_entities import InvokeFrom from core.variables import SegmentType, Variable from core.workflow.constants import CONVERSATION_VARIABLE_NODE_ID from core.workflow.entities.node_entities import NodeRunResult @@ -123,13 +124,14 @@ class VariableAssignerNode(BaseNode[VariableAssignerNodeData]): if variable.selector[0] == CONVERSATION_VARIABLE_NODE_ID: conversation_id = self.graph_runtime_state.variable_pool.get(["sys", "conversation_id"]) if not conversation_id: - raise ConversationIDNotFoundError + if self.invoke_from != InvokeFrom.DEBUGGER: + raise ConversationIDNotFoundError else: conversation_id = conversation_id.value - common_helpers.update_conversation_variable( - conversation_id=cast(str, conversation_id), - variable=variable, - ) + common_helpers.update_conversation_variable( + conversation_id=cast(str, conversation_id), + variable=variable, + ) return NodeRunResult( status=WorkflowNodeExecutionStatus.SUCCEEDED, diff --git a/api/core/workflow/utils/condition/processor.py b/api/core/workflow/utils/condition/processor.py index c61b3d186..979538778 100644 --- a/api/core/workflow/utils/condition/processor.py +++ b/api/core/workflow/utils/condition/processor.py @@ -375,11 +375,25 @@ def _process_sub_conditions( for condition in sub_conditions: key = FileAttribute(condition.key) values = [file_manager.get_attr(file=file, attr=key) for file in files] + expected_value = condition.value + if key == FileAttribute.EXTENSION: + if not isinstance(expected_value, str): + raise TypeError("Expected value must be a string when key is 
FileAttribute.EXTENSION") + if expected_value and not expected_value.startswith("."): + expected_value = "." + expected_value + + normalized_values = [] + for value in values: + if value and isinstance(value, str): + if not value.startswith("."): + value = "." + value + normalized_values.append(value) + values = normalized_values sub_group_results = [ _evaluate_condition( value=value, operator=condition.comparison_operator, - expected=condition.value, + expected=expected_value, ) for value in values ] diff --git a/api/core/workflow/utils/variable_template_parser.py b/api/core/workflow/utils/variable_template_parser.py index 1d8fb38eb..f86c54c50 100644 --- a/api/core/workflow/utils/variable_template_parser.py +++ b/api/core/workflow/utils/variable_template_parser.py @@ -95,7 +95,6 @@ class VariableTemplateParser: Args: inputs: A dictionary containing the values for the template variables. - remove_template_variables: A boolean indicating whether to remove the template variables from the output. Returns: The formatted string with template variables replaced by their values. diff --git a/api/core/workflow/workflow_entry.py b/api/core/workflow/workflow_entry.py index 5a7d5373c..50118a401 100644 --- a/api/core/workflow/workflow_entry.py +++ b/api/core/workflow/workflow_entry.py @@ -204,6 +204,8 @@ class WorkflowEntry: NOTE: only parameter_extractor/question_classifier are supported :param node_data: node data + :param node_id: node id + :param tenant_id: tenant id :param user_id: user id :param user_inputs: user inputs :return: diff --git a/api/factories/file_factory.py b/api/factories/file_factory.py index 8c989e6b5..b69621ba5 100644 --- a/api/factories/file_factory.py +++ b/api/factories/file_factory.py @@ -134,8 +134,9 @@ def _build_from_local_file( if row is None: raise ValueError("Invalid upload file") - file_type = FileType(mapping.get("type", "custom")) - file_type = _standardize_file_type(file_type, extension="." 
+ row.extension, mime_type=row.mime_type) + file_type = _standardize_file_type(extension="." + row.extension, mime_type=row.mime_type) + if file_type.value != mapping.get("type", "custom"): + raise ValueError("Detected file type does not match the specified type. Please verify the file.") return File( id=mapping.get("id"), @@ -173,10 +174,9 @@ def _build_from_remote_url( if upload_file is None: raise ValueError("Invalid upload file") - file_type = FileType(mapping.get("type", "custom")) - file_type = _standardize_file_type( - file_type, extension="." + upload_file.extension, mime_type=upload_file.mime_type - ) + file_type = _standardize_file_type(extension="." + upload_file.extension, mime_type=upload_file.mime_type) + if file_type.value != mapping.get("type", "custom"): + raise ValueError("Detected file type does not match the specified type. Please verify the file.") return File( id=mapping.get("id"), @@ -196,10 +196,11 @@ def _build_from_remote_url( raise ValueError("Invalid file url") mime_type, filename, file_size = _get_remote_file_info(url) - extension = mimetypes.guess_extension(mime_type) or "." + filename.split(".")[-1] if "." in filename else ".bin" + extension = mimetypes.guess_extension(mime_type) or ("." + filename.split(".")[-1] if "." in filename else ".bin") - file_type = FileType(mapping.get("type", "custom")) - file_type = _standardize_file_type(file_type, extension=extension, mime_type=mime_type) + file_type = _standardize_file_type(extension=extension, mime_type=mime_type) + if file_type.value != mapping.get("type", "custom"): + raise ValueError("Detected file type does not match the specified type. Please verify the file.") return File( id=mapping.get("id"), @@ -250,8 +251,8 @@ def _build_from_tool_file( raise ValueError(f"ToolFile {mapping.get('tool_file_id')} not found") extension = "." + tool_file.file_key.split(".")[-1] if "." 
in tool_file.file_key else ".bin" - file_type = FileType(mapping.get("type", "custom")) - file_type = _standardize_file_type(file_type, extension=extension, mime_type=tool_file.mimetype) + + file_type = _standardize_file_type(extension=extension, mime_type=tool_file.mimetype) return File( id=mapping.get("id"), @@ -302,12 +303,10 @@ def _is_file_valid_with_config( return True -def _standardize_file_type(file_type: FileType, /, *, extension: str = "", mime_type: str = "") -> FileType: +def _standardize_file_type(*, extension: str = "", mime_type: str = "") -> FileType: """ - If custom type, try to guess the file type by extension and mime_type. + Infer the possible actual type of the file based on the extension and mime_type """ - if file_type != FileType.CUSTOM: - return FileType(file_type) guessed_type = None if extension: guessed_type = _get_file_type_by_extension(extension) diff --git a/api/migrations/versions/2025_03_29_2227-6a9f914f656c_change_documentsegment_and_childchunk_.py b/api/migrations/versions/2025_03_29_2227-6a9f914f656c_change_documentsegment_and_childchunk_.py new file mode 100644 index 000000000..45904f0c8 --- /dev/null +++ b/api/migrations/versions/2025_03_29_2227-6a9f914f656c_change_documentsegment_and_childchunk_.py @@ -0,0 +1,43 @@ +"""change documentsegment and childchunk indexes + +Revision ID: 6a9f914f656c +Revises: d20049ed0af6 +Create Date: 2025-03-29 22:27:24.789481 + +""" +from alembic import op +import models as models +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = '6a9f914f656c' +down_revision = 'd20049ed0af6' +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table('child_chunks', schema=None) as batch_op: + batch_op.create_index('child_chunks_node_idx', ['index_node_id', 'dataset_id'], unique=False) + batch_op.create_index('child_chunks_segment_idx', ['segment_id'], unique=False) + + with op.batch_alter_table('document_segments', schema=None) as batch_op: + batch_op.drop_index('document_segment_dataset_node_idx') + batch_op.create_index('document_segment_node_dataset_idx', ['index_node_id', 'dataset_id'], unique=False) + + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table('document_segments', schema=None) as batch_op: + batch_op.drop_index('document_segment_node_dataset_idx') + batch_op.create_index('document_segment_dataset_node_idx', ['dataset_id', 'index_node_id'], unique=False) + + with op.batch_alter_table('child_chunks', schema=None) as batch_op: + batch_op.drop_index('child_chunks_segment_idx') + batch_op.drop_index('child_chunks_node_idx') + + # ### end Alembic commands ### diff --git a/api/models/dataset.py b/api/models/dataset.py index f104c32b5..d6708ac88 100644 --- a/api/models/dataset.py +++ b/api/models/dataset.py @@ -643,7 +643,7 @@ class DocumentSegment(db.Model): # type: ignore[name-defined] db.Index("document_segment_document_id_idx", "document_id"), db.Index("document_segment_tenant_dataset_idx", "dataset_id", "tenant_id"), db.Index("document_segment_tenant_document_idx", "document_id", "tenant_id"), - db.Index("document_segment_dataset_node_idx", "dataset_id", "index_node_id"), + db.Index("document_segment_node_dataset_idx", "index_node_id", "dataset_id"), db.Index("document_segment_tenant_idx", "tenant_id"), ) @@ -720,6 +720,23 @@ class DocumentSegment(db.Model): # type: ignore[name-defined] else: return [] + def get_child_chunks(self): + process_rule = self.document.dataset_process_rule + if process_rule.mode == "hierarchical": + rules = Rule(**process_rule.rules_dict) + if 
rules.parent_mode: + child_chunks = ( + db.session.query(ChildChunk) + .filter(ChildChunk.segment_id == self.id) + .order_by(ChildChunk.position.asc()) + .all() + ) + return child_chunks or [] + else: + return [] + else: + return [] + @property def sign_content(self): return self.get_sign_content() @@ -774,6 +791,8 @@ class ChildChunk(db.Model): # type: ignore[name-defined] __table_args__ = ( db.PrimaryKeyConstraint("id", name="child_chunk_pkey"), db.Index("child_chunk_dataset_id_idx", "tenant_id", "dataset_id", "document_id", "segment_id", "index_node_id"), + db.Index("child_chunks_node_idx", "index_node_id", "dataset_id"), + db.Index("child_chunks_segment_idx", "segment_id"), ) # initial fields diff --git a/api/models/model.py b/api/models/model.py index 8a85d51b1..e43592877 100644 --- a/api/models/model.py +++ b/api/models/model.py @@ -829,7 +829,7 @@ class Conversation(db.Model): # type: ignore[name-defined] WorkflowRunStatus.SUCCEEDED: 0, WorkflowRunStatus.FAILED: 0, WorkflowRunStatus.STOPPED: 0, - WorkflowRunStatus.PARTIAL_SUCCESSED: 0, + WorkflowRunStatus.PARTIAL_SUCCEEDED: 0, } for message in messages: @@ -840,7 +840,7 @@ class Conversation(db.Model): # type: ignore[name-defined] { "success": status_counts[WorkflowRunStatus.SUCCEEDED], "failed": status_counts[WorkflowRunStatus.FAILED], - "partial_success": status_counts[WorkflowRunStatus.PARTIAL_SUCCESSED], + "partial_success": status_counts[WorkflowRunStatus.PARTIAL_SUCCEEDED], } if messages else None diff --git a/api/models/workflow.py b/api/models/workflow.py index a599cb87d..0ee80b263 100644 --- a/api/models/workflow.py +++ b/api/models/workflow.py @@ -109,7 +109,7 @@ class Workflow(Base): tenant_id: Mapped[str] = mapped_column(StringUUID, nullable=False) app_id: Mapped[str] = mapped_column(StringUUID, nullable=False) type: Mapped[str] = mapped_column(db.String(255), nullable=False) - version: Mapped[str] + version: Mapped[str] = mapped_column(db.String(255), nullable=False) marked_name: Mapped[str] = 
mapped_column(default="", server_default="") marked_comment: Mapped[str] = mapped_column(default="", server_default="") graph: Mapped[str] = mapped_column(sa.Text) @@ -352,7 +352,7 @@ class WorkflowRunStatus(StrEnum): SUCCEEDED = "succeeded" FAILED = "failed" STOPPED = "stopped" - PARTIAL_SUCCESSED = "partial-succeeded" + PARTIAL_SUCCEEDED = "partial-succeeded" class WorkflowRun(Base): @@ -755,7 +755,8 @@ class WorkflowAppLog(Base): __tablename__ = "workflow_app_logs" __table_args__ = ( db.PrimaryKeyConstraint("id", name="workflow_app_log_pkey"), - db.Index("workflow_app_log_app_idx", "tenant_id", "app_id"), + db.Index("workflow_app_log_app_idx", "tenant_id", "app_id", "created_at"), + db.Index("workflow_app_log_workflow_run_idx", "workflow_run_id"), ) id: Mapped[str] = mapped_column(StringUUID, server_default=db.text("uuid_generate_v4()")) diff --git a/api/mypy.ini b/api/mypy.ini index 2c754f9fc..2898b9b52 100644 --- a/api/mypy.ini +++ b/api/mypy.ini @@ -3,8 +3,7 @@ warn_return_any = True warn_unused_configs = True check_untyped_defs = True exclude = (?x)( - core/tools/provider/builtin/ - | core/model_runtime/model_providers/ + core/model_runtime/model_providers/ | tests/ | migrations/ - ) \ No newline at end of file + ) diff --git a/api/poetry.lock b/api/poetry.lock index 68516ca1f..a91023707 100644 --- a/api/poetry.lock +++ b/api/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand. 
[[package]] name = "aiofiles" @@ -501,7 +501,7 @@ description = "Timeout context manager for asyncio programs" optional = false python-versions = ">=3.8" groups = ["main"] -markers = "python_full_version < \"3.11.3\"" +markers = "python_version == \"3.11\" and python_full_version < \"3.11.3\"" files = [ {file = "async_timeout-5.0.1-py3-none-any.whl", hash = "sha256:39e3809566ff85354557ec2398b55e096c8364bacac9405a7a1fa429e77fe76c"}, {file = "async_timeout-5.0.1.tar.gz", hash = "sha256:d9321a7a3d5a6a5e187e824d2fa0793ce379a202935782d555d6e9d2735677d3"}, @@ -1157,7 +1157,7 @@ files = [ {file = "cffi-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:d016c76bdd850f3c626af19b0542c9677ba156e4ee4fccfdd7848803533ef662"}, {file = "cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824"}, ] -markers = {storage = "platform_python_implementation != \"PyPy\"", vdb = "python_version < \"3.12\" or platform_python_implementation != \"PyPy\""} +markers = {storage = "platform_python_implementation != \"PyPy\"", vdb = "platform_python_implementation != \"PyPy\""} [package.dependencies] pycparser = "*" @@ -1728,6 +1728,123 @@ files = [ [package.extras] toml = ["tomli ; python_full_version <= \"3.11.0a6\""] +[[package]] +name = "crc32c" +version = "2.7.1" +description = "A python package implementing the crc32c algorithm in hardware and software" +optional = false +python-versions = ">=3.7" +groups = ["vdb"] +files = [ + {file = "crc32c-2.7.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:1fd1f9c6b50d7357736676278a1b8c8986737b8a1c76d7eab4baa71d0b6af67f"}, + {file = "crc32c-2.7.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:805c2be1bc0e251c48439a62b0422385899c15289483692bc70e78473c1039f1"}, + {file = "crc32c-2.7.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f4333e62b7844dfde112dbb8489fd2970358eddc3310db21e943a9f6994df749"}, + {file = "crc32c-2.7.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:6f0fadc741e79dc705e2d9ee967473e8a061d26b04310ed739f1ee292f33674f"}, + {file = "crc32c-2.7.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91ced31055d26d59385d708bbd36689e1a1d604d4b0ceb26767eb5a83156f85d"}, + {file = "crc32c-2.7.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36ffa999b72e3c17f6a066ae9e970b40f8c65f38716e436c39a33b809bc6ed9f"}, + {file = "crc32c-2.7.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:e80114dd7f462297e54d5da1b9ff472e5249c5a2b406aa51c371bb0edcbf76bd"}, + {file = "crc32c-2.7.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:676f5b46da268b5190f9fb91b3f037a00d114b411313664438525db876adc71f"}, + {file = "crc32c-2.7.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8d0e660c9ed269e90692993a4457a932fc22c9cc96caf79dd1f1a84da85bb312"}, + {file = "crc32c-2.7.1-cp310-cp310-win32.whl", hash = "sha256:17a2c3f8c6d85b04b5511af827b5dbbda4e672d188c0b9f20a8156e93a1aa7b6"}, + {file = "crc32c-2.7.1-cp310-cp310-win_amd64.whl", hash = "sha256:3208764c29688f91a35392073229975dd7687b6cb9f76b919dae442cabcd5126"}, + {file = "crc32c-2.7.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:19e03a50545a3ef400bd41667d5525f71030488629c57d819e2dd45064f16192"}, + {file = "crc32c-2.7.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:8c03286b1e5ce9bed7090084f206aacd87c5146b4b10de56fe9e86cbbbf851cf"}, + {file = "crc32c-2.7.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:80ebbf144a1a56a532b353e81fa0f3edca4f4baa1bf92b1dde2c663a32bb6a15"}, + {file = "crc32c-2.7.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96b794fd11945298fdd5eb1290a812efb497c14bc42592c5c992ca077458eeba"}, + {file = "crc32c-2.7.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9df7194dd3c0efb5a21f5d70595b7a8b4fd9921fbbd597d6d8e7a11eca3e2d27"}, + {file = 
"crc32c-2.7.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d698eec444b18e296a104d0b9bb6c596c38bdcb79d24eba49604636e9d747305"}, + {file = "crc32c-2.7.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:e07cf10ef852d219d179333fd706d1c415626f1f05e60bd75acf0143a4d8b225"}, + {file = "crc32c-2.7.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:d2a051f296e6e92e13efee3b41db388931cdb4a2800656cd1ed1d9fe4f13a086"}, + {file = "crc32c-2.7.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a1738259802978cdf428f74156175da6a5fdfb7256f647fdc0c9de1bc6cd7173"}, + {file = "crc32c-2.7.1-cp311-cp311-win32.whl", hash = "sha256:f7786d219a1a1bf27d0aa1869821d11a6f8e90415cfffc1e37791690d4a848a1"}, + {file = "crc32c-2.7.1-cp311-cp311-win_amd64.whl", hash = "sha256:887f6844bb3ad35f0778cd10793ad217f7123a5422e40041231b8c4c7329649d"}, + {file = "crc32c-2.7.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:f7d1c4e761fe42bf856130daf8b2658df33fe0ced3c43dadafdfeaa42b57b950"}, + {file = "crc32c-2.7.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:73361c79a6e4605204457f19fda18b042a94508a52e53d10a4239da5fb0f6a34"}, + {file = "crc32c-2.7.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:afd778fc8ac0ed2ffbfb122a9aa6a0e409a8019b894a1799cda12c01534493e0"}, + {file = "crc32c-2.7.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:56ef661b34e9f25991fface7f9ad85e81bbc1b3fe3b916fd58c893eabe2fa0b8"}, + {file = "crc32c-2.7.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:571aa4429444b5d7f588e4377663592145d2d25eb1635abb530f1281794fc7c9"}, + {file = "crc32c-2.7.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c02a3bd67dea95cdb25844aaf44ca2e1b0c1fd70b287ad08c874a95ef4bb38db"}, + {file = "crc32c-2.7.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = 
"sha256:99d17637c4867672cb8adeea007294e3c3df9d43964369516cfe2c1f47ce500a"}, + {file = "crc32c-2.7.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:f4a400ac3c69a32e180d8753fd7ec7bccb80ade7ab0812855dce8a208e72495f"}, + {file = "crc32c-2.7.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:588587772e55624dd9c7a906ec9e8773ae0b6ac5e270fc0bc84ee2758eba90d5"}, + {file = "crc32c-2.7.1-cp312-cp312-win32.whl", hash = "sha256:9f14b60e5a14206e8173dd617fa0c4df35e098a305594082f930dae5488da428"}, + {file = "crc32c-2.7.1-cp312-cp312-win_amd64.whl", hash = "sha256:7c810a246660a24dc818047dc5f89c7ce7b2814e1e08a8e99993f4103f7219e8"}, + {file = "crc32c-2.7.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:24949bffb06fc411cc18188d33357923cb935273642164d0bb37a5f375654169"}, + {file = "crc32c-2.7.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:2d5d326e7e118d4fa60187770d86b66af2fdfc63ce9eeb265f0d3e7d49bebe0b"}, + {file = "crc32c-2.7.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ba110df60c64c8e2d77a9425b982a520ccdb7abe42f06604f4d98a45bb1fff62"}, + {file = "crc32c-2.7.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c277f9d16a3283e064d54854af0976b72abaa89824955579b2b3f37444f89aae"}, + {file = "crc32c-2.7.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:881af0478a01331244e27197356929edbdeaef6a9f81b5c6bacfea18d2139289"}, + {file = "crc32c-2.7.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:724d5ff4d29ff093a983ae656be3307093706d850ea2a233bf29fcacc335d945"}, + {file = "crc32c-2.7.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:b2416c4d88696ac322632555c0f81ab35e15f154bc96055da6cf110d642dbc10"}, + {file = "crc32c-2.7.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:60254251b88ec9b9795215f0f9ec015a6b5eef8b2c5fba1267c672d83c78fc02"}, + {file = "crc32c-2.7.1-cp313-cp313-musllinux_1_2_x86_64.whl", 
hash = "sha256:edefc0e46f3c37372183f70338e5bdee42f6789b62fcd36ec53aa933e9dfbeaf"}, + {file = "crc32c-2.7.1-cp313-cp313-win32.whl", hash = "sha256:813af8111218970fe2adb833c5e5239f091b9c9e76f03b4dd91aaba86e99b499"}, + {file = "crc32c-2.7.1-cp313-cp313-win_amd64.whl", hash = "sha256:7d9ede7be8e4ec1c9e90aaf6884decbeef10e3473e6ddac032706d710cab5888"}, + {file = "crc32c-2.7.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:db9ac92294284b22521356715784b91cc9094eee42a5282ab281b872510d1831"}, + {file = "crc32c-2.7.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:8fcd7f2f29a30dc92af64a9ee3d38bde0c82bd20ad939999427aac94bbd87373"}, + {file = "crc32c-2.7.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5c056ef043393085523e149276a7ce0cb534b872e04f3e20d74d9a94a75c0ad7"}, + {file = "crc32c-2.7.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03a92551a343702629af91f78d205801219692b6909f8fa126b830e332bfb0e0"}, + {file = "crc32c-2.7.1-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fb9424ec1a8ca54763155a703e763bcede82e6569fe94762614bb2de1412d4e1"}, + {file = "crc32c-2.7.1-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88732070f6175530db04e0bb36880ac45c33d49f8ac43fa0e50cfb1830049d23"}, + {file = "crc32c-2.7.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:57a20dfc27995f568f64775eea2bbb58ae269f1a1144561df5e4a4955f79db32"}, + {file = "crc32c-2.7.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:f7186d098bfd2cff25eac6880b7c7ad80431b90610036131c1c7dd0eab42a332"}, + {file = "crc32c-2.7.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:55a77e29a265418fa34bef15bd0f2c60afae5348988aaf35ed163b4bbf93cf37"}, + {file = "crc32c-2.7.1-cp313-cp313t-win32.whl", hash = "sha256:ae38a4b6aa361595d81cab441405fbee905c72273e80a1c010fb878ae77ac769"}, + {file = "crc32c-2.7.1-cp313-cp313t-win_amd64.whl", hash = 
"sha256:eee2a43b663feb6c79a6c1c6e5eae339c2b72cfac31ee54ec0209fa736cf7ee5"}, + {file = "crc32c-2.7.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:04a56e9f4995559fa86bcf5d0ed5c48505a36e2be1c41d70cae5c080d9a00b74"}, + {file = "crc32c-2.7.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88c5c9c21cd9fff593bb7dfe97d3287438c8aecbcc73d227f2366860a0663521"}, + {file = "crc32c-2.7.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:595146cb94ba0055301d273113add2af5859b467db41b50367f47870c2d0a81c"}, + {file = "crc32c-2.7.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b9f3792872f1320961f33aaf0198edea371aee393bcc221fab66d10ecffd77d"}, + {file = "crc32c-2.7.1-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:999a40d75cd1696e779f6f99c29fa52be777197d1d9e3ae69cb919a05a369c1e"}, + {file = "crc32c-2.7.1-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:eff485526172cee7e6d1fa9c23913f92c7d38ab05674b0b578767c7b693faf5d"}, + {file = "crc32c-2.7.1-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:541dac90c64ed9ce05f85a71066567e854c1b40743a01d83fa2c66419a2e97b6"}, + {file = "crc32c-2.7.1-cp37-cp37m-win32.whl", hash = "sha256:7138ec26e79100c4cf4294ef40027a1cff26a1e23b7e5eb70efe5d7ff37cbc66"}, + {file = "crc32c-2.7.1-cp37-cp37m-win_amd64.whl", hash = "sha256:35a3ed12ac2e2551a07d246b7e6512ac39db021e006205a40c1cfd32ea73fcc3"}, + {file = "crc32c-2.7.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:af062f11aea283b7e9c95f3a97fb6bb96ac08a9063f71621c2140237df141ada"}, + {file = "crc32c-2.7.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8f25ca521ecf7cccfff0ecae4d0538b5c0c7235d27bf098241f3e2bf86aed713"}, + {file = "crc32c-2.7.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:1410bcd909be36ccbf8a52c45e4bddca77adfd4e80789ac3cd575c024086516d"}, + {file = 
"crc32c-2.7.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33fc8cb32f82685ebefd078e740925ea9da37a008ed5f43b68fc8324f8ca4a37"}, + {file = "crc32c-2.7.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ad3dc6283ce53ad7d1dc5775003460110ab7eebf348efebe0486a531b28f8184"}, + {file = "crc32c-2.7.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:758ead20e122496764ae50db26bb90fb47fc4b6d242c8e99e87c3f1dae1f1dce"}, + {file = "crc32c-2.7.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:e436d9044bbd51936f7aeb8b322543c516bf22371a17970a370a10af1661fa54"}, + {file = "crc32c-2.7.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:47e5be99057264b603e3cd88cf091985f33c16d3c8609f1c83ed6e72ec4179b4"}, + {file = "crc32c-2.7.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:280509210e622a236f16f031856847fd0d6704df662d7209da819ccfb40c6167"}, + {file = "crc32c-2.7.1-cp38-cp38-win32.whl", hash = "sha256:4ab48e048cfa123a9f9bdc5d4d687a3461723132c749c721a6d358605e6d470d"}, + {file = "crc32c-2.7.1-cp38-cp38-win_amd64.whl", hash = "sha256:65471d1b1b6e10a404ca8200a4271d5bc0a552c3f5dcd943c1c7835f766ea02d"}, + {file = "crc32c-2.7.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:39ca842586084bca24f9c4ab43e2d99191b1186b2f89b2122b470d0730254d1b"}, + {file = "crc32c-2.7.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a911abc33d453b3f171a3200b1e18b3fc39c204670b5b0a353cca99e4c664332"}, + {file = "crc32c-2.7.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:22a72e81ec08a7ece6a35ac68d1ed32dd4a8be7949b164db88d4b4a4bade5c5a"}, + {file = "crc32c-2.7.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:54d6f8c5be6815eabd6e3e90fa0bc13045183a6aa33a30dd684eb0f062b92213"}, + {file = "crc32c-2.7.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:9c855726d71dee7ae25f81c6b54293455fc66802f34d334d22bea1f6ce8bc21c"}, + {file = "crc32c-2.7.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:98d5f7fc364bb9c4c4123d149406fbee063f2e8c2cff19a12f13e50faa146237"}, + {file = "crc32c-2.7.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:51ffba582c95a281e5a3f71eacdafc96b9a1835ddae245385639458fff197034"}, + {file = "crc32c-2.7.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:3950d3c340c9d70889630ef81fba8666abfd0cf0aa19fd9c3a55634e0b383b0f"}, + {file = "crc32c-2.7.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:522fba1770aad8f7eb189f21fca591a51d96dcc749859088f462281324aec30b"}, + {file = "crc32c-2.7.1-cp39-cp39-win32.whl", hash = "sha256:812723e222b6a9fe0562554d72f4f072c3a95720c60ee500984e7d0e568caac3"}, + {file = "crc32c-2.7.1-cp39-cp39-win_amd64.whl", hash = "sha256:6793fcfe9d4130230d196abbe4021c01ffe8e85c92633bf3c8559f9836c227f5"}, + {file = "crc32c-2.7.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:2e83fedebcdeb80c19e76b7a0e5103528bb062521c40702bf34516a429e81df3"}, + {file = "crc32c-2.7.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:30004a7383538ef93bda9b22f7b3805bc0aa5625ab2675690e1b676b19417d4b"}, + {file = "crc32c-2.7.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a01b0983aa87f517c12418f9898ecf2083bf86f4ea04122e053357c3edb0d73f"}, + {file = "crc32c-2.7.1-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb2b963c42128b38872e9ed63f04a73ce1ff89a1dfad7ea38add6fe6296497b8"}, + {file = "crc32c-2.7.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:cdd5e576fee5d255c1e68a4dae4420f21e57e6f05900b38d5ae47c713fc3330d"}, + {file = "crc32c-2.7.1-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = 
"sha256:79f0ff50863aeb441fbfa87e9db6542ddfe3e941189dece832b0af2e454dbab0"}, + {file = "crc32c-2.7.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0cd27a1e400d77e9872fa1303e8f9d30bd050df35ee4858354ce0b59f8227d32"}, + {file = "crc32c-2.7.1-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:274739b3e1591bd4b7ec98764f2f79c6fbcc0f7d7676d5f17369832fe14ee4f0"}, + {file = "crc32c-2.7.1-pp37-pypy37_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:050f52045b4a033a245e0ee4357e1a793de5af6496c82250ef13d8cb90a21e20"}, + {file = "crc32c-2.7.1-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:ceb4ca126f75694bda020a307221563d3c522719c0acedcc81ffb985b4867c94"}, + {file = "crc32c-2.7.1-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:eabefe7a6fb5dfc6318fb35f4d98893baef17ebda9b311498e870526d32168e7"}, + {file = "crc32c-2.7.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:217edd9ba8c5f0c3ad60c82a11fa78f01162fa106fd7f5d17175dac6bf1eedf9"}, + {file = "crc32c-2.7.1-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:15d640d9d4aa213aec6c837f602081a17d1522f8cd78b52334b62ee27b083410"}, + {file = "crc32c-2.7.1-pp38-pypy38_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:519878822bf9bdead63c25a5e4bdc26d2eae9da6056f92b9b5f3023c08f1d016"}, + {file = "crc32c-2.7.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:2bf69cfa4c3ea9f060fe06db00b7e34f771c83f73dd2c3568c2c9019479e34c2"}, + {file = "crc32c-2.7.1-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:e89d51c90f6730b67b12c97d49099ba18d0fdce18541fab94d2be95d1c939adb"}, + {file = "crc32c-2.7.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:488a0feba1bb005d0dd2f702c1da4849d083e88d82cd27b83ac2d2d93af80755"}, + {file = "crc32c-2.7.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:919262b7a12ef63f222ec19c0e092f39268802652e11669315257ae6249ec79f"}, + {file = "crc32c-2.7.1-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4181240f6080c38eec9dd1539cd23a304a12100d3f4ffe43234f32064fae5ef0"}, + {file = "crc32c-2.7.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:fedde1e53507d0ede1980e8109442edd108c04ab100abcd5145c274820dacd4f"}, + {file = "crc32c-2.7.1.tar.gz", hash = "sha256:f91b144a21eef834d64178e01982bb9179c354b3e9e5f4c803b0e5096384968c"}, +] + [[package]] name = "crcmod" version = "1.7" @@ -1980,6 +2097,19 @@ files = [ [package.extras] dev = ["coverage", "pytest (>=7.4.4)"] +[[package]] +name = "enum34" +version = "1.1.10" +description = "Python 3.4 Enum backported to 3.3, 3.2, 3.1, 2.7, 2.6, 2.5, and 2.4" +optional = false +python-versions = "*" +groups = ["vdb"] +files = [ + {file = "enum34-1.1.10-py2-none-any.whl", hash = "sha256:a98a201d6de3f2ab3db284e70a33b0f896fbf35f8086594e8c9e74b909058d53"}, + {file = "enum34-1.1.10-py3-none-any.whl", hash = "sha256:c3858660960c984d6ab0ebad691265180da2b43f07e061c0f8dca9ef3cffd328"}, + {file = "enum34-1.1.10.tar.gz", hash = "sha256:cce6a7477ed816bd2542d03d53db9f0db935dd013b70f336a95c73979289f248"}, +] + [[package]] name = "esdk-obs-python" version = "3.24.6.1" @@ -2775,6 +2905,7 @@ files = [ {file = "google_crc32c-1.7.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:6a40522958040051c755a173eb98c05ad4d64a6dd898888c3e5ccca2d1cbdcdc"}, {file = "google_crc32c-1.7.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f714fe5cdf5007d7064c57cf7471a99e0cbafda24ddfa829117fc3baafa424f7"}, {file = "google_crc32c-1.7.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:f04e58dbe1bf0c9398e603a9be5aaa09e0ba7eb022a3293195d8749459a01069"}, + {file = "google_crc32c-1.7.0-cp313-cp313-win_amd64.whl", hash = "sha256:e545b51ddf97f604d30114f7c23eecaf4c06cd6c023ff1ae0b80dcd99af32833"}, {file = "google_crc32c-1.7.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:364067b063664dd8d1fec75a3fe85edf05c46f688365269beccaf42ef5dfe889"}, {file = "google_crc32c-1.7.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1b0d6044799f6ac51d1cc2decb997280a83c448b3bef517a54b57a3b71921c0"}, {file = "google_crc32c-1.7.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:02bc3295d26cd7666521fd6d5b7b93923ae1eb4417ddd3bc57185a5881ad7b96"}, @@ -3583,7 +3714,7 @@ version = "0.42.1" description = "Chinese Words Segmentation Utilities" optional = false python-versions = "*" -groups = ["main"] +groups = ["main", "vdb"] files = [ {file = "jieba-0.42.1.tar.gz", hash = "sha256:055ca12f62674fafed09427f176506079bc135638a14e23e25be909131928db2"}, ] @@ -4232,6 +4363,21 @@ html5 = ["html5lib"] htmlsoup = ["BeautifulSoup4"] source = ["Cython (>=3.0.11,<3.1.0)"] +[[package]] +name = "lxml-stubs" +version = "0.5.1" +description = "Type annotations for the lxml package" +optional = false +python-versions = "*" +groups = ["dev"] +files = [ + {file = "lxml-stubs-0.5.1.tar.gz", hash = "sha256:e0ec2aa1ce92d91278b719091ce4515c12adc1d564359dfaf81efa7d4feab79d"}, + {file = "lxml_stubs-0.5.1-py3-none-any.whl", hash = "sha256:1f689e5dbc4b9247cb09ae820c7d34daeb1fdbd1db06123814b856dae7787272"}, +] + +[package.extras] +test = ["coverage[toml] (>=7.2.5)", "mypy (>=1.2.0)", "pytest (>=7.3.0)", "pytest-mypy-plugins (>=1.10.1)"] + [[package]] name = "lz4" version = "4.4.3" @@ -4771,49 +4917,49 @@ files = [ [[package]] name = "mypy" -version = "1.13.0" +version = "1.15.0" description = "Optional static typing for Python" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" groups = ["dev"] files = [ - {file = 
"mypy-1.13.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6607e0f1dd1fb7f0aca14d936d13fd19eba5e17e1cd2a14f808fa5f8f6d8f60a"}, - {file = "mypy-1.13.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8a21be69bd26fa81b1f80a61ee7ab05b076c674d9b18fb56239d72e21d9f4c80"}, - {file = "mypy-1.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7b2353a44d2179846a096e25691d54d59904559f4232519d420d64da6828a3a7"}, - {file = "mypy-1.13.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0730d1c6a2739d4511dc4253f8274cdd140c55c32dfb0a4cf8b7a43f40abfa6f"}, - {file = "mypy-1.13.0-cp310-cp310-win_amd64.whl", hash = "sha256:c5fc54dbb712ff5e5a0fca797e6e0aa25726c7e72c6a5850cfd2adbc1eb0a372"}, - {file = "mypy-1.13.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:581665e6f3a8a9078f28d5502f4c334c0c8d802ef55ea0e7276a6e409bc0d82d"}, - {file = "mypy-1.13.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3ddb5b9bf82e05cc9a627e84707b528e5c7caaa1c55c69e175abb15a761cec2d"}, - {file = "mypy-1.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:20c7ee0bc0d5a9595c46f38beb04201f2620065a93755704e141fcac9f59db2b"}, - {file = "mypy-1.13.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3790ded76f0b34bc9c8ba4def8f919dd6a46db0f5a6610fb994fe8efdd447f73"}, - {file = "mypy-1.13.0-cp311-cp311-win_amd64.whl", hash = "sha256:51f869f4b6b538229c1d1bcc1dd7d119817206e2bc54e8e374b3dfa202defcca"}, - {file = "mypy-1.13.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:5c7051a3461ae84dfb5dd15eff5094640c61c5f22257c8b766794e6dd85e72d5"}, - {file = "mypy-1.13.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:39bb21c69a5d6342f4ce526e4584bc5c197fd20a60d14a8624d8743fffb9472e"}, - {file = "mypy-1.13.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:164f28cb9d6367439031f4c81e84d3ccaa1e19232d9d05d37cb0bd880d3f93c2"}, - {file = 
"mypy-1.13.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a4c1bfcdbce96ff5d96fc9b08e3831acb30dc44ab02671eca5953eadad07d6d0"}, - {file = "mypy-1.13.0-cp312-cp312-win_amd64.whl", hash = "sha256:a0affb3a79a256b4183ba09811e3577c5163ed06685e4d4b46429a271ba174d2"}, - {file = "mypy-1.13.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a7b44178c9760ce1a43f544e595d35ed61ac2c3de306599fa59b38a6048e1aa7"}, - {file = "mypy-1.13.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5d5092efb8516d08440e36626f0153b5006d4088c1d663d88bf79625af3d1d62"}, - {file = "mypy-1.13.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:de2904956dac40ced10931ac967ae63c5089bd498542194b436eb097a9f77bc8"}, - {file = "mypy-1.13.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:7bfd8836970d33c2105562650656b6846149374dc8ed77d98424b40b09340ba7"}, - {file = "mypy-1.13.0-cp313-cp313-win_amd64.whl", hash = "sha256:9f73dba9ec77acb86457a8fc04b5239822df0c14a082564737833d2963677dbc"}, - {file = "mypy-1.13.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:100fac22ce82925f676a734af0db922ecfea991e1d7ec0ceb1e115ebe501301a"}, - {file = "mypy-1.13.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7bcb0bb7f42a978bb323a7c88f1081d1b5dee77ca86f4100735a6f541299d8fb"}, - {file = "mypy-1.13.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bde31fc887c213e223bbfc34328070996061b0833b0a4cfec53745ed61f3519b"}, - {file = "mypy-1.13.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:07de989f89786f62b937851295ed62e51774722e5444a27cecca993fc3f9cd74"}, - {file = "mypy-1.13.0-cp38-cp38-win_amd64.whl", hash = "sha256:4bde84334fbe19bad704b3f5b78c4abd35ff1026f8ba72b29de70dda0916beb6"}, - {file = "mypy-1.13.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0246bcb1b5de7f08f2826451abd947bf656945209b140d16ed317f65a17dc7dc"}, - {file = "mypy-1.13.0-cp39-cp39-macosx_11_0_arm64.whl", hash = 
"sha256:7f5b7deae912cf8b77e990b9280f170381fdfbddf61b4ef80927edd813163732"}, - {file = "mypy-1.13.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7029881ec6ffb8bc233a4fa364736789582c738217b133f1b55967115288a2bc"}, - {file = "mypy-1.13.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:3e38b980e5681f28f033f3be86b099a247b13c491f14bb8b1e1e134d23bb599d"}, - {file = "mypy-1.13.0-cp39-cp39-win_amd64.whl", hash = "sha256:a6789be98a2017c912ae6ccb77ea553bbaf13d27605d2ca20a76dfbced631b24"}, - {file = "mypy-1.13.0-py3-none-any.whl", hash = "sha256:9c250883f9fd81d212e0952c92dbfcc96fc237f4b7c92f56ac81fd48460b3e5a"}, - {file = "mypy-1.13.0.tar.gz", hash = "sha256:0291a61b6fbf3e6673e3405cfcc0e7650bebc7939659fdca2702958038bd835e"}, + {file = "mypy-1.15.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:979e4e1a006511dacf628e36fadfecbcc0160a8af6ca7dad2f5025529e082c13"}, + {file = "mypy-1.15.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c4bb0e1bd29f7d34efcccd71cf733580191e9a264a2202b0239da95984c5b559"}, + {file = "mypy-1.15.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:be68172e9fd9ad8fb876c6389f16d1c1b5f100ffa779f77b1fb2176fcc9ab95b"}, + {file = "mypy-1.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c7be1e46525adfa0d97681432ee9fcd61a3964c2446795714699a998d193f1a3"}, + {file = "mypy-1.15.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:2e2c2e6d3593f6451b18588848e66260ff62ccca522dd231cd4dd59b0160668b"}, + {file = "mypy-1.15.0-cp310-cp310-win_amd64.whl", hash = "sha256:6983aae8b2f653e098edb77f893f7b6aca69f6cffb19b2cc7443f23cce5f4828"}, + {file = "mypy-1.15.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2922d42e16d6de288022e5ca321cd0618b238cfc5570e0263e5ba0a77dbef56f"}, + {file = "mypy-1.15.0-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:2ee2d57e01a7c35de00f4634ba1bbf015185b219e4dc5909e281016df43f5ee5"}, + {file = "mypy-1.15.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:973500e0774b85d9689715feeffcc980193086551110fd678ebe1f4342fb7c5e"}, + {file = "mypy-1.15.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5a95fb17c13e29d2d5195869262f8125dfdb5c134dc8d9a9d0aecf7525b10c2c"}, + {file = "mypy-1.15.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1905f494bfd7d85a23a88c5d97840888a7bd516545fc5aaedff0267e0bb54e2f"}, + {file = "mypy-1.15.0-cp311-cp311-win_amd64.whl", hash = "sha256:c9817fa23833ff189db061e6d2eff49b2f3b6ed9856b4a0a73046e41932d744f"}, + {file = "mypy-1.15.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:aea39e0583d05124836ea645f412e88a5c7d0fd77a6d694b60d9b6b2d9f184fd"}, + {file = "mypy-1.15.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2f2147ab812b75e5b5499b01ade1f4a81489a147c01585cda36019102538615f"}, + {file = "mypy-1.15.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ce436f4c6d218a070048ed6a44c0bbb10cd2cc5e272b29e7845f6a2f57ee4464"}, + {file = "mypy-1.15.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8023ff13985661b50a5928fc7a5ca15f3d1affb41e5f0a9952cb68ef090b31ee"}, + {file = "mypy-1.15.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1124a18bc11a6a62887e3e137f37f53fbae476dc36c185d549d4f837a2a6a14e"}, + {file = "mypy-1.15.0-cp312-cp312-win_amd64.whl", hash = "sha256:171a9ca9a40cd1843abeca0e405bc1940cd9b305eaeea2dda769ba096932bb22"}, + {file = "mypy-1.15.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:93faf3fdb04768d44bf28693293f3904bbb555d076b781ad2530214ee53e3445"}, + {file = "mypy-1.15.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:811aeccadfb730024c5d3e326b2fbe9249bb7413553f15499a4050f7c30e801d"}, + {file = 
"mypy-1.15.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:98b7b9b9aedb65fe628c62a6dc57f6d5088ef2dfca37903a7d9ee374d03acca5"}, + {file = "mypy-1.15.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c43a7682e24b4f576d93072216bf56eeff70d9140241f9edec0c104d0c515036"}, + {file = "mypy-1.15.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:baefc32840a9f00babd83251560e0ae1573e2f9d1b067719479bfb0e987c6357"}, + {file = "mypy-1.15.0-cp313-cp313-win_amd64.whl", hash = "sha256:b9378e2c00146c44793c98b8d5a61039a048e31f429fb0eb546d93f4b000bedf"}, + {file = "mypy-1.15.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e601a7fa172c2131bff456bb3ee08a88360760d0d2f8cbd7a75a65497e2df078"}, + {file = "mypy-1.15.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:712e962a6357634fef20412699a3655c610110e01cdaa6180acec7fc9f8513ba"}, + {file = "mypy-1.15.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f95579473af29ab73a10bada2f9722856792a36ec5af5399b653aa28360290a5"}, + {file = "mypy-1.15.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8f8722560a14cde92fdb1e31597760dc35f9f5524cce17836c0d22841830fd5b"}, + {file = "mypy-1.15.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1fbb8da62dc352133d7d7ca90ed2fb0e9d42bb1a32724c287d3c76c58cbaa9c2"}, + {file = "mypy-1.15.0-cp39-cp39-win_amd64.whl", hash = "sha256:d10d994b41fb3497719bbf866f227b3489048ea4bbbb5015357db306249f7980"}, + {file = "mypy-1.15.0-py3-none-any.whl", hash = "sha256:5469affef548bd1895d86d3bf10ce2b44e33d86923c29e4d675b3e323437ea3e"}, + {file = "mypy-1.15.0.tar.gz", hash = "sha256:404534629d51d3efea5c800ee7c42b72a6554d6c400e6a79eafe15d11341fd43"}, ] [package.dependencies] -mypy-extensions = ">=1.0.0" -typing-extensions = ">=4.6.0" +mypy_extensions = ">=1.0.0" +typing_extensions = ">=4.6.0" [package.extras] dmypy = 
["psutil (>=4.0)"] @@ -4962,7 +5108,7 @@ version = "1.26.4" description = "Fundamental package for array computing in Python" optional = false python-versions = ">=3.9" -groups = ["main", "indirect", "vdb"] +groups = ["main", "dev", "indirect", "vdb"] files = [ {file = "numpy-1.26.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0"}, {file = "numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a"}, @@ -5690,6 +5836,21 @@ files = [ numpy = ">=1.23.5" types-pytz = ">=2022.1.1" +[[package]] +name = "pandoc" +version = "2.4" +description = "Pandoc Documents for Python" +optional = false +python-versions = "*" +groups = ["main"] +files = [ + {file = "pandoc-2.4.tar.gz", hash = "sha256:ecd1f8cbb7f4180c6b5db4a17a7c1a74df519995f5f186ef81ce72a9cbd0dd9a"}, +] + +[package.dependencies] +plumbum = "*" +ply = "*" + [[package]] name = "pgvecto-rs" version = "0.2.2" @@ -5832,13 +5993,34 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "plumbum" +version = "1.9.0" +description = "Plumbum: shell combinators library" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "plumbum-1.9.0-py3-none-any.whl", hash = "sha256:9fd0d3b0e8d86e4b581af36edf3f3bbe9d1ae15b45b8caab28de1bcb27aaa7f5"}, + {file = "plumbum-1.9.0.tar.gz", hash = "sha256:e640062b72642c3873bd5bdc3effed75ba4d3c70ef6b6a7b907357a84d909219"}, +] + +[package.dependencies] +pywin32 = {version = "*", markers = "platform_system == \"Windows\" and platform_python_implementation != \"PyPy\""} + +[package.extras] +dev = ["coverage[toml]", "paramiko", "psutil", "pytest (>=6.0)", "pytest-cov", "pytest-mock", "pytest-timeout"] +docs = ["sphinx (>=4.0.0)", "sphinx-rtd-theme (>=1.0.0)"] +ssh = ["paramiko"] +test = ["coverage[toml]", "paramiko", "psutil", "pytest (>=6.0)", "pytest-cov", "pytest-mock", 
"pytest-timeout"] + [[package]] name = "ply" version = "3.11" description = "Python Lex & Yacc" optional = false python-versions = "*" -groups = ["lint"] +groups = ["main", "lint"] files = [ {file = "ply-3.11-py2.py3-none-any.whl", hash = "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce"}, {file = "ply-3.11.tar.gz", hash = "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3"}, @@ -6243,7 +6425,7 @@ files = [ {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"}, {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"}, ] -markers = {storage = "platform_python_implementation != \"PyPy\"", vdb = "python_version < \"3.12\" or platform_python_implementation != \"PyPy\""} +markers = {storage = "platform_python_implementation != \"PyPy\"", vdb = "platform_python_implementation != \"PyPy\""} [[package]] name = "pycryptodome" @@ -7012,8 +7194,7 @@ version = "310" description = "Python for Window Extensions" optional = false python-versions = "*" -groups = ["vdb"] -markers = "platform_system == \"Windows\"" +groups = ["main", "vdb"] files = [ {file = "pywin32-310-cp310-cp310-win32.whl", hash = "sha256:6dd97011efc8bf51d6793a82292419eba2c71cf8e7250cfac03bba284454abc1"}, {file = "pywin32-310-cp310-cp310-win_amd64.whl", hash = "sha256:c3e78706e4229b915a0821941a84e7ef420bf2b77e08c9dae3c76fd03fd2ae3d"}, @@ -7032,6 +7213,7 @@ files = [ {file = "pywin32-310-cp39-cp39-win32.whl", hash = "sha256:851c8d927af0d879221e616ae1f66145253537bbdd321a77e8ef701b443a9a1a"}, {file = "pywin32-310-cp39-cp39-win_amd64.whl", hash = "sha256:96867217335559ac619f00ad70e513c0fcf84b8a3af9fc2bba3b59b97da70475"}, ] +markers = {main = "platform_python_implementation != \"PyPy\" and platform_system == \"Windows\"", vdb = "platform_system == \"Windows\""} [[package]] name = "pyxlsb" @@ -8223,6 +8405,28 @@ mpmath = ">=1.1.0,<1.4" 
[package.extras] dev = ["hypothesis (>=6.70.0)", "pytest (>=7.1.0)"] +[[package]] +name = "tablestore" +version = "6.1.0" +description = "Aliyun TableStore(OTS) SDK" +optional = false +python-versions = "*" +groups = ["vdb"] +files = [ + {file = "tablestore-6.1.0.tar.gz", hash = "sha256:bfe6a3e0fe88a230729723c357f4a46b8869a06a4b936db20692ed587a721c1c"}, +] + +[package.dependencies] +certifi = ">=2016.2.28" +crc32c = ">=2.7.1" +enum34 = ">=1.1.6" +flatbuffers = ">=22.9.24" +future = ">=0.16.0" +numpy = ">=1.11.0" +protobuf = ">=3.20.0,<=5.27.4" +six = ">=1.11.0" +urllib3 = ">=1.14" + [[package]] name = "tabulate" version = "0.9.0" @@ -8238,21 +8442,46 @@ files = [ [package.extras] widechars = ["wcwidth"] +[[package]] +name = "tcvdb-text" +version = "1.1.1" +description = "Tencent VectorDB Sparse Vector SDK" +optional = false +python-versions = ">=3" +groups = ["vdb"] +files = [ + {file = "tcvdb_text-1.1.1-py3-none-any.whl", hash = "sha256:981eb2323c0668129942c066de05e8f0d2165be36f567877906646dea07d17a9"}, + {file = "tcvdb_text-1.1.1.tar.gz", hash = "sha256:db36b5d7b640b194ae72c0c429718c9613b8ef9de5fffb9d510aba5be75ff1cb"}, +] + +[package.dependencies] +jieba = ">=0.42.1" +mmh3 = "*" +numpy = "*" +tqdm = "*" + [[package]] name = "tcvectordb" -version = "1.3.2" +version = "1.6.4" description = "Tencent VectorDB Python SDK" optional = false python-versions = ">=3" groups = ["vdb"] files = [ - {file = "tcvectordb-1.3.2-py3-none-any.whl", hash = "sha256:c4b6922d5df4cf14fcd3e61220d9374d1d53ec7270c254216ae35f8a752908f3"}, - {file = "tcvectordb-1.3.2.tar.gz", hash = "sha256:2772f5871a69744ffc7c970b321312d626078533a721de3c744059a81aab419e"}, + {file = "tcvectordb-1.6.4-py3-none-any.whl", hash = "sha256:06ef13e7edb4575b04615065fc90e1a28374e318ada305f3786629aec5c9318a"}, + {file = "tcvectordb-1.6.4.tar.gz", hash = "sha256:6fb18e15ccc6744d5147e9bbd781f84df3d66112de7d9cc615878b3f72d3a29a"}, ] [package.dependencies] -cos-python-sdk-v5 = ">=1.9.26" +cachetools = "*" 
+cos-python-sdk-v5 = "*" +grpcio = "*" +grpcio-tools = "*" +numpy = "*" requests = "*" +tcvdb-text = "*" +ujson = "5.9.0" +urllib3 = "*" [[package]] name = "tenacity" @@ -8603,6 +8832,18 @@ rich = ">=10.11.0" shellingham = ">=1.3.0" typing-extensions = ">=3.7.4.3" +[[package]] +name = "types-aiofiles" +version = "24.1.0.20250326" +description = "Typing stubs for aiofiles" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "types_aiofiles-24.1.0.20250326-py3-none-any.whl", hash = "sha256:dfb58c9aa18bd449e80fb5d7f49dc3dd20d31de920a46223a61798ee4a521a70"}, + {file = "types_aiofiles-24.1.0.20250326.tar.gz", hash = "sha256:c4bbe432fd043911ba83fb635456f5cc54f6d05fda2aadf6bef12a84f07a6efe"}, +] + [[package]] name = "types-beautifulsoup4" version = "4.12.0.20250204" @@ -8618,6 +8859,42 @@ files = [ [package.dependencies] types-html5lib = "*" +[[package]] +name = "types-cachetools" +version = "5.5.0.20240820" +description = "Typing stubs for cachetools" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "types-cachetools-5.5.0.20240820.tar.gz", hash = "sha256:b888ab5c1a48116f7799cd5004b18474cd82b5463acb5ffb2db2fc9c7b053bc0"}, + {file = "types_cachetools-5.5.0.20240820-py3-none-any.whl", hash = "sha256:efb2ed8bf27a4b9d3ed70d33849f536362603a90b8090a328acf0cd42fda82e2"}, +] + +[[package]] +name = "types-colorama" +version = "0.4.15.20240311" +description = "Typing stubs for colorama" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "types-colorama-0.4.15.20240311.tar.gz", hash = "sha256:a28e7f98d17d2b14fb9565d32388e419f4108f557a7d939a66319969b2b99c7a"}, + {file = "types_colorama-0.4.15.20240311-py3-none-any.whl", hash = "sha256:6391de60ddc0db3f147e31ecb230006a6823e81e380862ffca1e4695c13a0b8e"}, +] + +[[package]] +name = "types-defusedxml" +version = "0.7.0.20240218" +description = "Typing stubs for defusedxml" +optional = false +python-versions = ">=3.8" +groups = ["dev"] 
+files = [ + {file = "types-defusedxml-0.7.0.20240218.tar.gz", hash = "sha256:05688a7724dc66ea74c4af5ca0efc554a150c329cb28c13a64902cab878d06ed"}, + {file = "types_defusedxml-0.7.0.20240218-py3-none-any.whl", hash = "sha256:2b7f3c5ca14fdbe728fab0b846f5f7eb98c4bd4fd2b83d25f79e923caa790ced"}, +] + [[package]] name = "types-deprecated" version = "1.2.15.20250304" @@ -8630,6 +8907,18 @@ files = [ {file = "types_deprecated-1.2.15.20250304.tar.gz", hash = "sha256:c329030553029de5cc6cb30f269c11f4e00e598c4241290179f63cda7d33f719"}, ] +[[package]] +name = "types-docutils" +version = "0.21.0.20241128" +description = "Typing stubs for docutils" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "types_docutils-0.21.0.20241128-py3-none-any.whl", hash = "sha256:e0409204009639e9b0bf4521eeabe58b5e574ce9c0db08421c2ac26c32be0039"}, + {file = "types_docutils-0.21.0.20241128.tar.gz", hash = "sha256:4dd059805b83ac6ec5a223699195c4e9eeb0446a4f7f2aeff1759a4a7cc17473"}, +] + [[package]] name = "types-flask-cors" version = "5.0.0.20240902" @@ -8661,6 +8950,34 @@ files = [ Flask = ">=2.0.0" Flask-SQLAlchemy = ">=3.0.1" +[[package]] +name = "types-gevent" +version = "24.11.0.20250401" +description = "Typing stubs for gevent" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "types_gevent-24.11.0.20250401-py3-none-any.whl", hash = "sha256:6764faf861ea99250c38179c58076392c44019ac3393029f71b06c4a15e8c1d1"}, + {file = "types_gevent-24.11.0.20250401.tar.gz", hash = "sha256:1443f796a442062698e67d818fca50aa88067dee4021d457a7c0c6bedd6f46ca"}, +] + +[package.dependencies] +types-greenlet = "*" +types-psutil = "*" + +[[package]] +name = "types-greenlet" +version = "3.1.0.20250401" +description = "Typing stubs for greenlet" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "types_greenlet-3.1.0.20250401-py3-none-any.whl", hash = "sha256:77987f3249b0f21415dc0254057e1ae4125a696a9bba28b0bcb67ee9e3dc14f6"}, 
+ {file = "types_greenlet-3.1.0.20250401.tar.gz", hash = "sha256:949389b64c34ca9472f6335189e9fe0b2e9704436d4f0850e39e9b7145909082"}, +] + [[package]] name = "types-html5lib" version = "1.1.11.20241018" @@ -8673,6 +8990,54 @@ files = [ {file = "types_html5lib-1.1.11.20241018-py3-none-any.whl", hash = "sha256:3f1e064d9ed2c289001ae6392c84c93833abb0816165c6ff0abfc304a779f403"}, ] +[[package]] +name = "types-markdown" +version = "3.7.0.20250322" +description = "Typing stubs for Markdown" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "types_markdown-3.7.0.20250322-py3-none-any.whl", hash = "sha256:7e855503027b4290355a310fb834871940d9713da7c111f3e98a5e1cbc77acfb"}, + {file = "types_markdown-3.7.0.20250322.tar.gz", hash = "sha256:a48ed82dfcb6954592a10f104689d2d44df9125ce51b3cee20e0198a5216d55c"}, +] + +[[package]] +name = "types-oauthlib" +version = "3.2.0.20250403" +description = "Typing stubs for oauthlib" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "types_oauthlib-3.2.0.20250403-py3-none-any.whl", hash = "sha256:02466f91a01522adfa4aaf0d7e76274f00a102eed40034117c5ecae768a2571e"}, + {file = "types_oauthlib-3.2.0.20250403.tar.gz", hash = "sha256:40a4fcfb2e95235e399b5c0dd1cbe9d8c4b19415c09fb54c648d3397e02e0425"}, +] + +[[package]] +name = "types-objgraph" +version = "3.6.0.20240907" +description = "Typing stubs for objgraph" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "types-objgraph-3.6.0.20240907.tar.gz", hash = "sha256:2e3dee675843ae387889731550b0ddfed06e9420946cf78a4bca565b5fc53634"}, + {file = "types_objgraph-3.6.0.20240907-py3-none-any.whl", hash = "sha256:67207633a9b5789ee1911d740b269c3371081b79c0d8f68b00e7b8539f5c43f5"}, +] + +[[package]] +name = "types-olefile" +version = "0.47.0.20240806" +description = "Typing stubs for olefile" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = 
"types-olefile-0.47.0.20240806.tar.gz", hash = "sha256:96490f208cbb449a52283855319d73688ba9167ae58858ef8c506bf7ca2c6b67"}, + {file = "types_olefile-0.47.0.20240806-py3-none-any.whl", hash = "sha256:c760a3deab7adb87a80d33b0e4edbbfbab865204a18d5121746022d7f8555118"}, +] + [[package]] name = "types-openpyxl" version = "3.1.5.20250306" @@ -8685,28 +9050,40 @@ files = [ {file = "types_openpyxl-3.1.5.20250306.tar.gz", hash = "sha256:aa7ad2425e8020ff46a31633becfe1f3c64114498d964c536199f654b464e6bc"}, ] +[[package]] +name = "types-pexpect" +version = "4.9.0.20241208" +description = "Typing stubs for pexpect" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "types_pexpect-4.9.0.20241208-py3-none-any.whl", hash = "sha256:1928f478528454f0fea3495c16cf1ee2e67fca5c9fe97d60b868ac48c1fd5633"}, + {file = "types_pexpect-4.9.0.20241208.tar.gz", hash = "sha256:bbca0d0819947a719989a5cfe83641d9212bef893e2f0a7a01e47926bc82401d"}, +] + [[package]] name = "types-protobuf" -version = "5.29.1.20250315" +version = "5.29.1.20250403" description = "Typing stubs for protobuf" optional = false python-versions = ">=3.9" groups = ["dev"] files = [ - {file = "types_protobuf-5.29.1.20250315-py3-none-any.whl", hash = "sha256:57efd51fd0979d1f5e1d94053d1e7cfff9c028e8d05b17e341b91a1c7fce37c4"}, - {file = "types_protobuf-5.29.1.20250315.tar.gz", hash = "sha256:0b05bc34621d046de54b94fddd5f4eb3bf849fe2e13a50f8fb8e89f35045ff49"}, + {file = "types_protobuf-5.29.1.20250403-py3-none-any.whl", hash = "sha256:c71de04106a2d54e5b2173d0a422058fae0ef2d058d70cf369fb797bf61ffa59"}, + {file = "types_protobuf-5.29.1.20250403.tar.gz", hash = "sha256:7ff44f15022119c9d7558ce16e78b2d485bf7040b4fadced4dd069bb5faf77a2"}, ] [[package]] name = "types-psutil" -version = "6.1.0.20241221" +version = "7.0.0.20250401" description = "Typing stubs for psutil" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" groups = ["dev"] files = [ - {file = 
"types_psutil-6.1.0.20241221-py3-none-any.whl", hash = "sha256:8498dbe13285a9ba7d4b2fa934c569cc380efc74e3dacdb34ae16d2cdf389ec3"}, - {file = "types_psutil-6.1.0.20241221.tar.gz", hash = "sha256:600f5a36bd5e0eb8887f0e3f3ff2cf154d90690ad8123c8a707bba4ab94d3185"}, + {file = "types_psutil-7.0.0.20250401-py3-none-any.whl", hash = "sha256:ed23f7140368104afe4e05a6085a5fa56fbe8c880a0f4dfe8d63e041106071ed"}, + {file = "types_psutil-7.0.0.20250401.tar.gz", hash = "sha256:2a7d663c0888a079fc1643ebc109ad12e57a21c9552a9e2035da504191336dbf"}, ] [[package]] @@ -8721,6 +9098,33 @@ files = [ {file = "types_psycopg2-2.9.21.20250318.tar.gz", hash = "sha256:eb6eac5bfb16adfd5f16b818918b9e26a40ede147e0f2bbffdf53a6ef7025a87"}, ] +[[package]] +name = "types-pygments" +version = "2.19.0.20250305" +description = "Typing stubs for Pygments" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "types_pygments-2.19.0.20250305-py3-none-any.whl", hash = "sha256:ca88aae5ec426f9b107c0f7adc36dc096d2882d930a49f679eaf4b8b643db35d"}, + {file = "types_pygments-2.19.0.20250305.tar.gz", hash = "sha256:044c50e80ecd4128c00a7268f20355e16f5c55466d3d49dfda09be920af40b4b"}, +] + +[package.dependencies] +types-docutils = "*" + +[[package]] +name = "types-pymysql" +version = "1.1.0.20241103" +description = "Typing stubs for PyMySQL" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "types-PyMySQL-1.1.0.20241103.tar.gz", hash = "sha256:a7628542919a0ba87625fb79eefb2a2de45fb4ad32afe6e561e8f2f27fb58b8c"}, + {file = "types_PyMySQL-1.1.0.20241103-py3-none-any.whl", hash = "sha256:1a32efd8a74b5bf74c4de92a86c1cc6edaf3802dcfd5546635ab501eb5e3c096"}, +] + [[package]] name = "types-python-dateutil" version = "2.9.0.20241206" @@ -8735,82 +9139,166 @@ files = [ [[package]] name = "types-pytz" -version = "2024.2.0.20241221" +version = "2025.1.0.20250318" description = "Typing stubs for pytz" optional = false -python-versions = ">=3.8" -groups = ["main", "dev"] 
+python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "types_pytz-2024.2.0.20241221-py3-none-any.whl", hash = "sha256:8fc03195329c43637ed4f593663df721fef919b60a969066e22606edf0b53ad5"}, - {file = "types_pytz-2024.2.0.20241221.tar.gz", hash = "sha256:06d7cde9613e9f7504766a0554a270c369434b50e00975b3a4a0f6eed0f2c1a9"}, + {file = "types_pytz-2025.1.0.20250318-py3-none-any.whl", hash = "sha256:04dba4907c5415777083f9548693c6d9f80ec53adcaff55a38526a3f8ddcae04"}, + {file = "types_pytz-2025.1.0.20250318.tar.gz", hash = "sha256:97e0e35184c6fe14e3a5014512057f2c57bb0c6582d63c1cfcc4809f82180449"}, +] + +[[package]] +name = "types-pywin32" +version = "310.0.0.20250319" +description = "Typing stubs for pywin32" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "types_pywin32-310.0.0.20250319-py3-none-any.whl", hash = "sha256:baeb558a82251f7d430d135036b054740893902fdee3f9fe568322730ff49779"}, + {file = "types_pywin32-310.0.0.20250319.tar.gz", hash = "sha256:4d28fb85b3f268a92905a7242df48c530c847cfe4cdb112386101ab6407660d8"}, ] [[package]] name = "types-pyyaml" -version = "6.0.12.20241230" +version = "6.0.12.20250402" description = "Typing stubs for PyYAML" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" groups = ["dev"] files = [ - {file = "types_PyYAML-6.0.12.20241230-py3-none-any.whl", hash = "sha256:fa4d32565219b68e6dee5f67534c722e53c00d1cfc09c435ef04d7353e1e96e6"}, - {file = "types_pyyaml-6.0.12.20241230.tar.gz", hash = "sha256:7f07622dbd34bb9c8b264fe860a17e0efcad00d50b5f27e93984909d9363498c"}, + {file = "types_pyyaml-6.0.12.20250402-py3-none-any.whl", hash = "sha256:652348fa9e7a203d4b0d21066dfb00760d3cbd5a15ebb7cf8d33c88a49546681"}, + {file = "types_pyyaml-6.0.12.20250402.tar.gz", hash = "sha256:d7c13c3e6d335b6af4b0122a01ff1d270aba84ab96d1a1a1063ecba3e13ec075"}, ] [[package]] name = "types-regex" -version = "2024.11.6.20250318" +version = "2024.11.6.20250403" description = "Typing stubs for regex" optional = 
false python-versions = ">=3.9" groups = ["dev"] files = [ - {file = "types_regex-2024.11.6.20250318-py3-none-any.whl", hash = "sha256:9309fe5918ee7ffe859c04c18040697655fade366c4dc844bbebe86976a9980b"}, - {file = "types_regex-2024.11.6.20250318.tar.gz", hash = "sha256:6d472d0acf37b138cb32f67bd5ab1e7a200e94da8c1aa93ca3625a63e2efe1f3"}, + {file = "types_regex-2024.11.6.20250403-py3-none-any.whl", hash = "sha256:e22c0f67d73f4b4af6086a340f387b6f7d03bed8a0bb306224b75c51a29b0001"}, + {file = "types_regex-2024.11.6.20250403.tar.gz", hash = "sha256:3fdf2a70bbf830de4b3a28e9649a52d43dabb57cdb18fbfe2252eefb53666665"}, ] [[package]] name = "types-requests" -version = "2.32.0.20250306" +version = "2.32.0.20250328" description = "Typing stubs for requests" optional = false python-versions = ">=3.9" groups = ["dev"] files = [ - {file = "types_requests-2.32.0.20250306-py3-none-any.whl", hash = "sha256:25f2cbb5c8710b2022f8bbee7b2b66f319ef14aeea2f35d80f18c9dbf3b60a0b"}, - {file = "types_requests-2.32.0.20250306.tar.gz", hash = "sha256:0962352694ec5b2f95fda877ee60a159abdf84a0fc6fdace599f20acb41a03d1"}, + {file = "types_requests-2.32.0.20250328-py3-none-any.whl", hash = "sha256:72ff80f84b15eb3aa7a8e2625fffb6a93f2ad5a0c20215fc1dcfa61117bcb2a2"}, + {file = "types_requests-2.32.0.20250328.tar.gz", hash = "sha256:c9e67228ea103bd811c96984fac36ed2ae8da87a36a633964a21f199d60baf32"}, ] [package.dependencies] urllib3 = ">=2" +[[package]] +name = "types-requests-oauthlib" +version = "2.0.0.20250306" +description = "Typing stubs for requests-oauthlib" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "types_requests_oauthlib-2.0.0.20250306-py3-none-any.whl", hash = "sha256:37707de81d9ce54894afcccd70d4a845dbe4c59e747908faaeba59a96453d993"}, + {file = "types_requests_oauthlib-2.0.0.20250306.tar.gz", hash = "sha256:92e5f1ed35689b1804fdcd60b7ac39b0bd440a4b96693685879bc835b334797f"}, +] + +[package.dependencies] +types-oauthlib = "*" +types-requests = "*" + 
+[[package]] +name = "types-shapely" +version = "2.0.0.20250404" +description = "Typing stubs for shapely" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "types_shapely-2.0.0.20250404-py3-none-any.whl", hash = "sha256:170fb92f5c168a120db39b3287697fdec5c93ef3e1ad15e52552c36b25318821"}, + {file = "types_shapely-2.0.0.20250404.tar.gz", hash = "sha256:863f540b47fa626c33ae64eae06df171f9ab0347025d4458d2df496537296b4f"}, +] + +[package.dependencies] +numpy = ">=1.20" + +[[package]] +name = "types-simplejson" +version = "3.20.0.20250326" +description = "Typing stubs for simplejson" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "types_simplejson-3.20.0.20250326-py3-none-any.whl", hash = "sha256:db1ddea7b8f7623b27a137578f22fc6c618db8c83ccfb1828ca0d2f0ec11efa7"}, + {file = "types_simplejson-3.20.0.20250326.tar.gz", hash = "sha256:b2689bc91e0e672d7a5a947b4cb546b76ae7ddc2899c6678e72a10bf96cd97d2"}, +] + [[package]] name = "types-six" -version = "1.17.0.20250304" +version = "1.17.0.20250403" description = "Typing stubs for six" optional = false python-versions = ">=3.9" groups = ["dev"] files = [ - {file = "types_six-1.17.0.20250304-py3-none-any.whl", hash = "sha256:e482df1d439375f4b7c1f2540b1b8584aea82850164a296203ead4a7024fe14f"}, - {file = "types_six-1.17.0.20250304.tar.gz", hash = "sha256:eeb240f9faec63ddd0498d6c0b6abd0496b154a66f960c004d4d733cf31bb4bd"}, + {file = "types_six-1.17.0.20250403-py3-none-any.whl", hash = "sha256:0bbb20fc34a18163afe7cac70b85864bd6937e6d73413c5b8f424def28760ae8"}, + {file = "types_six-1.17.0.20250403.tar.gz", hash = "sha256:82076f86e6e672a95adbf8b52625b1b3c72a8b9a893180344c1a02a6daabead6"}, ] +[[package]] +name = "types-tensorflow" +version = "2.18.0.20250404" +description = "Typing stubs for tensorflow" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "types_tensorflow-2.18.0.20250404-py3-none-any.whl", hash = 
"sha256:4ad86534e6cfd6b36b2c97239ef9d122c44b167b25630b7c873a1483f9befd15"}, + {file = "types_tensorflow-2.18.0.20250404.tar.gz", hash = "sha256:b38a427bbec805e4879d248f070baea802673c04cc5ccbe5979d742faa160670"}, +] + +[package.dependencies] +numpy = ">=1.20" +types-protobuf = "*" +types-requests = "*" + [[package]] name = "types-tqdm" -version = "4.67.0.20250301" +version = "4.67.0.20250404" description = "Typing stubs for tqdm" optional = false python-versions = ">=3.9" groups = ["dev"] files = [ - {file = "types_tqdm-4.67.0.20250301-py3-none-any.whl", hash = "sha256:8af97deb8e6874af833555dc1fe0fcd456b1a789470bf6cd8813d4e7ee4f6c5b"}, - {file = "types_tqdm-4.67.0.20250301.tar.gz", hash = "sha256:5e89a38ad89b867823368eb97d9f90d2fc69806bb055dde62716a05da62b5e0d"}, + {file = "types_tqdm-4.67.0.20250404-py3-none-any.whl", hash = "sha256:4a9b897eb4036f757240f4cb4a794f296265c04de46fdd058e453891f0186eed"}, + {file = "types_tqdm-4.67.0.20250404.tar.gz", hash = "sha256:e9997c655ffbba3ab78f4418b5511c05a54e76824d073d212166dc73aa56c768"}, ] [package.dependencies] types-requests = "*" +[[package]] +name = "types-ujson" +version = "5.10.0.20250326" +description = "Typing stubs for ujson" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "types_ujson-5.10.0.20250326-py3-none-any.whl", hash = "sha256:acc0913f569def62ef6a892c8a47703f65d05669a3252391a97765cf207dca5b"}, + {file = "types_ujson-5.10.0.20250326.tar.gz", hash = "sha256:5469e05f2c31ecb3c4c0267cc8fe41bcd116826fbb4ded69801a645c687dd014"}, +] + [[package]] name = "typing-extensions" version = "4.12.2" @@ -8853,90 +9341,77 @@ files = [ [[package]] name = "ujson" -version = "5.10.0" +version = "5.9.0" description = "Ultra fast JSON encoder and decoder for Python" optional = false python-versions = ">=3.8" groups = ["vdb"] files = [ - {file = "ujson-5.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2601aa9ecdbee1118a1c2065323bda35e2c5a2cf0797ef4522d485f9d3ef65bd"}, - {file = 
"ujson-5.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:348898dd702fc1c4f1051bc3aacbf894caa0927fe2c53e68679c073375f732cf"}, - {file = "ujson-5.10.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22cffecf73391e8abd65ef5f4e4dd523162a3399d5e84faa6aebbf9583df86d6"}, - {file = "ujson-5.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26b0e2d2366543c1bb4fbd457446f00b0187a2bddf93148ac2da07a53fe51569"}, - {file = "ujson-5.10.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:caf270c6dba1be7a41125cd1e4fc7ba384bf564650beef0df2dd21a00b7f5770"}, - {file = "ujson-5.10.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a245d59f2ffe750446292b0094244df163c3dc96b3ce152a2c837a44e7cda9d1"}, - {file = "ujson-5.10.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:94a87f6e151c5f483d7d54ceef83b45d3a9cca7a9cb453dbdbb3f5a6f64033f5"}, - {file = "ujson-5.10.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:29b443c4c0a113bcbb792c88bea67b675c7ca3ca80c3474784e08bba01c18d51"}, - {file = "ujson-5.10.0-cp310-cp310-win32.whl", hash = "sha256:c18610b9ccd2874950faf474692deee4223a994251bc0a083c114671b64e6518"}, - {file = "ujson-5.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:924f7318c31874d6bb44d9ee1900167ca32aa9b69389b98ecbde34c1698a250f"}, - {file = "ujson-5.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a5b366812c90e69d0f379a53648be10a5db38f9d4ad212b60af00bd4048d0f00"}, - {file = "ujson-5.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:502bf475781e8167f0f9d0e41cd32879d120a524b22358e7f205294224c71126"}, - {file = "ujson-5.10.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b91b5d0d9d283e085e821651184a647699430705b15bf274c7896f23fe9c9d8"}, - {file = "ujson-5.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:129e39af3a6d85b9c26d5577169c21d53821d8cf68e079060602e861c6e5da1b"}, - 
{file = "ujson-5.10.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f77b74475c462cb8b88680471193064d3e715c7c6074b1c8c412cb526466efe9"}, - {file = "ujson-5.10.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:7ec0ca8c415e81aa4123501fee7f761abf4b7f386aad348501a26940beb1860f"}, - {file = "ujson-5.10.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:ab13a2a9e0b2865a6c6db9271f4b46af1c7476bfd51af1f64585e919b7c07fd4"}, - {file = "ujson-5.10.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:57aaf98b92d72fc70886b5a0e1a1ca52c2320377360341715dd3933a18e827b1"}, - {file = "ujson-5.10.0-cp311-cp311-win32.whl", hash = "sha256:2987713a490ceb27edff77fb184ed09acdc565db700ee852823c3dc3cffe455f"}, - {file = "ujson-5.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:f00ea7e00447918ee0eff2422c4add4c5752b1b60e88fcb3c067d4a21049a720"}, - {file = "ujson-5.10.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:98ba15d8cbc481ce55695beee9f063189dce91a4b08bc1d03e7f0152cd4bbdd5"}, - {file = "ujson-5.10.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a9d2edbf1556e4f56e50fab7d8ff993dbad7f54bac68eacdd27a8f55f433578e"}, - {file = "ujson-5.10.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6627029ae4f52d0e1a2451768c2c37c0c814ffc04f796eb36244cf16b8e57043"}, - {file = "ujson-5.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8ccb77b3e40b151e20519c6ae6d89bfe3f4c14e8e210d910287f778368bb3d1"}, - {file = "ujson-5.10.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3caf9cd64abfeb11a3b661329085c5e167abbe15256b3b68cb5d914ba7396f3"}, - {file = "ujson-5.10.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6e32abdce572e3a8c3d02c886c704a38a1b015a1fb858004e03d20ca7cecbb21"}, - {file = "ujson-5.10.0-cp312-cp312-musllinux_1_2_i686.whl", hash = 
"sha256:a65b6af4d903103ee7b6f4f5b85f1bfd0c90ba4eeac6421aae436c9988aa64a2"}, - {file = "ujson-5.10.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:604a046d966457b6cdcacc5aa2ec5314f0e8c42bae52842c1e6fa02ea4bda42e"}, - {file = "ujson-5.10.0-cp312-cp312-win32.whl", hash = "sha256:6dea1c8b4fc921bf78a8ff00bbd2bfe166345f5536c510671bccececb187c80e"}, - {file = "ujson-5.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:38665e7d8290188b1e0d57d584eb8110951a9591363316dd41cf8686ab1d0abc"}, - {file = "ujson-5.10.0-cp313-cp313-macosx_10_9_x86_64.whl", hash = "sha256:618efd84dc1acbd6bff8eaa736bb6c074bfa8b8a98f55b61c38d4ca2c1f7f287"}, - {file = "ujson-5.10.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:38d5d36b4aedfe81dfe251f76c0467399d575d1395a1755de391e58985ab1c2e"}, - {file = "ujson-5.10.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:67079b1f9fb29ed9a2914acf4ef6c02844b3153913eb735d4bf287ee1db6e557"}, - {file = "ujson-5.10.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d7d0e0ceeb8fe2468c70ec0c37b439dd554e2aa539a8a56365fd761edb418988"}, - {file = "ujson-5.10.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:59e02cd37bc7c44d587a0ba45347cc815fb7a5fe48de16bf05caa5f7d0d2e816"}, - {file = "ujson-5.10.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:2a890b706b64e0065f02577bf6d8ca3b66c11a5e81fb75d757233a38c07a1f20"}, - {file = "ujson-5.10.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:621e34b4632c740ecb491efc7f1fcb4f74b48ddb55e65221995e74e2d00bbff0"}, - {file = "ujson-5.10.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b9500e61fce0cfc86168b248104e954fead61f9be213087153d272e817ec7b4f"}, - {file = "ujson-5.10.0-cp313-cp313-win32.whl", hash = "sha256:4c4fc16f11ac1612f05b6f5781b384716719547e142cfd67b65d035bd85af165"}, - {file = "ujson-5.10.0-cp313-cp313-win_amd64.whl", hash = 
"sha256:4573fd1695932d4f619928fd09d5d03d917274381649ade4328091ceca175539"}, - {file = "ujson-5.10.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a984a3131da7f07563057db1c3020b1350a3e27a8ec46ccbfbf21e5928a43050"}, - {file = "ujson-5.10.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:73814cd1b9db6fc3270e9d8fe3b19f9f89e78ee9d71e8bd6c9a626aeaeaf16bd"}, - {file = "ujson-5.10.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:61e1591ed9376e5eddda202ec229eddc56c612b61ac6ad07f96b91460bb6c2fb"}, - {file = "ujson-5.10.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2c75269f8205b2690db4572a4a36fe47cd1338e4368bc73a7a0e48789e2e35a"}, - {file = "ujson-5.10.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7223f41e5bf1f919cd8d073e35b229295aa8e0f7b5de07ed1c8fddac63a6bc5d"}, - {file = "ujson-5.10.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:d4dc2fd6b3067c0782e7002ac3b38cf48608ee6366ff176bbd02cf969c9c20fe"}, - {file = "ujson-5.10.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:232cc85f8ee3c454c115455195a205074a56ff42608fd6b942aa4c378ac14dd7"}, - {file = "ujson-5.10.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:cc6139531f13148055d691e442e4bc6601f6dba1e6d521b1585d4788ab0bfad4"}, - {file = "ujson-5.10.0-cp38-cp38-win32.whl", hash = "sha256:e7ce306a42b6b93ca47ac4a3b96683ca554f6d35dd8adc5acfcd55096c8dfcb8"}, - {file = "ujson-5.10.0-cp38-cp38-win_amd64.whl", hash = "sha256:e82d4bb2138ab05e18f089a83b6564fee28048771eb63cdecf4b9b549de8a2cc"}, - {file = "ujson-5.10.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:dfef2814c6b3291c3c5f10065f745a1307d86019dbd7ea50e83504950136ed5b"}, - {file = "ujson-5.10.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4734ee0745d5928d0ba3a213647f1c4a74a2a28edc6d27b2d6d5bd9fa4319e27"}, - {file = "ujson-5.10.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:d47ebb01bd865fdea43da56254a3930a413f0c5590372a1241514abae8aa7c76"}, - {file = "ujson-5.10.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dee5e97c2496874acbf1d3e37b521dd1f307349ed955e62d1d2f05382bc36dd5"}, - {file = "ujson-5.10.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7490655a2272a2d0b072ef16b0b58ee462f4973a8f6bbe64917ce5e0a256f9c0"}, - {file = "ujson-5.10.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:ba17799fcddaddf5c1f75a4ba3fd6441f6a4f1e9173f8a786b42450851bd74f1"}, - {file = "ujson-5.10.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:2aff2985cef314f21d0fecc56027505804bc78802c0121343874741650a4d3d1"}, - {file = "ujson-5.10.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:ad88ac75c432674d05b61184178635d44901eb749786c8eb08c102330e6e8996"}, - {file = "ujson-5.10.0-cp39-cp39-win32.whl", hash = "sha256:2544912a71da4ff8c4f7ab5606f947d7299971bdd25a45e008e467ca638d13c9"}, - {file = "ujson-5.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:3ff201d62b1b177a46f113bb43ad300b424b7847f9c5d38b1b4ad8f75d4a282a"}, - {file = "ujson-5.10.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:5b6fee72fa77dc172a28f21693f64d93166534c263adb3f96c413ccc85ef6e64"}, - {file = "ujson-5.10.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:61d0af13a9af01d9f26d2331ce49bb5ac1fb9c814964018ac8df605b5422dcb3"}, - {file = "ujson-5.10.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecb24f0bdd899d368b715c9e6664166cf694d1e57be73f17759573a6986dd95a"}, - {file = "ujson-5.10.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fbd8fd427f57a03cff3ad6574b5e299131585d9727c8c366da4624a9069ed746"}, - {file = "ujson-5.10.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:beeaf1c48e32f07d8820c705ff8e645f8afa690cca1544adba4ebfa067efdc88"}, - {file 
= "ujson-5.10.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:baed37ea46d756aca2955e99525cc02d9181de67f25515c468856c38d52b5f3b"}, - {file = "ujson-5.10.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7663960f08cd5a2bb152f5ee3992e1af7690a64c0e26d31ba7b3ff5b2ee66337"}, - {file = "ujson-5.10.0-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:d8640fb4072d36b08e95a3a380ba65779d356b2fee8696afeb7794cf0902d0a1"}, - {file = "ujson-5.10.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78778a3aa7aafb11e7ddca4e29f46bc5139131037ad628cc10936764282d6753"}, - {file = "ujson-5.10.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b0111b27f2d5c820e7f2dbad7d48e3338c824e7ac4d2a12da3dc6061cc39c8e6"}, - {file = "ujson-5.10.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:c66962ca7565605b355a9ed478292da628b8f18c0f2793021ca4425abf8b01e5"}, - {file = "ujson-5.10.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ba43cc34cce49cf2d4bc76401a754a81202d8aa926d0e2b79f0ee258cb15d3a4"}, - {file = "ujson-5.10.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:ac56eb983edce27e7f51d05bc8dd820586c6e6be1c5216a6809b0c668bb312b8"}, - {file = "ujson-5.10.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f44bd4b23a0e723bf8b10628288c2c7c335161d6840013d4d5de20e48551773b"}, - {file = "ujson-5.10.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7c10f4654e5326ec14a46bcdeb2b685d4ada6911050aa8baaf3501e57024b804"}, - {file = "ujson-5.10.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0de4971a89a762398006e844ae394bd46991f7c385d7a6a3b93ba229e6dac17e"}, - {file = "ujson-5.10.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:e1402f0564a97d2a52310ae10a64d25bcef94f8dd643fcf5d310219d915484f7"}, - {file = "ujson-5.10.0.tar.gz", hash = 
"sha256:b3cd8f3c5d8c7738257f1018880444f7b7d9b66232c64649f562d7ba86ad4bc1"}, + {file = "ujson-5.9.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ab71bf27b002eaf7d047c54a68e60230fbd5cd9da60de7ca0aa87d0bccead8fa"}, + {file = "ujson-5.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7a365eac66f5aa7a7fdf57e5066ada6226700884fc7dce2ba5483538bc16c8c5"}, + {file = "ujson-5.9.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e015122b337858dba5a3dc3533af2a8fc0410ee9e2374092f6a5b88b182e9fcc"}, + {file = "ujson-5.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:779a2a88c53039bebfbccca934430dabb5c62cc179e09a9c27a322023f363e0d"}, + {file = "ujson-5.9.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10ca3c41e80509fd9805f7c149068fa8dbee18872bbdc03d7cca928926a358d5"}, + {file = "ujson-5.9.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4a566e465cb2fcfdf040c2447b7dd9718799d0d90134b37a20dff1e27c0e9096"}, + {file = "ujson-5.9.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:f833c529e922577226a05bc25b6a8b3eb6c4fb155b72dd88d33de99d53113124"}, + {file = "ujson-5.9.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b68a0caab33f359b4cbbc10065c88e3758c9f73a11a65a91f024b2e7a1257106"}, + {file = "ujson-5.9.0-cp310-cp310-win32.whl", hash = "sha256:7cc7e605d2aa6ae6b7321c3ae250d2e050f06082e71ab1a4200b4ae64d25863c"}, + {file = "ujson-5.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:a6d3f10eb8ccba4316a6b5465b705ed70a06011c6f82418b59278fbc919bef6f"}, + {file = "ujson-5.9.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3b23bbb46334ce51ddb5dded60c662fbf7bb74a37b8f87221c5b0fec1ec6454b"}, + {file = "ujson-5.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6974b3a7c17bbf829e6c3bfdc5823c67922e44ff169851a755eab79a3dd31ec0"}, + {file = "ujson-5.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:b5964ea916edfe24af1f4cc68488448fbb1ec27a3ddcddc2b236da575c12c8ae"}, + {file = "ujson-5.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ba7cac47dd65ff88571eceeff48bf30ed5eb9c67b34b88cb22869b7aa19600d"}, + {file = "ujson-5.9.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6bbd91a151a8f3358c29355a491e915eb203f607267a25e6ab10531b3b157c5e"}, + {file = "ujson-5.9.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:829a69d451a49c0de14a9fecb2a2d544a9b2c884c2b542adb243b683a6f15908"}, + {file = "ujson-5.9.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:a807ae73c46ad5db161a7e883eec0fbe1bebc6a54890152ccc63072c4884823b"}, + {file = "ujson-5.9.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:8fc2aa18b13d97b3c8ccecdf1a3c405f411a6e96adeee94233058c44ff92617d"}, + {file = "ujson-5.9.0-cp311-cp311-win32.whl", hash = "sha256:70e06849dfeb2548be48fdd3ceb53300640bc8100c379d6e19d78045e9c26120"}, + {file = "ujson-5.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:7309d063cd392811acc49b5016728a5e1b46ab9907d321ebbe1c2156bc3c0b99"}, + {file = "ujson-5.9.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:20509a8c9f775b3a511e308bbe0b72897ba6b800767a7c90c5cca59d20d7c42c"}, + {file = "ujson-5.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b28407cfe315bd1b34f1ebe65d3bd735d6b36d409b334100be8cdffae2177b2f"}, + {file = "ujson-5.9.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9d302bd17989b6bd90d49bade66943c78f9e3670407dbc53ebcf61271cadc399"}, + {file = "ujson-5.9.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f21315f51e0db8ee245e33a649dd2d9dce0594522de6f278d62f15f998e050e"}, + {file = "ujson-5.9.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5635b78b636a54a86fdbf6f027e461aa6c6b948363bdf8d4fbb56a42b7388320"}, + {file = 
"ujson-5.9.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:82b5a56609f1235d72835ee109163c7041b30920d70fe7dac9176c64df87c164"}, + {file = "ujson-5.9.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:5ca35f484622fd208f55041b042d9d94f3b2c9c5add4e9af5ee9946d2d30db01"}, + {file = "ujson-5.9.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:829b824953ebad76d46e4ae709e940bb229e8999e40881338b3cc94c771b876c"}, + {file = "ujson-5.9.0-cp312-cp312-win32.whl", hash = "sha256:25fa46e4ff0a2deecbcf7100af3a5d70090b461906f2299506485ff31d9ec437"}, + {file = "ujson-5.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:60718f1720a61560618eff3b56fd517d107518d3c0160ca7a5a66ac949c6cf1c"}, + {file = "ujson-5.9.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d581db9db9e41d8ea0b2705c90518ba623cbdc74f8d644d7eb0d107be0d85d9c"}, + {file = "ujson-5.9.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ff741a5b4be2d08fceaab681c9d4bc89abf3c9db600ab435e20b9b6d4dfef12e"}, + {file = "ujson-5.9.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cdcb02cabcb1e44381221840a7af04433c1dc3297af76fde924a50c3054c708c"}, + {file = "ujson-5.9.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e208d3bf02c6963e6ef7324dadf1d73239fb7008491fdf523208f60be6437402"}, + {file = "ujson-5.9.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f4b3917296630a075e04d3d07601ce2a176479c23af838b6cf90a2d6b39b0d95"}, + {file = "ujson-5.9.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:0c4d6adb2c7bb9eb7c71ad6f6f612e13b264942e841f8cc3314a21a289a76c4e"}, + {file = "ujson-5.9.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:0b159efece9ab5c01f70b9d10bbb77241ce111a45bc8d21a44c219a2aec8ddfd"}, + {file = "ujson-5.9.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f0cb4a7814940ddd6619bdce6be637a4b37a8c4760de9373bac54bb7b229698b"}, + {file = "ujson-5.9.0-cp38-cp38-win32.whl", hash = 
"sha256:dc80f0f5abf33bd7099f7ac94ab1206730a3c0a2d17549911ed2cb6b7aa36d2d"}, + {file = "ujson-5.9.0-cp38-cp38-win_amd64.whl", hash = "sha256:506a45e5fcbb2d46f1a51fead991c39529fc3737c0f5d47c9b4a1d762578fc30"}, + {file = "ujson-5.9.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d0fd2eba664a22447102062814bd13e63c6130540222c0aa620701dd01f4be81"}, + {file = "ujson-5.9.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:bdf7fc21a03bafe4ba208dafa84ae38e04e5d36c0e1c746726edf5392e9f9f36"}, + {file = "ujson-5.9.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e2f909bc08ce01f122fd9c24bc6f9876aa087188dfaf3c4116fe6e4daf7e194f"}, + {file = "ujson-5.9.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd4ea86c2afd41429751d22a3ccd03311c067bd6aeee2d054f83f97e41e11d8f"}, + {file = "ujson-5.9.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:63fb2e6599d96fdffdb553af0ed3f76b85fda63281063f1cb5b1141a6fcd0617"}, + {file = "ujson-5.9.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:32bba5870c8fa2a97f4a68f6401038d3f1922e66c34280d710af00b14a3ca562"}, + {file = "ujson-5.9.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:37ef92e42535a81bf72179d0e252c9af42a4ed966dc6be6967ebfb929a87bc60"}, + {file = "ujson-5.9.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:f69f16b8f1c69da00e38dc5f2d08a86b0e781d0ad3e4cc6a13ea033a439c4844"}, + {file = "ujson-5.9.0-cp39-cp39-win32.whl", hash = "sha256:3382a3ce0ccc0558b1c1668950008cece9bf463ebb17463ebf6a8bfc060dae34"}, + {file = "ujson-5.9.0-cp39-cp39-win_amd64.whl", hash = "sha256:6adef377ed583477cf005b58c3025051b5faa6b8cc25876e594afbb772578f21"}, + {file = "ujson-5.9.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ffdfebd819f492e48e4f31c97cb593b9c1a8251933d8f8972e81697f00326ff1"}, + {file = "ujson-5.9.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:c4eec2ddc046360d087cf35659c7ba0cbd101f32035e19047013162274e71fcf"}, + {file = "ujson-5.9.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2fbb90aa5c23cb3d4b803c12aa220d26778c31b6e4b7a13a1f49971f6c7d088e"}, + {file = "ujson-5.9.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ba0823cb70866f0d6a4ad48d998dd338dce7314598721bc1b7986d054d782dfd"}, + {file = "ujson-5.9.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:4e35d7885ed612feb6b3dd1b7de28e89baaba4011ecdf995e88be9ac614765e9"}, + {file = "ujson-5.9.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:b048aa93eace8571eedbd67b3766623e7f0acbf08ee291bef7d8106210432427"}, + {file = "ujson-5.9.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:323279e68c195110ef85cbe5edce885219e3d4a48705448720ad925d88c9f851"}, + {file = "ujson-5.9.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9ac92d86ff34296f881e12aa955f7014d276895e0e4e868ba7fddebbde38e378"}, + {file = "ujson-5.9.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:6eecbd09b316cea1fd929b1e25f70382917542ab11b692cb46ec9b0a26c7427f"}, + {file = "ujson-5.9.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:473fb8dff1d58f49912323d7cb0859df5585cfc932e4b9c053bf8cf7f2d7c5c4"}, + {file = "ujson-5.9.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f91719c6abafe429c1a144cfe27883eace9fb1c09a9c5ef1bcb3ae80a3076a4e"}, + {file = "ujson-5.9.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7b1c0991c4fe256f5fdb19758f7eac7f47caac29a6c57d0de16a19048eb86bad"}, + {file = "ujson-5.9.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2a8ea0f55a1396708e564595aaa6696c0d8af532340f477162ff6927ecc46e21"}, + {file = 
"ujson-5.9.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:07e0cfdde5fd91f54cd2d7ffb3482c8ff1bf558abf32a8b953a5d169575ae1cd"}, + {file = "ujson-5.9.0.tar.gz", hash = "sha256:89cc92e73d5501b8a7f48575eeb14ad27156ad092c2e9fc7e3cf949f07e75532"}, ] [[package]] @@ -9114,7 +9589,7 @@ description = "Fast implementation of asyncio event loop on top of libuv" optional = false python-versions = ">=3.8.0" groups = ["vdb"] -markers = "sys_platform != \"win32\" and sys_platform != \"cygwin\" and platform_python_implementation != \"PyPy\"" +markers = "platform_python_implementation != \"PyPy\" and sys_platform != \"win32\" and sys_platform != \"cygwin\"" files = [ {file = "uvloop-0.21.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ec7e6b09a6fdded42403182ab6b832b71f4edaf7f37a9a0e371a01db5f0cb45f"}, {file = "uvloop-0.21.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:196274f2adb9689a289ad7d65700d37df0c0930fd8e4e743fa4834e850d7719d"}, @@ -9905,4 +10380,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = ">=3.11,<3.13" -content-hash = "adc577504435813e7e78b7433b9efb3dc6551f4eec2a65bc9aed762a8ef6540c" +content-hash = "7bdb4c26ad249bacd8149e8931f4cdc25d9d0cb319329b1e939e1b4f2c7f40b1" diff --git a/api/pyproject.toml b/api/pyproject.toml index 3355a550f..c6818c462 100644 --- a/api/pyproject.toml +++ b/api/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "dify-api" requires-python = ">=3.11,<3.13" -dynamic = [ "dependencies" ] +dynamic = ["dependencies"] [build-system] requires = ["poetry-core>=2.0.0"] @@ -52,6 +52,7 @@ openpyxl = "~3.1.5" opik = "~1.3.4" pandas = { version = "~2.2.2", extras = ["performance", "excel", "output-formatting"] } pandas-stubs = "~2.2.3.241009" +pandoc = "~2.4" psycogreen = "~1.0.2" psycopg2-binary = "~2.9.6" pycryptodome = "3.19.1" @@ -134,7 +135,8 @@ pymilvus = "~2.5.0" pymochow = "1.3.1" pyobvector = "~0.1.6" qdrant-client = "1.7.3" -tcvectordb = "1.3.2" +tablestore = "6.1.0" +tcvectordb = "~1.6.4" tidb-vector = 
"0.0.9" upstash-vector = "0.6.0" volcengine-compat = "~1.0.156" @@ -150,27 +152,47 @@ optional = true [tool.poetry.group.dev.dependencies] coverage = "~7.2.4" faker = "~32.1.0" -mypy = "~1.13.0" +lxml-stubs = "~0.5.1" +mypy = "~1.15.0" pytest = "~8.3.2" pytest-benchmark = "~4.0.0" pytest-env = "~1.1.3" pytest-mock = "~3.14.0" -types-beautifulsoup4 = "~4.12.0.20241020" -types-deprecated = "~1.2.15.20250304" -types-flask-cors = "~5.0.0.20240902" -types-flask-migrate = "~4.1.0.20250112" -types-html5lib = "~1.1.11.20241018" -types-openpyxl = "~3.1.5.20241225" -types-protobuf = "~5.29.1.20241207" -types-psutil = "~6.1.0.20241221" -types-psycopg2 = "~2.9.21.20250121" -types-python-dateutil = "~2.9.0.20241206" -types-pytz = "~2024.2.0.20241221" -types-pyyaml = "~6.0.12.20241230" -types-regex = "~2024.11.6.20241221" -types-requests = "~2.32.0.20241016" -types-six = "~1.17.0.20241205" -types-tqdm = "~4.67.0.20241221" +types-aiofiles = "~24.1.0" +types-beautifulsoup4 = "~4.12.0" +types-cachetools = "~5.5.0" +types-colorama = "~0.4.15" +types-defusedxml = "~0.7.0" +types-deprecated = "~1.2.15" +types-docutils = "~0.21.0" +types-flask-cors = "~5.0.0" +types-flask-migrate = "~4.1.0" +types-gevent = "~24.11.0" +types-greenlet = "~3.1.0" +types-html5lib = "~1.1.11" +types-markdown = "~3.7.0" +types-oauthlib = "~3.2.0" +types-objgraph = "~3.6.0" +types-olefile = "~0.47.0" +types-openpyxl = "~3.1.5" +types-pexpect = "~4.9.0" +types-protobuf = "~5.29.1" +types-psutil = "~7.0.0" +types-psycopg2 = "~2.9.21" +types-pygments = "~2.19.0" +types-pymysql = "~1.1.0" +types-python-dateutil = "~2.9.0" +types-pywin32 = "~310.0.0" +types-pyyaml = "~6.0.12" +types-regex = "~2024.11.6" +types-requests = "~2.32.0" +types-requests-oauthlib = "~2.0.0" +types-shapely = "~2.0.0" +types-simplejson = "~3.20.0" +types-six = "~1.17.0" +types-tensorflow = "~2.18.0" +types-tqdm = "~4.67.0" +types-ujson = "~5.10.0" ############################################################ # [ Lint ] dependency group diff 
--git a/api/schedule/mail_clean_document_notify_task.py b/api/schedule/mail_clean_document_notify_task.py index fe6839288..b3d0e0978 100644 --- a/api/schedule/mail_clean_document_notify_task.py +++ b/api/schedule/mail_clean_document_notify_task.py @@ -15,11 +15,11 @@ from services.feature_service import FeatureService @app.celery.task(queue="dataset") -def send_document_clean_notify_task(): +def mail_clean_document_notify_task(): """ Async Send document clean notify mail - Usage: send_document_clean_notify_task.delay() + Usage: mail_clean_document_notify_task.delay() """ if not mail.is_inited(): return diff --git a/api/services/account_service.py b/api/services/account_service.py index 47730298b..8329ef364 100644 --- a/api/services/account_service.py +++ b/api/services/account_service.py @@ -913,6 +913,8 @@ class RegisterService: db.session.commit() except WorkSpaceNotAllowedCreateError: db.session.rollback() + logging.exception("Register failed") + raise AccountRegisterError("Workspace is not allowed to create.") except AccountRegisterError as are: db.session.rollback() logging.exception("Register failed") diff --git a/api/services/app_dsl_service.py b/api/services/app_dsl_service.py index d33d277d4..2e2b72902 100644 --- a/api/services/app_dsl_service.py +++ b/api/services/app_dsl_service.py @@ -1,3 +1,5 @@ +import base64 +import hashlib import logging import uuid from collections.abc import Mapping @@ -7,6 +9,8 @@ from urllib.parse import urlparse from uuid import uuid4 import yaml # type: ignore +from Crypto.Cipher import AES +from Crypto.Util.Padding import pad, unpad from packaging import version from pydantic import BaseModel, Field from sqlalchemy import select @@ -478,6 +482,15 @@ class AppDslService: unique_hash = current_draft_workflow.unique_hash else: unique_hash = None + graph = workflow_data.get("graph", {}) + for node in graph.get("nodes", []): + if node.get("data", {}).get("type", "") == NodeType.KNOWLEDGE_RETRIEVAL.value: + dataset_ids = 
node["data"].get("dataset_ids", []) + node["data"]["dataset_ids"] = [ + decrypted_id + for dataset_id in dataset_ids + if (decrypted_id := self.decrypt_dataset_id(encrypted_data=dataset_id, tenant_id=app.tenant_id)) + ] workflow_service.sync_draft_workflow( app_model=app, graph=workflow_data.get("graph", {}), @@ -513,6 +526,7 @@ class AppDslService: """ Export app :param app_model: App instance + :param include_secret: Whether include secret variable :return: """ app_mode = AppMode.value_of(app_model.mode) @@ -551,7 +565,15 @@ class AppDslService: if not workflow: raise ValueError("Missing draft workflow configuration, please check.") - export_data["workflow"] = workflow.to_dict(include_secret=include_secret) + workflow_dict = workflow.to_dict(include_secret=include_secret) + for node in workflow_dict.get("graph", {}).get("nodes", []): + if node.get("data", {}).get("type", "") == NodeType.KNOWLEDGE_RETRIEVAL.value: + dataset_ids = node["data"].get("dataset_ids", []) + node["data"]["dataset_ids"] = [ + cls.encrypt_dataset_id(dataset_id=dataset_id, tenant_id=app_model.tenant_id) + for dataset_id in dataset_ids + ] + export_data["workflow"] = workflow_dict dependencies = cls._extract_dependencies_from_workflow(workflow) export_data["dependencies"] = [ jsonable_encoder(d.model_dump()) @@ -723,3 +745,29 @@ class AppDslService: return [] return DependenciesAnalysisService.get_leaked_dependencies(tenant_id=tenant_id, dependencies=dependencies) + + @staticmethod + def _generate_aes_key(tenant_id: str) -> bytes: + """Generate AES key based on tenant_id""" + return hashlib.sha256(tenant_id.encode()).digest() + + @classmethod + def encrypt_dataset_id(cls, dataset_id: str, tenant_id: str) -> str: + """Encrypt dataset_id using AES-CBC mode""" + key = cls._generate_aes_key(tenant_id) + iv = key[:16] + cipher = AES.new(key, AES.MODE_CBC, iv) + ct_bytes = cipher.encrypt(pad(dataset_id.encode(), AES.block_size)) + return base64.b64encode(ct_bytes).decode() + + @classmethod + def 
decrypt_dataset_id(cls, encrypted_data: str, tenant_id: str) -> str | None: + """AES decryption""" + try: + key = cls._generate_aes_key(tenant_id) + iv = key[:16] + cipher = AES.new(key, AES.MODE_CBC, iv) + pt = unpad(cipher.decrypt(base64.b64decode(encrypted_data)), AES.block_size) + return pt.decode() + except Exception: + return None diff --git a/api/services/audio_service.py b/api/services/audio_service.py index 294dfe4c8..a259f5a4c 100644 --- a/api/services/audio_service.py +++ b/api/services/audio_service.py @@ -5,6 +5,7 @@ from typing import Optional from werkzeug.datastructures import FileStorage +from constants import AUDIO_EXTENSIONS from core.model_manager import ModelManager from core.model_runtime.entities.model_entities import ModelType from models.model import App, AppMode, AppModelConfig, Message @@ -18,7 +19,6 @@ from services.errors.audio import ( FILE_SIZE = 30 FILE_SIZE_LIMIT = FILE_SIZE * 1024 * 1024 -ALLOWED_EXTENSIONS = ["mp3", "mp4", "mpeg", "mpga", "m4a", "wav", "webm", "amr"] logger = logging.getLogger(__name__) @@ -44,7 +44,7 @@ class AudioService: raise NoAudioUploadedServiceError() extension = file.mimetype - if extension not in [f"audio/{ext}" for ext in ALLOWED_EXTENSIONS]: + if extension not in [f"audio/{ext}" for ext in AUDIO_EXTENSIONS]: raise UnsupportedAudioTypeServiceError() file_content = file.read() diff --git a/api/services/auth/api_key_auth_factory.py b/api/services/auth/api_key_auth_factory.py index f91c448fb..7ae31b076 100644 --- a/api/services/auth/api_key_auth_factory.py +++ b/api/services/auth/api_key_auth_factory.py @@ -17,6 +17,10 @@ class ApiKeyAuthFactory: from services.auth.firecrawl.firecrawl import FirecrawlAuth return FirecrawlAuth + case AuthType.WATERCRAWL: + from services.auth.watercrawl.watercrawl import WatercrawlAuth + + return WatercrawlAuth case AuthType.JINA: from services.auth.jina.jina import JinaAuth diff --git a/api/services/auth/auth_type.py b/api/services/auth/auth_type.py index 
2e1946841..ec7118df2 100644 --- a/api/services/auth/auth_type.py +++ b/api/services/auth/auth_type.py @@ -3,4 +3,5 @@ from enum import StrEnum class AuthType(StrEnum): FIRECRAWL = "firecrawl" + WATERCRAWL = "watercrawl" JINA = "jinareader" diff --git a/api/services/auth/watercrawl/__init__.py b/api/services/auth/watercrawl/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/api/services/auth/watercrawl/watercrawl.py b/api/services/auth/watercrawl/watercrawl.py new file mode 100644 index 000000000..153ab5ba7 --- /dev/null +++ b/api/services/auth/watercrawl/watercrawl.py @@ -0,0 +1,44 @@ +import json +from urllib.parse import urljoin + +import requests + +from services.auth.api_key_auth_base import ApiKeyAuthBase + + +class WatercrawlAuth(ApiKeyAuthBase): + def __init__(self, credentials: dict): + super().__init__(credentials) + auth_type = credentials.get("auth_type") + if auth_type != "x-api-key": + raise ValueError("Invalid auth type, WaterCrawl auth type must be x-api-key") + self.api_key = credentials.get("config", {}).get("api_key", None) + self.base_url = credentials.get("config", {}).get("base_url", "https://app.watercrawl.dev") + + if not self.api_key: + raise ValueError("No API key provided") + + def validate_credentials(self): + headers = self._prepare_headers() + url = urljoin(self.base_url, "/api/v1/core/crawl-requests/") + response = self._get_request(url, headers) + if response.status_code == 200: + return True + else: + self._handle_error(response) + + def _prepare_headers(self): + return {"Content-Type": "application/json", "X-API-KEY": self.api_key} + + def _get_request(self, url, headers): + return requests.get(url, headers=headers) + + def _handle_error(self, response): + if response.status_code in {402, 409, 500}: + error_message = response.json().get("error", "Unknown error occurred") + raise Exception(f"Failed to authorize. Status code: {response.status_code}. 
Error: {error_message}") + else: + if response.text: + error_message = json.loads(response.text).get("error", "Unknown error occurred") + raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}") + raise Exception(f"Unexpected error occurred while trying to authorize. Status code: {response.status_code}") diff --git a/api/services/billing_service.py b/api/services/billing_service.py index ab68aad45..d44483ad8 100644 --- a/api/services/billing_service.py +++ b/api/services/billing_service.py @@ -6,7 +6,7 @@ from tenacity import retry, retry_if_exception_type, stop_before_delay, wait_fix from extensions.ext_database import db from libs.helper import RateLimiter -from models.account import TenantAccountJoin, TenantAccountRole +from models.account import Account, TenantAccountJoin, TenantAccountRole class BillingService: @@ -106,6 +106,48 @@ class BillingService: json = {"email": email, "feedback": feedback} return cls._send_request("POST", "/account/delete-feedback", json=json) + class EducationIdentity: + verification_rate_limit = RateLimiter(prefix="edu_verification_rate_limit", max_attempts=10, time_window=60) + activation_rate_limit = RateLimiter(prefix="edu_activation_rate_limit", max_attempts=10, time_window=60) + + @classmethod + def verify(cls, account_id: str, account_email: str): + if cls.verification_rate_limit.is_rate_limited(account_email): + from controllers.console.error import EducationVerifyLimitError + + raise EducationVerifyLimitError() + + cls.verification_rate_limit.increment_rate_limit(account_email) + + params = {"account_id": account_id} + return BillingService._send_request("GET", "/education/verify", params=params) + + @classmethod + def is_active(cls, account_id: str): + params = {"account_id": account_id} + return BillingService._send_request("GET", "/education/status", params=params) + + @classmethod + def activate(cls, account: Account, token: str, institution: str, role: str): + if 
cls.activation_rate_limit.is_rate_limited(account.email): + from controllers.console.error import EducationActivateLimitError + + raise EducationActivateLimitError() + + cls.activation_rate_limit.increment_rate_limit(account.email) + params = {"account_id": account.id, "curr_tenant_id": account.current_tenant_id} + json = { + "institution": institution, + "token": token, + "role": role, + } + return BillingService._send_request("POST", "/education/", json=json, params=params) + + @classmethod + def autocomplete(cls, keywords: str, page: int = 0, limit: int = 20): + params = {"keywords": keywords, "page": page, "limit": limit} + return BillingService._send_request("GET", "/education/autocomplete", params=params) + @classmethod def get_compliance_download_link( cls, diff --git a/api/services/dataset_service.py b/api/services/dataset_service.py index d3654a3d4..b019cf6b6 100644 --- a/api/services/dataset_service.py +++ b/api/services/dataset_service.py @@ -880,6 +880,9 @@ class DocumentService: website_info = knowledge_config.data_source.info_list.website_info_list count = len(website_info.urls) # type: ignore batch_upload_limit = int(dify_config.BATCH_UPLOAD_LIMIT) + + if features.billing.subscription.plan == "sandbox" and count > 1: + raise ValueError("Your current plan does not support batch upload, please upgrade your plan.") if count > batch_upload_limit: raise ValueError(f"You have reached the batch upload limit of {batch_upload_limit}.") @@ -1328,6 +1331,8 @@ class DocumentService: website_info = knowledge_config.data_source.info_list.website_info_list # type: ignore if website_info: count = len(website_info.urls) + if features.billing.subscription.plan == "sandbox" and count > 1: + raise ValueError("Your current plan does not support batch upload, please upgrade your plan.") batch_upload_limit = int(dify_config.BATCH_UPLOAD_LIMIT) if count > batch_upload_limit: raise ValueError(f"You have reached the batch upload limit of {batch_upload_limit}.") @@ -1663,6 
+1668,7 @@ class SegmentService: content=content, word_count=len(content), tokens=tokens, + keywords=segment_item.get("keywords", []), status="completed", indexing_at=datetime.datetime.now(datetime.UTC).replace(tzinfo=None), completed_at=datetime.datetime.now(datetime.UTC).replace(tzinfo=None), @@ -1780,12 +1786,8 @@ class SegmentService: ) elif document.doc_form in (IndexType.PARAGRAPH_INDEX, IndexType.QA_INDEX): if args.enabled or keyword_changed: - VectorService.create_segments_vector( - [args.keywords] if args.keywords else None, - [segment], - dataset, - document.doc_form, - ) + # update segment vector index + VectorService.update_segment_vector(args.keywords, segment, dataset) else: segment_hash = helper.generate_text_hash(content) tokens = 0 @@ -2140,6 +2142,88 @@ class SegmentService: query = query.where(ChildChunk.content.ilike(f"%{keyword}%")) return query.paginate(page=page, per_page=limit, max_per_page=100, error_out=False) + @classmethod + def get_child_chunk_by_id(cls, child_chunk_id: str, tenant_id: str) -> Optional[ChildChunk]: + """Get a child chunk by its ID.""" + result = ChildChunk.query.filter(ChildChunk.id == child_chunk_id, ChildChunk.tenant_id == tenant_id).first() + return result if isinstance(result, ChildChunk) else None + + @classmethod + def get_segments( + cls, document_id: str, tenant_id: str, status_list: list[str] | None = None, keyword: str | None = None + ): + """Get segments for a document with optional filtering.""" + query = DocumentSegment.query.filter( + DocumentSegment.document_id == document_id, DocumentSegment.tenant_id == tenant_id + ) + + if status_list: + query = query.filter(DocumentSegment.status.in_(status_list)) + + if keyword: + query = query.filter(DocumentSegment.content.ilike(f"%{keyword}%")) + + segments = query.order_by(DocumentSegment.position.asc()).all() + total = len(segments) + + return segments, total + + @classmethod + def update_segment_by_id( + cls, tenant_id: str, dataset_id: str, document_id: str, 
segment_id: str, segment_data: dict, user_id: str + ) -> tuple[DocumentSegment, Document]: + """Update a segment by its ID with validation and checks.""" + # check dataset + dataset = db.session.query(Dataset).filter(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id).first() + if not dataset: + raise NotFound("Dataset not found.") + + # check user's model setting + DatasetService.check_dataset_model_setting(dataset) + + # check document + document = DocumentService.get_document(dataset_id, document_id) + if not document: + raise NotFound("Document not found.") + + # check embedding model setting if high quality + if dataset.indexing_technique == "high_quality": + try: + model_manager = ModelManager() + model_manager.get_model_instance( + tenant_id=user_id, + provider=dataset.embedding_model_provider, + model_type=ModelType.TEXT_EMBEDDING, + model=dataset.embedding_model, + ) + except LLMBadRequestError: + raise ValueError( + "No Embedding Model available. Please configure a valid provider in the Settings -> Model Provider." 
+ ) + except ProviderTokenNotInitError as ex: + raise ValueError(ex.description) + + # check segment + segment = DocumentSegment.query.filter( + DocumentSegment.id == segment_id, DocumentSegment.tenant_id == user_id + ).first() + if not segment: + raise NotFound("Segment not found.") + + # validate and update segment + cls.segment_create_args_validate(segment_data, document) + updated_segment = cls.update_segment(SegmentUpdateArgs(**segment_data), segment, document, dataset) + + return updated_segment, document + + @classmethod + def get_segment_by_id(cls, segment_id: str, tenant_id: str) -> Optional[DocumentSegment]: + """Get a segment by its ID.""" + result = DocumentSegment.query.filter( + DocumentSegment.id == segment_id, DocumentSegment.tenant_id == tenant_id + ).first() + return result if isinstance(result, DocumentSegment) else None + class DatasetCollectionBindingService: @classmethod diff --git a/api/services/entities/knowledge_entities/knowledge_entities.py b/api/services/entities/knowledge_entities/knowledge_entities.py index 51ce596e5..bb3be61f8 100644 --- a/api/services/entities/knowledge_entities/knowledge_entities.py +++ b/api/services/entities/knowledge_entities/knowledge_entities.py @@ -1,4 +1,4 @@ -from enum import Enum +from enum import StrEnum from typing import Literal, Optional from pydantic import BaseModel @@ -11,7 +11,7 @@ class SegmentUpdateEntity(BaseModel): enabled: Optional[bool] = None -class ParentMode(str, Enum): +class ParentMode(StrEnum): FULL_DOC = "full-doc" PARAGRAPH = "paragraph" @@ -95,7 +95,7 @@ class WeightKeywordSetting(BaseModel): class WeightModel(BaseModel): - weight_type: str + weight_type: Optional[str] = None vector_setting: Optional[WeightVectorSetting] = None keyword_setting: Optional[WeightKeywordSetting] = None diff --git a/api/services/feature_service.py b/api/services/feature_service.py index 905c36d6e..a73baddb9 100644 --- a/api/services/feature_service.py +++ b/api/services/feature_service.py @@ -19,6 +19,11 
@@ class BillingModel(BaseModel): subscription: SubscriptionModel = SubscriptionModel() +class EducationModel(BaseModel): + enabled: bool = False + activated: bool = False + + class LimitationModel(BaseModel): size: int = 0 limit: int = 0 @@ -40,6 +45,7 @@ class LicenseModel(BaseModel): class FeatureModel(BaseModel): billing: BillingModel = BillingModel() + education: EducationModel = EducationModel() members: LimitationModel = LimitationModel(size=0, limit=1) apps: LimitationModel = LimitationModel(size=0, limit=10) vector_space: LimitationModel = LimitationModel(size=0, limit=5) @@ -144,6 +150,7 @@ class FeatureService: features.can_replace_logo = dify_config.CAN_REPLACE_LOGO features.model_load_balancing_enabled = dify_config.MODEL_LB_ENABLED features.dataset_operator_enabled = dify_config.DATASET_OPERATOR_ENABLED + features.education.enabled = dify_config.EDUCATION_ENABLED @classmethod def _fulfill_params_from_billing_api(cls, features: FeatureModel, tenant_id: str): @@ -152,6 +159,7 @@ class FeatureService: features.billing.enabled = billing_info["enabled"] features.billing.subscription.plan = billing_info["subscription"]["plan"] features.billing.subscription.interval = billing_info["subscription"]["interval"] + features.education.activated = billing_info["subscription"].get("education", False) if "members" in billing_info: features.members.size = billing_info["members"]["size"] diff --git a/api/services/file_service.py b/api/services/file_service.py index d417e8173..284e96c97 100644 --- a/api/services/file_service.py +++ b/api/services/file_service.py @@ -1,5 +1,6 @@ import datetime import hashlib +import os import uuid from typing import Any, Literal, Union @@ -38,7 +39,12 @@ class FileService: source_url: str = "", ) -> UploadFile: # get file extension - extension = filename.split(".")[-1].lower() + extension = os.path.splitext(filename)[1].lstrip(".").lower() + + # check if filename contains invalid characters + if any(c in filename for c in ["/", "\\", 
":", "*", "?", '"', "<", ">", "|"]): + raise ValueError("Filename contains invalid characters") + if len(filename) > 200: filename = filename.split(".")[0][:200] + "." + extension diff --git a/api/services/hit_testing_service.py b/api/services/hit_testing_service.py index f8c1c1d29..0b98065f5 100644 --- a/api/services/hit_testing_service.py +++ b/api/services/hit_testing_service.py @@ -29,15 +29,6 @@ class HitTestingService: external_retrieval_model: dict, limit: int = 10, ) -> dict: - if dataset.available_document_count == 0 or dataset.available_segment_count == 0: - return { - "query": { - "content": query, - "tsne_position": {"x": 0, "y": 0}, - }, - "records": [], - } - start = time.perf_counter() # get retrieval model , if the model is not setting , using default diff --git a/api/services/plugin/data_migration.py b/api/services/plugin/data_migration.py index 7228a1663..597585588 100644 --- a/api/services/plugin/data_migration.py +++ b/api/services/plugin/data_migration.py @@ -127,18 +127,32 @@ limit 1000""" processed_count = 0 failed_ids = [] + last_id = "00000000-0000-0000-0000-000000000000" + while True: - sql = f"""select id, {provider_column_name} as provider_name from {table_name} -where {provider_column_name} not like '%/%' and {provider_column_name} is not null and {provider_column_name} != '' -limit 1000""" + sql = f""" + SELECT id, {provider_column_name} AS provider_name + FROM {table_name} + WHERE {provider_column_name} NOT LIKE '%/%' + AND {provider_column_name} IS NOT NULL + AND {provider_column_name} != '' + AND id > :last_id + ORDER BY id ASC + LIMIT 5000 + """ + params = {"last_id": last_id or ""} + with db.engine.begin() as conn: - rs = conn.execute(db.text(sql)) + rs = conn.execute(db.text(sql), params) current_iter_count = 0 + batch_updates = [] + for i in rs: current_iter_count += 1 processed_count += 1 record_id = str(i.id) + last_id = record_id provider_name = str(i.provider_name) if record_id in failed_ids: @@ -152,19 +166,9 @@ limit 
1000""" ) try: - # update provider name append with "langgenius/{provider_name}/{provider_name}" - sql = f"""update {table_name} - set {provider_column_name} = - concat('{DEFAULT_PLUGIN_ID}/', {provider_column_name}, '/', {provider_column_name}) - where id = :record_id""" - conn.execute(db.text(sql), {"record_id": record_id}) - click.echo( - click.style( - f"[{processed_count}] Migrated [{table_name}] {record_id} ({provider_name})", - fg="green", - ) - ) - except Exception: + updated_value = f"{DEFAULT_PLUGIN_ID}/{provider_name}/{provider_name}" + batch_updates.append((updated_value, record_id)) + except Exception as e: failed_ids.append(record_id) click.echo( click.style( @@ -177,6 +181,20 @@ limit 1000""" ) continue + if batch_updates: + update_sql = f""" + UPDATE {table_name} + SET {provider_column_name} = :updated_value + WHERE id = :record_id + """ + conn.execute(db.text(update_sql), [{"updated_value": u, "record_id": r} for u, r in batch_updates]) + click.echo( + click.style( + f"[{processed_count}] Batch migrated [{len(batch_updates)}] records from [{table_name}]", + fg="green", + ) + ) + if not current_iter_count: break diff --git a/api/services/tag_service.py b/api/services/tag_service.py index 8cc903bde..1fbaee96e 100644 --- a/api/services/tag_service.py +++ b/api/services/tag_service.py @@ -20,7 +20,7 @@ class TagService: ) if keyword: query = query.filter(db.and_(Tag.name.ilike(f"%{keyword}%"))) - query = query.group_by(Tag.id, Tag.type, Tag.name) + query = query.group_by(Tag.id, Tag.type, Tag.name, Tag.created_at) results: list = query.order_by(Tag.created_at.desc()).all() return results diff --git a/api/services/tools/builtin_tools_manage_service.py b/api/services/tools/builtin_tools_manage_service.py index 51b56ab58..075c60842 100644 --- a/api/services/tools/builtin_tools_manage_service.py +++ b/api/services/tools/builtin_tools_manage_service.py @@ -28,7 +28,6 @@ class BuiltinToolManageService: """ list builtin tool provider tools - :param user_id: 
the id of the user :param tenant_id: the id of the tenant :param provider: the name of the provider diff --git a/api/services/tools/tools_transform_service.py b/api/services/tools/tools_transform_service.py index d44151bef..367121125 100644 --- a/api/services/tools/tools_transform_service.py +++ b/api/services/tools/tools_transform_service.py @@ -60,6 +60,7 @@ class ToolTransformService: """ repack provider + :param tenant_id: the tenant id :param provider: the provider dict """ if isinstance(provider, dict) and "icon" in provider: diff --git a/api/services/tools/workflow_tools_manage_service.py b/api/services/tools/workflow_tools_manage_service.py index e486ed7b8..c6b205557 100644 --- a/api/services/tools/workflow_tools_manage_service.py +++ b/api/services/tools/workflow_tools_manage_service.py @@ -222,7 +222,7 @@ class WorkflowToolManageService: Delete a workflow tool. :param user_id: the user id :param tenant_id: the tenant id - :param workflow_app_id: the workflow app id + :param workflow_tool_id: the workflow tool id """ db.session.query(WorkflowToolProvider).filter( WorkflowToolProvider.tenant_id == tenant_id, WorkflowToolProvider.id == workflow_tool_id @@ -238,7 +238,7 @@ class WorkflowToolManageService: Get a workflow tool. :param user_id: the user id :param tenant_id: the tenant id - :param workflow_app_id: the workflow app id + :param workflow_tool_id: the workflow tool id :return: the tool """ db_tool: WorkflowToolProvider | None = ( @@ -313,7 +313,7 @@ class WorkflowToolManageService: List workflow tool provider tools. 
:param user_id: the user id :param tenant_id: the tenant id - :param workflow_app_id: the workflow app id + :param workflow_tool_id: the workflow tool id :return: the list of tools """ db_tool: WorkflowToolProvider | None = ( diff --git a/api/services/website_service.py b/api/services/website_service.py index 85d32c9e8..460a637a4 100644 --- a/api/services/website_service.py +++ b/api/services/website_service.py @@ -7,6 +7,7 @@ from flask_login import current_user # type: ignore from core.helper import encrypter from core.rag.extractor.firecrawl.firecrawl_app import FirecrawlApp +from core.rag.extractor.watercrawl.provider import WaterCrawlProvider from extensions.ext_redis import redis_client from extensions.ext_storage import storage from services.auth.api_key_auth_service import ApiKeyAuthService @@ -59,6 +60,13 @@ class WebsiteService: time = str(datetime.datetime.now().timestamp()) redis_client.setex(website_crawl_time_cache_key, 3600, time) return {"status": "active", "job_id": job_id} + elif provider == "watercrawl": + # decrypt api_key + api_key = encrypter.decrypt_token( + tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key") + ) + return WaterCrawlProvider(api_key, credentials.get("config").get("base_url", None)).crawl_url(url, options) + elif provider == "jinareader": api_key = encrypter.decrypt_token( tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key") @@ -116,6 +124,14 @@ class WebsiteService: time_consuming = abs(end_time - float(start_time)) crawl_status_data["time_consuming"] = f"{time_consuming:.2f}" redis_client.delete(website_crawl_time_cache_key) + elif provider == "watercrawl": + # decrypt api_key + api_key = encrypter.decrypt_token( + tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key") + ) + crawl_status_data = WaterCrawlProvider( + api_key, credentials.get("config").get("base_url", None) + ).get_crawl_status(job_id) elif provider == 
"jinareader": api_key = encrypter.decrypt_token( tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key") @@ -180,6 +196,11 @@ class WebsiteService: if item.get("source_url") == url: return dict(item) return None + elif provider == "watercrawl": + api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key")) + return WaterCrawlProvider(api_key, credentials.get("config").get("base_url", None)).get_crawl_url_data( + job_id, url + ) elif provider == "jinareader": if not job_id: response = requests.get( @@ -223,5 +244,8 @@ class WebsiteService: params = {"onlyMainContent": only_main_content} result = firecrawl_app.scrape_url(url, params) return result + elif provider == "watercrawl": + api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key")) + return WaterCrawlProvider(api_key, credentials.get("config").get("base_url", None)).scrape_url(url) else: raise ValueError("Invalid provider") diff --git a/api/tasks/add_document_to_index_task.py b/api/tasks/add_document_to_index_task.py index c5a5ddaad..0b7d2ad31 100644 --- a/api/tasks/add_document_to_index_task.py +++ b/api/tasks/add_document_to_index_task.py @@ -4,7 +4,6 @@ import time import click from celery import shared_task # type: ignore -from werkzeug.exceptions import NotFound from core.rag.index_processor.constant.index_type import IndexType from core.rag.index_processor.index_processor_factory import IndexProcessorFactory @@ -28,7 +27,9 @@ def add_document_to_index_task(dataset_document_id: str): dataset_document = db.session.query(DatasetDocument).filter(DatasetDocument.id == dataset_document_id).first() if not dataset_document: - raise NotFound("Document not found") + logging.info(click.style("Document not found: {}".format(dataset_document_id), fg="red")) + db.session.close() + return if dataset_document.indexing_status != "completed": return @@ -59,7 +60,7 @@ def 
add_document_to_index_task(dataset_document_id: str): }, ) if dataset_document.doc_form == IndexType.PARENT_CHILD_INDEX: - child_chunks = segment.child_chunks + child_chunks = segment.get_child_chunks() if child_chunks: child_documents = [] for child_chunk in child_chunks: diff --git a/api/tasks/annotation/add_annotation_to_index_task.py b/api/tasks/annotation/add_annotation_to_index_task.py index aab21a441..2a93c21ab 100644 --- a/api/tasks/annotation/add_annotation_to_index_task.py +++ b/api/tasks/annotation/add_annotation_to_index_task.py @@ -6,6 +6,7 @@ from celery import shared_task # type: ignore from core.rag.datasource.vdb.vector_factory import Vector from core.rag.models.document import Document +from extensions.ext_database import db from models.dataset import Dataset from services.dataset_service import DatasetCollectionBindingService @@ -55,3 +56,5 @@ def add_annotation_to_index_task( ) except Exception: logging.exception("Build index for annotation failed") + finally: + db.session.close() diff --git a/api/tasks/annotation/batch_import_annotations_task.py b/api/tasks/annotation/batch_import_annotations_task.py index 06162b02d..6144a4fe3 100644 --- a/api/tasks/annotation/batch_import_annotations_task.py +++ b/api/tasks/annotation/batch_import_annotations_task.py @@ -88,3 +88,5 @@ def batch_import_annotations_task(job_id: str, content_list: list[dict], app_id: indexing_error_msg_key = "app_annotation_batch_import_error_msg_{}".format(str(job_id)) redis_client.setex(indexing_error_msg_key, 600, str(e)) logging.exception("Build index for batch import annotations failed") + finally: + db.session.close() diff --git a/api/tasks/annotation/delete_annotation_index_task.py b/api/tasks/annotation/delete_annotation_index_task.py index a6a598ce4..a6657e813 100644 --- a/api/tasks/annotation/delete_annotation_index_task.py +++ b/api/tasks/annotation/delete_annotation_index_task.py @@ -5,6 +5,7 @@ import click from celery import shared_task # type: ignore from 
core.rag.datasource.vdb.vector_factory import Vector +from extensions.ext_database import db from models.dataset import Dataset from services.dataset_service import DatasetCollectionBindingService @@ -39,3 +40,5 @@ def delete_annotation_index_task(annotation_id: str, app_id: str, tenant_id: str ) except Exception as e: logging.exception("Annotation deleted index failed") + finally: + db.session.close() diff --git a/api/tasks/annotation/disable_annotation_reply_task.py b/api/tasks/annotation/disable_annotation_reply_task.py index 26bf1c7c9..747fce578 100644 --- a/api/tasks/annotation/disable_annotation_reply_task.py +++ b/api/tasks/annotation/disable_annotation_reply_task.py @@ -3,7 +3,6 @@ import time import click from celery import shared_task # type: ignore -from werkzeug.exceptions import NotFound from core.rag.datasource.vdb.vector_factory import Vector from extensions.ext_database import db @@ -23,14 +22,18 @@ def disable_annotation_reply_task(job_id: str, app_id: str, tenant_id: str): app = db.session.query(App).filter(App.id == app_id, App.tenant_id == tenant_id, App.status == "normal").first() annotations_count = db.session.query(MessageAnnotation).filter(MessageAnnotation.app_id == app_id).count() if not app: - raise NotFound("App not found") + logging.info(click.style("App not found: {}".format(app_id), fg="red")) + db.session.close() + return app_annotation_setting = ( db.session.query(AppAnnotationSetting).filter(AppAnnotationSetting.app_id == app_id).first() ) if not app_annotation_setting: - raise NotFound("App annotation setting not found") + logging.info(click.style("App annotation setting not found: {}".format(app_id), fg="red")) + db.session.close() + return disable_app_annotation_key = "disable_app_annotation_{}".format(str(app_id)) disable_app_annotation_job_key = "disable_app_annotation_job_{}".format(str(job_id)) @@ -46,7 +49,7 @@ def disable_annotation_reply_task(job_id: str, app_id: str, tenant_id: str): try: if annotations_count > 0: vector 
= Vector(dataset, attributes=["doc_id", "annotation_id", "app_id"]) - vector.delete_by_metadata_field("app_id", app_id) + vector.delete() except Exception: logging.exception("Delete annotation index failed when annotation deleted.") redis_client.setex(disable_app_annotation_job_key, 600, "completed") @@ -66,3 +69,4 @@ def disable_annotation_reply_task(job_id: str, app_id: str, tenant_id: str): redis_client.setex(disable_app_annotation_error_key, 600, str(e)) finally: redis_client.delete(disable_app_annotation_key) + db.session.close() diff --git a/api/tasks/annotation/enable_annotation_reply_task.py b/api/tasks/annotation/enable_annotation_reply_task.py index b42af0c7f..c04f1be84 100644 --- a/api/tasks/annotation/enable_annotation_reply_task.py +++ b/api/tasks/annotation/enable_annotation_reply_task.py @@ -4,7 +4,6 @@ import time import click from celery import shared_task # type: ignore -from werkzeug.exceptions import NotFound from core.rag.datasource.vdb.vector_factory import Vector from core.rag.models.document import Document @@ -34,7 +33,9 @@ def enable_annotation_reply_task( app = db.session.query(App).filter(App.id == app_id, App.tenant_id == tenant_id, App.status == "normal").first() if not app: - raise NotFound("App not found") + logging.info(click.style("App not found: {}".format(app_id), fg="red")) + db.session.close() + return annotations = db.session.query(MessageAnnotation).filter(MessageAnnotation.app_id == app_id).all() enable_app_annotation_key = "enable_app_annotation_{}".format(str(app_id)) @@ -49,6 +50,27 @@ def enable_annotation_reply_task( db.session.query(AppAnnotationSetting).filter(AppAnnotationSetting.app_id == app_id).first() ) if annotation_setting: + if dataset_collection_binding.id != annotation_setting.collection_binding_id: + old_dataset_collection_binding = ( + DatasetCollectionBindingService.get_dataset_collection_binding_by_id_and_type( + annotation_setting.collection_binding_id, "annotation" + ) + ) + if 
old_dataset_collection_binding and annotations: + old_dataset = Dataset( + id=app_id, + tenant_id=tenant_id, + indexing_technique="high_quality", + embedding_model_provider=old_dataset_collection_binding.provider_name, + embedding_model=old_dataset_collection_binding.model_name, + collection_binding_id=old_dataset_collection_binding.id, + ) + + old_vector = Vector(old_dataset, attributes=["doc_id", "annotation_id", "app_id"]) + try: + old_vector.delete() + except Exception as e: + logging.info(click.style("Delete annotation index error: {}".format(str(e)), fg="red")) annotation_setting.score_threshold = score_threshold annotation_setting.collection_binding_id = dataset_collection_binding.id annotation_setting.updated_user_id = user_id @@ -100,3 +122,4 @@ def enable_annotation_reply_task( db.session.rollback() finally: redis_client.delete(enable_app_annotation_key) + db.session.close() diff --git a/api/tasks/annotation/update_annotation_to_index_task.py b/api/tasks/annotation/update_annotation_to_index_task.py index 8c675feaa..596ba829a 100644 --- a/api/tasks/annotation/update_annotation_to_index_task.py +++ b/api/tasks/annotation/update_annotation_to_index_task.py @@ -6,6 +6,7 @@ from celery import shared_task # type: ignore from core.rag.datasource.vdb.vector_factory import Vector from core.rag.models.document import Document +from extensions.ext_database import db from models.dataset import Dataset from services.dataset_service import DatasetCollectionBindingService @@ -56,3 +57,5 @@ def update_annotation_to_index_task( ) except Exception: logging.exception("Build index for annotation failed") + finally: + db.session.close() diff --git a/api/tasks/batch_clean_document_task.py b/api/tasks/batch_clean_document_task.py index 8376ab1b0..97efc47b3 100644 --- a/api/tasks/batch_clean_document_task.py +++ b/api/tasks/batch_clean_document_task.py @@ -74,3 +74,5 @@ def batch_clean_document_task(document_ids: list[str], dataset_id: str, doc_form ) except Exception: 
logging.exception("Cleaned documents when documents deleted failed") + finally: + db.session.close() diff --git a/api/tasks/batch_create_segment_to_index_task.py b/api/tasks/batch_create_segment_to_index_task.py index 648f92b0f..f32bc4f18 100644 --- a/api/tasks/batch_create_segment_to_index_task.py +++ b/api/tasks/batch_create_segment_to_index_task.py @@ -127,3 +127,5 @@ def batch_create_segment_to_index_task( except Exception: logging.exception("Segments batch created index failed") redis_client.setex(indexing_cache_key, 600, "error") + finally: + db.session.close() diff --git a/api/tasks/clean_dataset_task.py b/api/tasks/clean_dataset_task.py index 4d77f1fb6..6bac71839 100644 --- a/api/tasks/clean_dataset_task.py +++ b/api/tasks/clean_dataset_task.py @@ -11,6 +11,8 @@ from extensions.ext_storage import storage from models.dataset import ( AppDatasetJoin, Dataset, + DatasetMetadata, + DatasetMetadataBinding, DatasetProcessRule, DatasetQuery, Document, @@ -86,7 +88,9 @@ def clean_dataset_task( db.session.query(DatasetProcessRule).filter(DatasetProcessRule.dataset_id == dataset_id).delete() db.session.query(DatasetQuery).filter(DatasetQuery.dataset_id == dataset_id).delete() db.session.query(AppDatasetJoin).filter(AppDatasetJoin.dataset_id == dataset_id).delete() - + # delete dataset metadata + db.session.query(DatasetMetadata).filter(DatasetMetadata.dataset_id == dataset_id).delete() + db.session.query(DatasetMetadataBinding).filter(DatasetMetadataBinding.dataset_id == dataset_id).delete() # delete files if documents: for document in documents: @@ -117,3 +121,5 @@ def clean_dataset_task( ) except Exception: logging.exception("Cleaned dataset when dataset deleted failed") + finally: + db.session.close() diff --git a/api/tasks/clean_document_task.py b/api/tasks/clean_document_task.py index 5a4d7a52b..5824121e8 100644 --- a/api/tasks/clean_document_task.py +++ b/api/tasks/clean_document_task.py @@ -9,7 +9,7 @@ from core.rag.index_processor.index_processor_factory 
import IndexProcessorFacto from core.tools.utils.rag_web_reader import get_image_upload_file_ids from extensions.ext_database import db from extensions.ext_storage import storage -from models.dataset import Dataset, DocumentSegment +from models.dataset import Dataset, DatasetMetadataBinding, DocumentSegment from models.model import UploadFile @@ -67,6 +67,12 @@ def clean_document_task(document_id: str, dataset_id: str, doc_form: str, file_i db.session.delete(file) db.session.commit() + # delete dataset metadata binding + db.session.query(DatasetMetadataBinding).filter( + DatasetMetadataBinding.dataset_id == dataset_id, + DatasetMetadataBinding.document_id == document_id, + ).delete() + end_at = time.perf_counter() logging.info( click.style( @@ -76,3 +82,5 @@ def clean_document_task(document_id: str, dataset_id: str, doc_form: str, file_i ) except Exception: logging.exception("Cleaned document when document deleted failed") + finally: + db.session.close() diff --git a/api/tasks/clean_notion_document_task.py b/api/tasks/clean_notion_document_task.py index 5a6eb00a6..1087a3776 100644 --- a/api/tasks/clean_notion_document_task.py +++ b/api/tasks/clean_notion_document_task.py @@ -53,3 +53,5 @@ def clean_notion_document_task(document_ids: list[str], dataset_id: str): ) except Exception: logging.exception("Cleaned document when import form notion document deleted failed") + finally: + db.session.close() diff --git a/api/tasks/create_segment_to_index_task.py b/api/tasks/create_segment_to_index_task.py index dfa053a43..4500b2a44 100644 --- a/api/tasks/create_segment_to_index_task.py +++ b/api/tasks/create_segment_to_index_task.py @@ -5,7 +5,6 @@ from typing import Optional import click from celery import shared_task # type: ignore -from werkzeug.exceptions import NotFound from core.rag.index_processor.index_processor_factory import IndexProcessorFactory from core.rag.models.document import Document @@ -27,7 +26,9 @@ def create_segment_to_index_task(segment_id: str, 
keywords: Optional[list[str]] segment = db.session.query(DocumentSegment).filter(DocumentSegment.id == segment_id).first() if not segment: - raise NotFound("Segment not found") + logging.info(click.style("Segment not found: {}".format(segment_id), fg="red")) + db.session.close() + return if segment.status != "waiting": return @@ -93,3 +94,4 @@ def create_segment_to_index_task(segment_id: str, keywords: Optional[list[str]] db.session.commit() finally: redis_client.delete(indexing_cache_key) + db.session.close() diff --git a/api/tasks/deal_dataset_vector_index_task.py b/api/tasks/deal_dataset_vector_index_task.py index a9b5ab91a..075453e28 100644 --- a/api/tasks/deal_dataset_vector_index_task.py +++ b/api/tasks/deal_dataset_vector_index_task.py @@ -130,7 +130,7 @@ def deal_dataset_vector_index_task(dataset_id: str, action: str): }, ) if dataset_document.doc_form == IndexType.PARENT_CHILD_INDEX: - child_chunks = segment.child_chunks + child_chunks = segment.get_child_chunks() if child_chunks: child_documents = [] for child_chunk in child_chunks: @@ -167,3 +167,5 @@ def deal_dataset_vector_index_task(dataset_id: str, action: str): ) except Exception: logging.exception("Deal dataset vector index failed") + finally: + db.session.close() diff --git a/api/tasks/delete_segment_from_index_task.py b/api/tasks/delete_segment_from_index_task.py index e4fbd5465..a93babc31 100644 --- a/api/tasks/delete_segment_from_index_task.py +++ b/api/tasks/delete_segment_from_index_task.py @@ -41,3 +41,5 @@ def delete_segment_from_index_task(index_node_ids: list, dataset_id: str, docume logging.info(click.style("Segment deleted from index latency: {}".format(end_at - start_at), fg="green")) except Exception: logging.exception("delete segment from index failed") + finally: + db.session.close() diff --git a/api/tasks/disable_segment_from_index_task.py b/api/tasks/disable_segment_from_index_task.py index f30a1cc7a..327eed472 100644 --- a/api/tasks/disable_segment_from_index_task.py +++ 
b/api/tasks/disable_segment_from_index_task.py @@ -3,7 +3,6 @@ import time import click from celery import shared_task # type: ignore -from werkzeug.exceptions import NotFound from core.rag.index_processor.index_processor_factory import IndexProcessorFactory from extensions.ext_database import db @@ -24,10 +23,14 @@ def disable_segment_from_index_task(segment_id: str): segment = db.session.query(DocumentSegment).filter(DocumentSegment.id == segment_id).first() if not segment: - raise NotFound("Segment not found") + logging.info(click.style("Segment not found: {}".format(segment_id), fg="red")) + db.session.close() + return if segment.status != "completed": - raise NotFound("Segment is not completed , disable action is not allowed.") + logging.info(click.style("Segment is not completed, disable is not allowed: {}".format(segment_id), fg="red")) + db.session.close() + return indexing_cache_key = "segment_{}_indexing".format(segment.id) @@ -62,3 +65,4 @@ def disable_segment_from_index_task(segment_id: str): db.session.commit() finally: redis_client.delete(indexing_cache_key) + db.session.close() diff --git a/api/tasks/disable_segments_from_index_task.py b/api/tasks/disable_segments_from_index_task.py index d43fb90ed..8b77b290c 100644 --- a/api/tasks/disable_segments_from_index_task.py +++ b/api/tasks/disable_segments_from_index_task.py @@ -26,15 +26,18 @@ def disable_segments_from_index_task(segment_ids: list, dataset_id: str, documen dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first() if not dataset: logging.info(click.style("Dataset {} not found, pass.".format(dataset_id), fg="cyan")) + db.session.close() return dataset_document = db.session.query(DatasetDocument).filter(DatasetDocument.id == document_id).first() if not dataset_document: logging.info(click.style("Document {} not found, pass.".format(document_id), fg="cyan")) + db.session.close() return if not dataset_document.enabled or dataset_document.archived or 
dataset_document.indexing_status != "completed": logging.info(click.style("Document {} status is invalid, pass.".format(document_id), fg="cyan")) + db.session.close() return # sync index processor index_processor = IndexProcessorFactory(dataset_document.doc_form).init_index_processor() @@ -50,6 +53,7 @@ def disable_segments_from_index_task(segment_ids: list, dataset_id: str, documen ) if not segments: + db.session.close() return try: @@ -76,3 +80,4 @@ def disable_segments_from_index_task(segment_ids: list, dataset_id: str, documen for segment in segments: indexing_cache_key = "segment_{}_indexing".format(segment.id) redis_client.delete(indexing_cache_key) + db.session.close() diff --git a/api/tasks/document_indexing_sync_task.py b/api/tasks/document_indexing_sync_task.py index d686698b9..2e68dcb0f 100644 --- a/api/tasks/document_indexing_sync_task.py +++ b/api/tasks/document_indexing_sync_task.py @@ -4,7 +4,6 @@ import time import click from celery import shared_task # type: ignore -from werkzeug.exceptions import NotFound from core.indexing_runner import DocumentIsPausedError, IndexingRunner from core.rag.extractor.notion_extractor import NotionExtractor @@ -29,7 +28,9 @@ def document_indexing_sync_task(dataset_id: str, document_id: str): document = db.session.query(Document).filter(Document.id == document_id, Document.dataset_id == dataset_id).first() if not document: - raise NotFound("Document not found") + logging.info(click.style("Document not found: {}".format(document_id), fg="red")) + db.session.close() + return data_source_info = document.data_source_info_dict if document.data_source_type == "notion_import": diff --git a/api/tasks/document_indexing_task.py b/api/tasks/document_indexing_task.py index a8e3a69f1..ee470d44e 100644 --- a/api/tasks/document_indexing_task.py +++ b/api/tasks/document_indexing_task.py @@ -27,6 +27,7 @@ def document_indexing_task(dataset_id: str, document_ids: list): dataset = db.session.query(Dataset).filter(Dataset.id == 
dataset_id).first() if not dataset: logging.info(click.style("Dataset is not found: {}".format(dataset_id), fg="yellow")) + db.session.close() return # check document limit features = FeatureService.get_features(dataset.tenant_id) @@ -35,6 +36,8 @@ def document_indexing_task(dataset_id: str, document_ids: list): vector_space = features.vector_space count = len(document_ids) batch_upload_limit = int(dify_config.BATCH_UPLOAD_LIMIT) + if features.billing.subscription.plan == "sandbox" and count > 1: + raise ValueError("Your current plan does not support batch upload, please upgrade your plan.") if count > batch_upload_limit: raise ValueError(f"You have reached the batch upload limit of {batch_upload_limit}.") if 0 < vector_space.limit <= vector_space.size: @@ -53,6 +56,7 @@ def document_indexing_task(dataset_id: str, document_ids: list): document.stopped_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None) db.session.add(document) db.session.commit() + db.session.close() return for document_id in document_ids: @@ -78,3 +82,5 @@ def document_indexing_task(dataset_id: str, document_ids: list): logging.info(click.style(str(ex), fg="yellow")) except Exception: pass + finally: + db.session.close() diff --git a/api/tasks/document_indexing_update_task.py b/api/tasks/document_indexing_update_task.py index d8f14830c..b9ed11a8d 100644 --- a/api/tasks/document_indexing_update_task.py +++ b/api/tasks/document_indexing_update_task.py @@ -4,7 +4,6 @@ import time import click from celery import shared_task # type: ignore -from werkzeug.exceptions import NotFound from core.indexing_runner import DocumentIsPausedError, IndexingRunner from core.rag.index_processor.index_processor_factory import IndexProcessorFactory @@ -27,7 +26,9 @@ def document_indexing_update_task(dataset_id: str, document_id: str): document = db.session.query(Document).filter(Document.id == document_id, Document.dataset_id == dataset_id).first() if not document: - raise NotFound("Document not found") + 
logging.info(click.style("Document not found: {}".format(document_id), fg="red")) + db.session.close() + return document.indexing_status = "parsing" document.processing_started_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None) @@ -73,3 +74,5 @@ def document_indexing_update_task(dataset_id: str, document_id: str): logging.info(click.style(str(ex), fg="yellow")) except Exception: pass + finally: + db.session.close() diff --git a/api/tasks/duplicate_document_indexing_task.py b/api/tasks/duplicate_document_indexing_task.py index b0cd48647..100fc257c 100644 --- a/api/tasks/duplicate_document_indexing_task.py +++ b/api/tasks/duplicate_document_indexing_task.py @@ -27,7 +27,9 @@ def duplicate_document_indexing_task(dataset_id: str, document_ids: list): dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first() if dataset is None: - raise ValueError("Dataset not found") + logging.info(click.style("Dataset not found: {}".format(dataset_id), fg="red")) + db.session.close() + return # check document limit features = FeatureService.get_features(dataset.tenant_id) @@ -35,6 +37,8 @@ def duplicate_document_indexing_task(dataset_id: str, document_ids: list): if features.billing.enabled: vector_space = features.vector_space count = len(document_ids) + if features.billing.subscription.plan == "sandbox" and count > 1: + raise ValueError("Your current plan does not support batch upload, please upgrade your plan.") batch_upload_limit = int(dify_config.BATCH_UPLOAD_LIMIT) if count > batch_upload_limit: raise ValueError(f"You have reached the batch upload limit of {batch_upload_limit}.") @@ -55,6 +59,8 @@ def duplicate_document_indexing_task(dataset_id: str, document_ids: list): db.session.add(document) db.session.commit() return + finally: + db.session.close() for document_id in document_ids: logging.info(click.style("Start process document: {}".format(document_id), fg="green")) @@ -94,3 +100,5 @@ def duplicate_document_indexing_task(dataset_id: str, 
document_ids: list): logging.info(click.style(str(ex), fg="yellow")) except Exception: pass + finally: + db.session.close() diff --git a/api/tasks/enable_segment_to_index_task.py b/api/tasks/enable_segment_to_index_task.py index 76522f472..21f08f40a 100644 --- a/api/tasks/enable_segment_to_index_task.py +++ b/api/tasks/enable_segment_to_index_task.py @@ -4,7 +4,6 @@ import time import click from celery import shared_task # type: ignore -from werkzeug.exceptions import NotFound from core.rag.index_processor.constant.index_type import IndexType from core.rag.index_processor.index_processor_factory import IndexProcessorFactory @@ -27,10 +26,14 @@ def enable_segment_to_index_task(segment_id: str): segment = db.session.query(DocumentSegment).filter(DocumentSegment.id == segment_id).first() if not segment: - raise NotFound("Segment not found") + logging.info(click.style("Segment not found: {}".format(segment_id), fg="red")) + db.session.close() + return if segment.status != "completed": - raise NotFound("Segment is not completed, enable action is not allowed.") + logging.info(click.style("Segment is not completed, enable is not allowed: {}".format(segment_id), fg="red")) + db.session.close() + return indexing_cache_key = "segment_{}_indexing".format(segment.id) @@ -63,7 +66,7 @@ def enable_segment_to_index_task(segment_id: str): index_processor = IndexProcessorFactory(dataset_document.doc_form).init_index_processor() if dataset_document.doc_form == IndexType.PARENT_CHILD_INDEX: - child_chunks = segment.child_chunks + child_chunks = segment.get_child_chunks() if child_chunks: child_documents = [] for child_chunk in child_chunks: @@ -94,3 +97,4 @@ def enable_segment_to_index_task(segment_id: str): db.session.commit() finally: redis_client.delete(indexing_cache_key) + db.session.close() diff --git a/api/tasks/enable_segments_to_index_task.py b/api/tasks/enable_segments_to_index_task.py index 3942268af..625a3b582 100644 --- a/api/tasks/enable_segments_to_index_task.py +++ 
b/api/tasks/enable_segments_to_index_task.py @@ -34,9 +34,11 @@ def enable_segments_to_index_task(segment_ids: list, dataset_id: str, document_i if not dataset_document: logging.info(click.style("Document {} not found, pass.".format(document_id), fg="cyan")) + db.session.close() return if not dataset_document.enabled or dataset_document.archived or dataset_document.indexing_status != "completed": logging.info(click.style("Document {} status is invalid, pass.".format(document_id), fg="cyan")) + db.session.close() return # sync index processor index_processor = IndexProcessorFactory(dataset_document.doc_form).init_index_processor() @@ -51,6 +53,8 @@ def enable_segments_to_index_task(segment_ids: list, dataset_id: str, document_i .all() ) if not segments: + logging.info(click.style("Segments not found: {}".format(segment_ids), fg="cyan")) + db.session.close() return try: @@ -67,7 +71,7 @@ def enable_segments_to_index_task(segment_ids: list, dataset_id: str, document_i ) if dataset_document.doc_form == IndexType.PARENT_CHILD_INDEX: - child_chunks = segment.child_chunks + child_chunks = segment.get_child_chunks() if child_chunks: child_documents = [] for child_chunk in child_chunks: @@ -108,3 +112,4 @@ def enable_segments_to_index_task(segment_ids: list, dataset_id: str, document_i for segment in segments: indexing_cache_key = "segment_{}_indexing".format(segment.id) redis_client.delete(indexing_cache_key) + db.session.close() diff --git a/api/tasks/mail_account_deletion_task.py b/api/tasks/mail_account_deletion_task.py index 49a3a6d28..0c60ae53d 100644 --- a/api/tasks/mail_account_deletion_task.py +++ b/api/tasks/mail_account_deletion_task.py @@ -10,11 +10,7 @@ from extensions.ext_mail import mail @shared_task(queue="mail") def send_deletion_success_task(to): - """Send email to user regarding account deletion. 
- - Args: - log (AccountDeletionLog): Account deletion log object - """ + """Send email to user regarding account deletion.""" if not mail.is_inited(): return diff --git a/api/tasks/ops_trace_task.py b/api/tasks/ops_trace_task.py index bb3b9e17e..2b49e4bb2 100644 --- a/api/tasks/ops_trace_task.py +++ b/api/tasks/ops_trace_task.py @@ -17,8 +17,6 @@ from models.workflow import WorkflowRun def process_trace_tasks(file_info): """ Async process trace tasks - :param tasks_data: List of dictionaries containing task data - Usage: process_trace_tasks.delay(tasks_data) """ from core.ops.ops_trace_manager import OpsTraceManager diff --git a/api/tasks/recover_document_indexing_task.py b/api/tasks/recover_document_indexing_task.py index b603d689b..eada2ff9d 100644 --- a/api/tasks/recover_document_indexing_task.py +++ b/api/tasks/recover_document_indexing_task.py @@ -3,7 +3,6 @@ import time import click from celery import shared_task # type: ignore -from werkzeug.exceptions import NotFound from core.indexing_runner import DocumentIsPausedError, IndexingRunner from extensions.ext_database import db @@ -25,7 +24,9 @@ def recover_document_indexing_task(dataset_id: str, document_id: str): document = db.session.query(Document).filter(Document.id == document_id, Document.dataset_id == dataset_id).first() if not document: - raise NotFound("Document not found") + logging.info(click.style("Document not found: {}".format(document_id), fg="red")) + db.session.close() + return try: indexing_runner = IndexingRunner() @@ -43,3 +44,5 @@ def recover_document_indexing_task(dataset_id: str, document_id: str): logging.info(click.style(str(ex), fg="yellow")) except Exception: pass + finally: + db.session.close() diff --git a/api/tasks/remove_document_from_index_task.py b/api/tasks/remove_document_from_index_task.py index d0c4382f5..0e2960788 100644 --- a/api/tasks/remove_document_from_index_task.py +++ b/api/tasks/remove_document_from_index_task.py @@ -4,7 +4,6 @@ import time import click from 
celery import shared_task # type: ignore -from werkzeug.exceptions import NotFound from core.rag.index_processor.index_processor_factory import IndexProcessorFactory from extensions.ext_database import db @@ -25,9 +24,13 @@ def remove_document_from_index_task(document_id: str): document = db.session.query(Document).filter(Document.id == document_id).first() if not document: - raise NotFound("Document not found") + logging.info(click.style("Document not found: {}".format(document_id), fg="red")) + db.session.close() + return if document.indexing_status != "completed": + logging.info(click.style("Document is not completed, remove is not allowed: {}".format(document_id), fg="red")) + db.session.close() return indexing_cache_key = "document_{}_indexing".format(document.id) @@ -71,3 +74,4 @@ def remove_document_from_index_task(document_id: str): db.session.commit() finally: redis_client.delete(indexing_cache_key) + db.session.close() diff --git a/api/tasks/retry_document_indexing_task.py b/api/tasks/retry_document_indexing_task.py index 83ddbcfcc..7e50eb9f8 100644 --- a/api/tasks/retry_document_indexing_task.py +++ b/api/tasks/retry_document_indexing_task.py @@ -27,7 +27,9 @@ def retry_document_indexing_task(dataset_id: str, document_ids: list[str]): dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first() if not dataset: - raise ValueError("Dataset not found") + logging.info(click.style("Dataset not found: {}".format(dataset_id), fg="red")) + db.session.close() + return for document_id in document_ids: retry_indexing_cache_key = "document_{}_is_retried".format(document_id) @@ -52,6 +54,7 @@ def retry_document_indexing_task(dataset_id: str, document_ids: list[str]): db.session.add(document) db.session.commit() redis_client.delete(retry_indexing_cache_key) + db.session.close() return logging.info(click.style("Start retry document: {}".format(document_id), fg="green")) @@ -60,6 +63,7 @@ def retry_document_indexing_task(dataset_id: str, document_ids: 
list[str]): ) if not document: logging.info(click.style("Document not found: {}".format(document_id), fg="yellow")) + db.session.close() return try: # clean old data @@ -92,5 +96,7 @@ def retry_document_indexing_task(dataset_id: str, document_ids: list[str]): logging.info(click.style(str(ex), fg="yellow")) redis_client.delete(retry_indexing_cache_key) pass + finally: + db.session.close() end_at = time.perf_counter() logging.info(click.style("Retry dataset: {} latency: {}".format(dataset_id, end_at - start_at), fg="green")) diff --git a/api/tests/integration_tests/vdb/__mock/tcvectordb.py b/api/tests/integration_tests/vdb/__mock/tcvectordb.py index 68a1e290a..ae5f9761b 100644 --- a/api/tests/integration_tests/vdb/__mock/tcvectordb.py +++ b/api/tests/integration_tests/vdb/__mock/tcvectordb.py @@ -1,58 +1,90 @@ import os -from typing import Optional +from typing import Optional, Union import pytest from _pytest.monkeypatch import MonkeyPatch from requests.adapters import HTTPAdapter -from tcvectordb import VectorDBClient # type: ignore -from tcvectordb.model.database import Collection, Database # type: ignore -from tcvectordb.model.document import Document, Filter # type: ignore +from tcvectordb import RPCVectorDBClient # type: ignore +from tcvectordb.model import enum +from tcvectordb.model.collection import FilterIndexConfig +from tcvectordb.model.document import AnnSearch, Document, Filter, KeywordSearch, Rerank # type: ignore from tcvectordb.model.enum import ReadConsistency # type: ignore -from tcvectordb.model.index import Index # type: ignore +from tcvectordb.model.index import FilterIndex, HNSWParams, Index, IndexField, VectorIndex # type: ignore +from tcvectordb.rpc.model.collection import RPCCollection +from tcvectordb.rpc.model.database import RPCDatabase from xinference_client.types import Embedding # type: ignore class MockTcvectordbClass: def mock_vector_db_client( self, - url=None, + url: str, username="", key="", read_consistency: ReadConsistency = 
ReadConsistency.EVENTUAL_CONSISTENCY, - timeout=5, + timeout=10, adapter: HTTPAdapter = None, + pool_size: int = 2, + proxies: Optional[dict] = None, + password: Optional[str] = None, + **kwargs, ): self._conn = None self._read_consistency = read_consistency - def list_databases(self) -> list[Database]: - return [ - Database( - conn=self._conn, + def create_database_if_not_exists(self, database_name: str, timeout: Optional[float] = None) -> RPCDatabase: + return RPCDatabase( + name="dify", + read_consistency=self._read_consistency, + ) + + def exists_collection(self, database_name: str, collection_name: str) -> bool: + return True + + def describe_collection( + self, database_name: str, collection_name: str, timeout: Optional[float] = None + ) -> RPCCollection: + index = Index( + FilterIndex("id", enum.FieldType.String, enum.IndexType.PRIMARY_KEY), + VectorIndex( + "vector", + 128, + enum.IndexType.HNSW, + enum.MetricType.IP, + HNSWParams(m=16, efconstruction=200), + ), + FilterIndex("text", enum.FieldType.String, enum.IndexType.FILTER), + FilterIndex("metadata", enum.FieldType.String, enum.IndexType.FILTER), + ) + return RPCCollection( + RPCDatabase( + name=database_name, read_consistency=self._read_consistency, - name="dify", - ) - ] - - def list_collections(self, timeout: Optional[float] = None) -> list[Collection]: - return [] - - def drop_collection(self, name: str, timeout: Optional[float] = None): - return {"code": 0, "msg": "operation success"} + ), + collection_name, + index=index, + ) def create_collection( self, - name: str, + database_name: str, + collection_name: str, shard: int, replicas: int, - description: str, - index: Index, + description: Optional[str] = None, + index: Index = None, embedding: Embedding = None, timeout: Optional[float] = None, - ) -> Collection: - return Collection( - self, - name, + ttl_config: Optional[dict] = None, + filter_index_config: FilterIndexConfig = None, + indexes: Optional[list[IndexField]] = None, + ) -> 
RPCCollection: + return RPCCollection( + RPCDatabase( + name="dify", + read_consistency=self._read_consistency, + ), + collection_name, shard, replicas, description, @@ -60,19 +92,26 @@ class MockTcvectordbClass: embedding=embedding, read_consistency=self._read_consistency, timeout=timeout, + ttl_config=ttl_config, + filter_index_config=filter_index_config, + indexes=indexes, ) - def describe_collection(self, name: str, timeout: Optional[float] = None) -> Collection: - collection = Collection(self, name, shard=1, replicas=2, description=name, timeout=timeout) - return collection - def collection_upsert( - self, documents: list[Document], timeout: Optional[float] = None, build_index: bool = True, **kwargs + self, + database_name: str, + collection_name: str, + documents: list[Union[Document, dict]], + timeout: Optional[float] = None, + build_index: bool = True, + **kwargs, ): return {"code": 0, "msg": "operation success"} def collection_search( self, + database_name: str, + collection_name: str, vectors: list[list[float]], filter: Filter = None, params=None, @@ -81,10 +120,29 @@ class MockTcvectordbClass: output_fields: Optional[list[str]] = None, timeout: Optional[float] = None, ) -> list[list[dict]]: - return [[{"metadata": '{"doc_id":"foo1"}', "text": "text", "doc_id": "foo1", "score": 0.1}]] + return [[{"metadata": {"doc_id": "foo1"}, "text": "text", "doc_id": "foo1", "score": 0.1}]] + + def collection_hybrid_search( + self, + database_name: str, + collection_name: str, + ann: Optional[Union[list[AnnSearch], AnnSearch]] = None, + match: Optional[Union[list[KeywordSearch], KeywordSearch]] = None, + filter: Union[Filter, str] = None, + rerank: Optional[Rerank] = None, + retrieve_vector: Optional[bool] = None, + output_fields: Optional[list[str]] = None, + limit: Optional[int] = None, + timeout: Optional[float] = None, + return_pd_object=False, + **kwargs, + ) -> list[list[dict]]: + return [[{"metadata": {"doc_id": "foo1"}, "text": "text", "doc_id": "foo1", 
"score": 0.1}]] def collection_query( self, + database_name: str, + collection_name: str, document_ids: Optional[list] = None, retrieve_vector: bool = False, limit: Optional[int] = None, @@ -97,12 +155,17 @@ class MockTcvectordbClass: def collection_delete( self, + database_name: str, + collection_name: str, document_ids: Optional[list[str]] = None, filter: Filter = None, timeout: Optional[float] = None, ): return {"code": 0, "msg": "operation success"} + def drop_collection(self, database_name: str, collection_name: str, timeout: Optional[float] = None) -> dict: + return {"code": 0, "msg": "operation success"} + MOCK = os.getenv("MOCK_SWITCH", "false").lower() == "true" @@ -110,16 +173,19 @@ MOCK = os.getenv("MOCK_SWITCH", "false").lower() == "true" @pytest.fixture def setup_tcvectordb_mock(request, monkeypatch: MonkeyPatch): if MOCK: - monkeypatch.setattr(VectorDBClient, "__init__", MockTcvectordbClass.mock_vector_db_client) - monkeypatch.setattr(VectorDBClient, "list_databases", MockTcvectordbClass.list_databases) - monkeypatch.setattr(Database, "collection", MockTcvectordbClass.describe_collection) - monkeypatch.setattr(Database, "list_collections", MockTcvectordbClass.list_collections) - monkeypatch.setattr(Database, "drop_collection", MockTcvectordbClass.drop_collection) - monkeypatch.setattr(Database, "create_collection", MockTcvectordbClass.create_collection) - monkeypatch.setattr(Collection, "upsert", MockTcvectordbClass.collection_upsert) - monkeypatch.setattr(Collection, "search", MockTcvectordbClass.collection_search) - monkeypatch.setattr(Collection, "query", MockTcvectordbClass.collection_query) - monkeypatch.setattr(Collection, "delete", MockTcvectordbClass.collection_delete) + monkeypatch.setattr(RPCVectorDBClient, "__init__", MockTcvectordbClass.mock_vector_db_client) + monkeypatch.setattr( + RPCVectorDBClient, "create_database_if_not_exists", MockTcvectordbClass.create_database_if_not_exists + ) + monkeypatch.setattr(RPCVectorDBClient, 
"exists_collection", MockTcvectordbClass.exists_collection) + monkeypatch.setattr(RPCVectorDBClient, "create_collection", MockTcvectordbClass.create_collection) + monkeypatch.setattr(RPCVectorDBClient, "describe_collection", MockTcvectordbClass.describe_collection) + monkeypatch.setattr(RPCVectorDBClient, "upsert", MockTcvectordbClass.collection_upsert) + monkeypatch.setattr(RPCVectorDBClient, "search", MockTcvectordbClass.collection_search) + monkeypatch.setattr(RPCVectorDBClient, "hybrid_search", MockTcvectordbClass.collection_hybrid_search) + monkeypatch.setattr(RPCVectorDBClient, "query", MockTcvectordbClass.collection_query) + monkeypatch.setattr(RPCVectorDBClient, "delete", MockTcvectordbClass.collection_delete) + monkeypatch.setattr(RPCVectorDBClient, "drop_collection", MockTcvectordbClass.drop_collection) yield diff --git a/api/tests/integration_tests/vdb/tablestore/__init__.py b/api/tests/integration_tests/vdb/tablestore/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/api/tests/integration_tests/vdb/tablestore/test_tablestore.py b/api/tests/integration_tests/vdb/tablestore/test_tablestore.py new file mode 100644 index 000000000..da890d0b7 --- /dev/null +++ b/api/tests/integration_tests/vdb/tablestore/test_tablestore.py @@ -0,0 +1,34 @@ +import os + +from core.rag.datasource.vdb.tablestore.tablestore_vector import ( + TableStoreConfig, + TableStoreVector, +) +from tests.integration_tests.vdb.test_vector_store import ( + AbstractVectorTest, + setup_mock_redis, +) + + +class TableStoreVectorTest(AbstractVectorTest): + def __init__(self): + super().__init__() + self.vector = TableStoreVector( + collection_name=self.collection_name, + config=TableStoreConfig( + endpoint=os.getenv("TABLESTORE_ENDPOINT"), + instance_name=os.getenv("TABLESTORE_INSTANCE_NAME"), + access_key_id=os.getenv("TABLESTORE_ACCESS_KEY_ID"), + access_key_secret=os.getenv("TABLESTORE_ACCESS_KEY_SECRET"), + ), + ) + + def get_ids_by_metadata_field(self): + ids = 
self.vector.get_ids_by_metadata_field(key="doc_id", value=self.example_doc_id) + assert ids is not None + assert len(ids) == 1 + assert ids[0] == self.example_doc_id + + +def test_tablestore_vector(setup_mock_redis): + TableStoreVectorTest().run_all_tests() diff --git a/api/tests/integration_tests/vdb/tcvectordb/test_tencent.py b/api/tests/integration_tests/vdb/tcvectordb/test_tencent.py index 1b9466e27..9227bbdcd 100644 --- a/api/tests/integration_tests/vdb/tcvectordb/test_tencent.py +++ b/api/tests/integration_tests/vdb/tcvectordb/test_tencent.py @@ -21,6 +21,7 @@ class TencentVectorTest(AbstractVectorTest): database="dify", shard=1, replicas=2, + enable_hybrid_search=True, ), ) @@ -30,7 +31,7 @@ class TencentVectorTest(AbstractVectorTest): def search_by_full_text(self): hits_by_full_text = self.vector.search_by_full_text(query=get_example_text()) - assert len(hits_by_full_text) == 0 + assert len(hits_by_full_text) >= 0 def test_tencent_vector(setup_mock_redis, setup_tcvectordb_mock): diff --git a/api/tests/unit_tests/core/agent/output_parser/test_cot_output_parser.py b/api/tests/unit_tests/core/agent/output_parser/test_cot_output_parser.py new file mode 100644 index 000000000..4a613e35b --- /dev/null +++ b/api/tests/unit_tests/core/agent/output_parser/test_cot_output_parser.py @@ -0,0 +1,70 @@ +import json +from collections.abc import Generator + +from core.agent.entities import AgentScratchpadUnit +from core.agent.output_parser.cot_output_parser import CotAgentOutputParser +from core.model_runtime.entities.llm_entities import AssistantPromptMessage, LLMResultChunk, LLMResultChunkDelta + + +def mock_llm_response(text) -> Generator[LLMResultChunk, None, None]: + for i in range(len(text)): + yield LLMResultChunk( + model="model", + prompt_messages=[], + delta=LLMResultChunkDelta(index=0, message=AssistantPromptMessage(content=text[i], tool_calls=[])), + ) + + +def test_cot_output_parser(): + test_cases = [ + { + "input": 'Through: abc\nAction: ```{"action": "Final 
Answer", "action_input": "```echarts\n {}\n```"}```', + "action": {"action": "Final Answer", "action_input": "```echarts\n {}\n```"}, + "output": 'Through: abc\n {"action": "Final Answer", "action_input": "```echarts\\n {}\\n```"}', + }, + # code block with json + { + "input": 'Through: abc\nAction: ```json\n{"action": "Final Answer", "action_input": "```echarts\n {' + '}\n```"}```', + "action": {"action": "Final Answer", "action_input": "```echarts\n {}\n```"}, + "output": 'Through: abc\n {"action": "Final Answer", "action_input": "```echarts\\n {}\\n```"}', + }, + # code block with JSON + { + "input": 'Through: abc\nAction: ```JSON\n{"action": "Final Answer", "action_input": "```echarts\n {' + '}\n```"}```', + "action": {"action": "Final Answer", "action_input": "```echarts\n {}\n```"}, + "output": 'Through: abc\n {"action": "Final Answer", "action_input": "```echarts\\n {}\\n```"}', + }, + # list + { + "input": 'Through: abc\nAction: ```[{"action": "Final Answer", "action_input": "```echarts\n {}\n```"}]```', + "action": {"action": "Final Answer", "action_input": "```echarts\n {}\n```"}, + "output": 'Through: abc\n {"action": "Final Answer", "action_input": "```echarts\\n {}\\n```"}', + }, + # no code block + { + "input": 'Through: abc\nAction: {"action": "Final Answer", "action_input": "```echarts\n {}\n```"}', + "action": {"action": "Final Answer", "action_input": "```echarts\n {}\n```"}, + "output": 'Through: abc\n {"action": "Final Answer", "action_input": "```echarts\\n {}\\n```"}', + }, + # no code block and json + {"input": "Through: abc\nAction: efg", "action": {}, "output": "Through: abc\n efg"}, + ] + + parser = CotAgentOutputParser() + usage_dict = {} + for test_case in test_cases: + # mock llm_response as a generator by text + llm_response: Generator[LLMResultChunk, None, None] = mock_llm_response(test_case["input"]) + results = parser.handle_react_stream_output(llm_response, usage_dict) + output = "" + for result in results: + if isinstance(result, 
str): + output += result + elif isinstance(result, AgentScratchpadUnit.Action): + if test_case["action"]: + assert result.to_dict() == test_case["action"] + output += json.dumps(result.to_dict()) + if test_case["output"]: + assert output == test_case["output"] diff --git a/dev/reformat b/dev/reformat index 82f96b8e8..daab53895 100755 --- a/dev/reformat +++ b/dev/reformat @@ -16,3 +16,6 @@ poetry run -C api ruff format ./ # run dotenv-linter linter poetry run -P api dotenv-linter ./api/.env.example ./web/.env.example + +# run mypy check +dev/run-mypy diff --git a/dev/run-mypy b/dev/run-mypy new file mode 100755 index 000000000..cdbbef515 --- /dev/null +++ b/dev/run-mypy @@ -0,0 +1,11 @@ +#!/bin/bash + +set -x + +if ! command -v mypy &> /dev/null; then + poetry install -C api --with dev +fi + +# run mypy checks +poetry run -C api \ + python -m mypy --install-types --non-interactive . diff --git a/docker/.env.example b/docker/.env.example index 2e069db9e..29d33360e 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -383,7 +383,7 @@ SUPABASE_URL=your-server-url # ------------------------------ # The type of vector store to use. -# Supported values are `weaviate`, `qdrant`, `milvus`, `myscale`, `relyt`, `pgvector`, `pgvecto-rs`, `chroma`, `opensearch`, `tidb_vector`, `oracle`, `tencent`, `elasticsearch`, `elasticsearch-ja`, `analyticdb`, `couchbase`, `vikingdb`, `oceanbase`, `opengauss`. +# Supported values are `weaviate`, `qdrant`, `milvus`, `myscale`, `relyt`, `pgvector`, `pgvecto-rs`, `chroma`, `opensearch`, `tidb_vector`, `oracle`, `tencent`, `elasticsearch`, `elasticsearch-ja`, `analyticdb`, `couchbase`, `vikingdb`, `oceanbase`, `opengauss`, `tablestore`. VECTOR_STORE=weaviate # The Weaviate endpoint URL. Only available when VECTOR_STORE is `weaviate`. 
@@ -515,6 +515,7 @@ TENCENT_VECTOR_DB_USERNAME=dify TENCENT_VECTOR_DB_DATABASE=dify TENCENT_VECTOR_DB_SHARD=1 TENCENT_VECTOR_DB_REPLICAS=2 +TENCENT_VECTOR_DB_ENABLE_HYBRID_SEARCH=false # ElasticSearch configuration, only available when VECTOR_STORE is `elasticsearch` ELASTICSEARCH_HOST=0.0.0.0 @@ -554,6 +555,7 @@ OCEANBASE_VECTOR_PASSWORD=difyai123456 OCEANBASE_VECTOR_DATABASE=test OCEANBASE_CLUSTER_NAME=difyai OCEANBASE_MEMORY_LIMIT=6G +OCEANBASE_ENABLE_HYBRID_SEARCH=false # opengauss configurations, only available when VECTOR_STORE is `opengauss` OPENGAUSS_HOST=opengauss @@ -569,6 +571,13 @@ OPENGAUSS_ENABLE_PQ=false UPSTASH_VECTOR_URL=https://xxx-vector.upstash.io UPSTASH_VECTOR_TOKEN=dify +# TableStore Vector configuration +# (only used when VECTOR_STORE is tablestore) +TABLESTORE_ENDPOINT=https://instance-name.cn-hangzhou.ots.aliyuncs.com +TABLESTORE_INSTANCE_NAME=instance-name +TABLESTORE_ACCESS_KEY_ID=xxx +TABLESTORE_ACCESS_KEY_SECRET=xxx + # ------------------------------ # Knowledge Configuration # ------------------------------ @@ -742,6 +751,9 @@ MAX_TOOLS_NUM=10 # Maximum number of Parallelism branches in the workflow MAX_PARALLEL_LIMIT=10 +# The maximum number of iterations for agent setting +MAX_ITERATIONS_NUM=5 + # ------------------------------ # Environment Variables for web Service # ------------------------------ @@ -993,3 +1005,28 @@ PLUGIN_PYTHON_ENV_INIT_TIMEOUT=120 PLUGIN_MAX_EXECUTION_TIMEOUT=600 # PIP_MIRROR_URL=https://pypi.tuna.tsinghua.edu.cn/simple PIP_MIRROR_URL= + +# https://github.com/langgenius/dify-plugin-daemon/blob/main/.env.example +# Plugin storage type, local aws_s3 tencent_cos azure_blob +PLUGIN_STORAGE_TYPE=local +PLUGIN_STORAGE_LOCAL_ROOT=/app/storage +PLUGIN_WORKING_PATH=/app/storage/cwd +PLUGIN_INSTALLED_PATH=plugin +PLUGIN_PACKAGE_CACHE_PATH=plugin_packages +PLUGIN_MEDIA_CACHE_PATH=assets +# Plugin oss bucket +PLUGIN_STORAGE_OSS_BUCKET= +# Plugin oss s3 credentials +PLUGIN_S3_USE_AWS_MANAGED_IAM= +PLUGIN_S3_ENDPOINT= 
+PLUGIN_S3_USE_PATH_STYLE= +PLUGIN_AWS_ACCESS_KEY= +PLUGIN_AWS_SECRET_KEY= +PLUGIN_AWS_REGION= +# Plugin oss azure blob +PLUGIN_AZURE_BLOB_STORAGE_CONTAINER_NAME= +PLUGIN_AZURE_BLOB_STORAGE_CONNECTION_STRING= +# Plugin oss tencent cos +PLUGIN_TENCENT_COS_SECRET_KEY= +PLUGIN_TENCENT_COS_SECRET_ID= +PLUGIN_TENCENT_COS_REGION= diff --git a/docker/README.md b/docker/README.md index 20d5cce91..b21d18ce8 100644 --- a/docker/README.md +++ b/docker/README.md @@ -36,7 +36,8 @@ Welcome to the new `docker` directory for deploying Dify using Docker Compose. T - Navigate to the `docker` directory. - Ensure the `middleware.env` file is created by running `cp middleware.env.example middleware.env` (refer to the `middleware.env.example` file). 2. **Running Middleware Services**: - - Execute `docker-compose -f docker-compose.middleware.yaml up --env-file middleware.env -d` to start the middleware services. + - Navigate to the `docker` directory. + - Execute `docker compose -f docker-compose.middleware.yaml --profile weaviate -p dify up -d` to start the middleware services. (Change the profile to other vector database if you are not using weaviate) ### Migration for Existing Users diff --git a/docker/docker-compose-template.yaml b/docker/docker-compose-template.yaml index 520cb948c..d7bcff7ed 100644 --- a/docker/docker-compose-template.yaml +++ b/docker/docker-compose-template.yaml @@ -2,7 +2,7 @@ x-shared-env: &shared-api-worker-env services: # API service api: - image: langgenius/dify-api:1.1.3 + image: langgenius/dify-api:1.2.0 restart: always environment: # Use the shared environment variables. @@ -29,7 +29,7 @@ services: # worker service # The Celery worker for processing the queue. worker: - image: langgenius/dify-api:1.1.3 + image: langgenius/dify-api:1.2.0 restart: always environment: # Use the shared environment variables. @@ -53,7 +53,7 @@ services: # Frontend web application. 
web: - image: langgenius/dify-web:1.1.3 + image: langgenius/dify-web:1.2.0 restart: always environment: CONSOLE_API_URL: ${CONSOLE_API_URL:-} @@ -70,6 +70,7 @@ services: LOOP_NODE_MAX_COUNT: ${LOOP_NODE_MAX_COUNT:-100} MAX_TOOLS_NUM: ${MAX_TOOLS_NUM:-10} MAX_PARALLEL_LIMIT: ${MAX_PARALLEL_LIMIT:-10} + MAX_ITERATIONS_NUM: ${MAX_ITERATIONS_NUM:-5} # The postgres database. db: @@ -133,7 +134,7 @@ services: # plugin daemon plugin_daemon: - image: langgenius/dify-plugin-daemon:0.0.6-local + image: langgenius/dify-plugin-daemon:0.0.7-local restart: always environment: # Use the shared environment variables. @@ -152,6 +153,23 @@ services: PYTHON_ENV_INIT_TIMEOUT: ${PLUGIN_PYTHON_ENV_INIT_TIMEOUT:-120} PLUGIN_MAX_EXECUTION_TIMEOUT: ${PLUGIN_MAX_EXECUTION_TIMEOUT:-600} PIP_MIRROR_URL: ${PIP_MIRROR_URL:-} + PLUGIN_STORAGE_TYPE: ${PLUGIN_STORAGE_TYPE:-local} + PLUGIN_STORAGE_LOCAL_ROOT: ${PLUGIN_STORAGE_LOCAL_ROOT:-/app/storage} + PLUGIN_INSTALLED_PATH: ${PLUGIN_INSTALLED_PATH:-plugin} + PLUGIN_PACKAGE_CACHE_PATH: ${PLUGIN_PACKAGE_CACHE_PATH:-plugin_packages} + PLUGIN_MEDIA_CACHE_PATH: ${PLUGIN_MEDIA_CACHE_PATH:-assets} + PLUGIN_STORAGE_OSS_BUCKET: ${PLUGIN_STORAGE_OSS_BUCKET:-} + S3_USE_AWS_MANAGED_IAM: ${PLUGIN_S3_USE_AWS_MANAGED_IAM:-} + S3_ENDPOINT: ${PLUGIN_S3_ENDPOINT:-} + S3_USE_PATH_STYLE: ${PLUGIN_S3_USE_PATH_STYLE:-} + AWS_ACCESS_KEY: ${PLUGIN_AWS_ACCESS_KEY:-} + AWS_SECRET_KEY: ${PLUGIN_AWS_SECRET_KEY:-} + AWS_REGION: ${PLUGIN_AWS_REGION:-} + AZURE_BLOB_STORAGE_CONNECTION_STRING: ${PLUGIN_AZURE_BLOB_STORAGE_CONNECTION_STRING:-} + AZURE_BLOB_STORAGE_CONTAINER_NAME: ${PLUGIN_AZURE_BLOB_STORAGE_CONTAINER_NAME:-} + TENCENT_COS_SECRET_KEY: ${PLUGIN_TENCENT_COS_SECRET_KEY:-} + TENCENT_COS_SECRET_ID: ${PLUGIN_TENCENT_COS_SECRET_ID:-} + TENCENT_COS_REGION: ${PLUGIN_TENCENT_COS_REGION:-} ports: - "${EXPOSE_PLUGIN_DEBUGGING_PORT:-5003}:${PLUGIN_DEBUGGING_PORT:-5003}" volumes: @@ -373,7 +391,8 @@ services: # OceanBase vector database oceanbase: - image:
quay.io/oceanbase/oceanbase-ce:4.3.3.0-100000142024101215 + image: oceanbase/oceanbase-ce:4.3.5.1-101000042025031818 + container_name: oceanbase profiles: - oceanbase restart: always @@ -386,7 +405,9 @@ services: OB_SYS_PASSWORD: ${OCEANBASE_VECTOR_PASSWORD:-difyai123456} OB_TENANT_PASSWORD: ${OCEANBASE_VECTOR_PASSWORD:-difyai123456} OB_CLUSTER_NAME: ${OCEANBASE_CLUSTER_NAME:-difyai} - OB_SERVER_IP: '127.0.0.1' + MODE: MINI + ports: + - "${OCEANBASE_VECTOR_PORT:-2881}:2881" # Oracle vector database oracle: diff --git a/docker/docker-compose.middleware.yaml b/docker/docker-compose.middleware.yaml index b4f772cc8..de238d966 100644 --- a/docker/docker-compose.middleware.yaml +++ b/docker/docker-compose.middleware.yaml @@ -29,6 +29,8 @@ services: redis: image: redis:6-alpine restart: always + env_file: + - ./middleware.env environment: REDISCLI_AUTH: ${REDIS_PASSWORD:-difyai123456} volumes: @@ -45,6 +47,8 @@ services: sandbox: image: langgenius/dify-sandbox:0.2.11 restart: always + env_file: + - ./middleware.env environment: # The DifySandbox configurations # Make sure you are changing this key for your deployment with a strong key. @@ -66,8 +70,10 @@ services: # plugin daemon plugin_daemon: - image: langgenius/dify-plugin-daemon:0.0.6-local + image: langgenius/dify-plugin-daemon:0.0.7-local restart: always + env_file: + - ./middleware.env environment: # Use the shared environment variables. 
DB_HOST: ${DB_HOST:-db} @@ -91,6 +97,23 @@ services: PYTHON_ENV_INIT_TIMEOUT: ${PLUGIN_PYTHON_ENV_INIT_TIMEOUT:-120} PLUGIN_MAX_EXECUTION_TIMEOUT: ${PLUGIN_MAX_EXECUTION_TIMEOUT:-600} PIP_MIRROR_URL: ${PIP_MIRROR_URL:-} + PLUGIN_STORAGE_TYPE: ${PLUGIN_STORAGE_TYPE:-local} + PLUGIN_STORAGE_LOCAL_ROOT: ${PLUGIN_STORAGE_LOCAL_ROOT:-/app/storage} + PLUGIN_INSTALLED_PATH: ${PLUGIN_INSTALLED_PATH:-plugin} + PLUGIN_PACKAGE_CACHE_PATH: ${PLUGIN_PACKAGE_CACHE_PATH:-plugin_packages} + PLUGIN_MEDIA_CACHE_PATH: ${PLUGIN_MEDIA_CACHE_PATH:-assets} + PLUGIN_STORAGE_OSS_BUCKET: ${PLUGIN_STORAGE_OSS_BUCKET:-} + S3_USE_AWS_MANAGED_IAM: ${PLUGIN_S3_USE_AWS_MANAGED_IAM:-} + S3_ENDPOINT: ${PLUGIN_S3_ENDPOINT:-} + S3_USE_PATH_STYLE: ${PLUGIN_S3_USE_PATH_STYLE:-} + AWS_ACCESS_KEY: ${PLUGIN_AWS_ACCESS_KEY:-} + AWS_SECRET_KEY: ${PLUGIN_AWS_SECRET_KEY:-} + AWS_REGION: ${PLUGIN_AWS_REGION:-} + AZURE_BLOB_STORAGE_CONNECTION_STRING: ${PLUGIN_AZURE_BLOB_STORAGE_CONNECTION_STRING:-} + AZURE_BLOB_STORAGE_CONTAINER_NAME: ${PLUGIN_AZURE_BLOB_STORAGE_CONTAINER_NAME:-} + TENCENT_COS_SECRET_KEY: ${PLUGIN_TENCENT_COS_SECRET_KEY:-} + TENCENT_COS_SECRET_ID: ${PLUGIN_TENCENT_COS_SECRET_ID:-} + TENCENT_COS_REGION: ${PLUGIN_TENCENT_COS_REGION:-} ports: - "${EXPOSE_PLUGIN_DAEMON_PORT:-5002}:${PLUGIN_DAEMON_PORT:-5002}" - "${EXPOSE_PLUGIN_DEBUGGING_PORT:-5003}:${PLUGIN_DEBUGGING_PORT:-5003}" @@ -107,6 +130,8 @@ services: - ./ssrf_proxy/squid.conf.template:/etc/squid/squid.conf.template - ./ssrf_proxy/docker-entrypoint.sh:/docker-entrypoint-mount.sh entrypoint: [ "sh", "-c", "cp /docker-entrypoint-mount.sh /docker-entrypoint.sh && sed -i 's/\r$$//' /docker-entrypoint.sh && chmod +x /docker-entrypoint.sh && /docker-entrypoint.sh" ] + env_file: + - ./middleware.env environment: # pls clearly modify the squid env vars to fit your network environment.
HTTP_PORT: ${SSRF_HTTP_PORT:-3128} diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index 4a40cb1ae..f120e19be 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -223,6 +223,7 @@ x-shared-env: &shared-api-worker-env TENCENT_VECTOR_DB_DATABASE: ${TENCENT_VECTOR_DB_DATABASE:-dify} TENCENT_VECTOR_DB_SHARD: ${TENCENT_VECTOR_DB_SHARD:-1} TENCENT_VECTOR_DB_REPLICAS: ${TENCENT_VECTOR_DB_REPLICAS:-2} + TENCENT_VECTOR_DB_ENABLE_HYBRID_SEARCH: ${TENCENT_VECTOR_DB_ENABLE_HYBRID_SEARCH:-false} ELASTICSEARCH_HOST: ${ELASTICSEARCH_HOST:-0.0.0.0} ELASTICSEARCH_PORT: ${ELASTICSEARCH_PORT:-9200} ELASTICSEARCH_USERNAME: ${ELASTICSEARCH_USERNAME:-elastic} @@ -252,6 +253,7 @@ x-shared-env: &shared-api-worker-env OCEANBASE_VECTOR_DATABASE: ${OCEANBASE_VECTOR_DATABASE:-test} OCEANBASE_CLUSTER_NAME: ${OCEANBASE_CLUSTER_NAME:-difyai} OCEANBASE_MEMORY_LIMIT: ${OCEANBASE_MEMORY_LIMIT:-6G} + OCEANBASE_ENABLE_HYBRID_SEARCH: ${OCEANBASE_ENABLE_HYBRID_SEARCH:-false} OPENGAUSS_HOST: ${OPENGAUSS_HOST:-opengauss} OPENGAUSS_PORT: ${OPENGAUSS_PORT:-6600} OPENGAUSS_USER: ${OPENGAUSS_USER:-postgres} @@ -262,6 +264,10 @@ x-shared-env: &shared-api-worker-env OPENGAUSS_ENABLE_PQ: ${OPENGAUSS_ENABLE_PQ:-false} UPSTASH_VECTOR_URL: ${UPSTASH_VECTOR_URL:-https://xxx-vector.upstash.io} UPSTASH_VECTOR_TOKEN: ${UPSTASH_VECTOR_TOKEN:-dify} + TABLESTORE_ENDPOINT: ${TABLESTORE_ENDPOINT:-https://instance-name.cn-hangzhou.ots.aliyuncs.com} + TABLESTORE_INSTANCE_NAME: ${TABLESTORE_INSTANCE_NAME:-instance-name} + TABLESTORE_ACCESS_KEY_ID: ${TABLESTORE_ACCESS_KEY_ID:-xxx} + TABLESTORE_ACCESS_KEY_SECRET: ${TABLESTORE_ACCESS_KEY_SECRET:-xxx} UPLOAD_FILE_SIZE_LIMIT: ${UPLOAD_FILE_SIZE_LIMIT:-15} UPLOAD_FILE_BATCH_LIMIT: ${UPLOAD_FILE_BATCH_LIMIT:-5} ETL_TYPE: ${ETL_TYPE:-dify} @@ -324,6 +330,7 @@ x-shared-env: &shared-api-worker-env LOOP_NODE_MAX_COUNT: ${LOOP_NODE_MAX_COUNT:-100} MAX_TOOLS_NUM: ${MAX_TOOLS_NUM:-10} MAX_PARALLEL_LIMIT: ${MAX_PARALLEL_LIMIT:-10} + 
MAX_ITERATIONS_NUM: ${MAX_ITERATIONS_NUM:-5} TEXT_GENERATION_TIMEOUT_MS: ${TEXT_GENERATION_TIMEOUT_MS:-60000} PGUSER: ${PGUSER:-${DB_USERNAME}} POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-${DB_PASSWORD}} @@ -429,11 +436,29 @@ x-shared-env: &shared-api-worker-env PLUGIN_PYTHON_ENV_INIT_TIMEOUT: ${PLUGIN_PYTHON_ENV_INIT_TIMEOUT:-120} PLUGIN_MAX_EXECUTION_TIMEOUT: ${PLUGIN_MAX_EXECUTION_TIMEOUT:-600} PIP_MIRROR_URL: ${PIP_MIRROR_URL:-} + PLUGIN_STORAGE_TYPE: ${PLUGIN_STORAGE_TYPE:-local} + PLUGIN_STORAGE_LOCAL_ROOT: ${PLUGIN_STORAGE_LOCAL_ROOT:-/app/storage} + PLUGIN_WORKING_PATH: ${PLUGIN_WORKING_PATH:-/app/storage/cwd} + PLUGIN_INSTALLED_PATH: ${PLUGIN_INSTALLED_PATH:-plugin} + PLUGIN_PACKAGE_CACHE_PATH: ${PLUGIN_PACKAGE_CACHE_PATH:-plugin_packages} + PLUGIN_MEDIA_CACHE_PATH: ${PLUGIN_MEDIA_CACHE_PATH:-assets} + PLUGIN_STORAGE_OSS_BUCKET: ${PLUGIN_STORAGE_OSS_BUCKET:-} + PLUGIN_S3_USE_AWS_MANAGED_IAM: ${PLUGIN_S3_USE_AWS_MANAGED_IAM:-} + PLUGIN_S3_ENDPOINT: ${PLUGIN_S3_ENDPOINT:-} + PLUGIN_S3_USE_PATH_STYLE: ${PLUGIN_S3_USE_PATH_STYLE:-} + PLUGIN_AWS_ACCESS_KEY: ${PLUGIN_AWS_ACCESS_KEY:-} + PLUGIN_AWS_SECRET_KEY: ${PLUGIN_AWS_SECRET_KEY:-} + PLUGIN_AWS_REGION: ${PLUGIN_AWS_REGION:-} + PLUGIN_AZURE_BLOB_STORAGE_CONTAINER_NAME: ${PLUGIN_AZURE_BLOB_STORAGE_CONTAINER_NAME:-} + PLUGIN_AZURE_BLOB_STORAGE_CONNECTION_STRING: ${PLUGIN_AZURE_BLOB_STORAGE_CONNECTION_STRING:-} + PLUGIN_TENCENT_COS_SECRET_KEY: ${PLUGIN_TENCENT_COS_SECRET_KEY:-} + PLUGIN_TENCENT_COS_SECRET_ID: ${PLUGIN_TENCENT_COS_SECRET_ID:-} + PLUGIN_TENCENT_COS_REGION: ${PLUGIN_TENCENT_COS_REGION:-} services: # API service api: - image: langgenius/dify-api:1.1.3 + image: langgenius/dify-api:1.2.0 restart: always environment: # Use the shared environment variables. @@ -460,7 +485,7 @@ services: # worker service # The Celery worker for processing the queue. worker: - image: langgenius/dify-api:1.1.3 + image: langgenius/dify-api:1.2.0 restart: always environment: # Use the shared environment variables. 
@@ -484,7 +509,7 @@ services: # Frontend web application. web: - image: langgenius/dify-web:1.1.3 + image: langgenius/dify-web:1.2.0 restart: always environment: CONSOLE_API_URL: ${CONSOLE_API_URL:-} @@ -501,6 +526,7 @@ services: LOOP_NODE_MAX_COUNT: ${LOOP_NODE_MAX_COUNT:-100} MAX_TOOLS_NUM: ${MAX_TOOLS_NUM:-10} MAX_PARALLEL_LIMIT: ${MAX_PARALLEL_LIMIT:-10} + MAX_ITERATIONS_NUM: ${MAX_ITERATIONS_NUM:-5} # The postgres database. db: @@ -564,7 +590,7 @@ services: # plugin daemon plugin_daemon: - image: langgenius/dify-plugin-daemon:0.0.6-local + image: langgenius/dify-plugin-daemon:0.0.7-local restart: always environment: # Use the shared environment variables. @@ -583,6 +609,23 @@ services: PYTHON_ENV_INIT_TIMEOUT: ${PLUGIN_PYTHON_ENV_INIT_TIMEOUT:-120} PLUGIN_MAX_EXECUTION_TIMEOUT: ${PLUGIN_MAX_EXECUTION_TIMEOUT:-600} PIP_MIRROR_URL: ${PIP_MIRROR_URL:-} + PLUGIN_STORAGE_TYPE: ${PLUGIN_STORAGE_TYPE:-local} + PLUGIN_STORAGE_LOCAL_ROOT: ${PLUGIN_STORAGE_LOCAL_ROOT:-/app/storage} + PLUGIN_INSTALLED_PATH: ${PLUGIN_INSTALLED_PATH:-plugin} + PLUGIN_PACKAGE_CACHE_PATH: ${PLUGIN_PACKAGE_CACHE_PATH:-plugin_packages} + PLUGIN_MEDIA_CACHE_PATH: ${PLUGIN_MEDIA_CACHE_PATH:-assets} + PLUGIN_STORAGE_OSS_BUCKET: ${PLUGIN_STORAGE_OSS_BUCKET:-} + S3_USE_AWS_MANAGED_IAM: ${PLUGIN_S3_USE_AWS_MANAGED_IAM:-} + S3_ENDPOINT: ${PLUGIN_S3_ENDPOINT:-} + S3_USE_PATH_STYLE: ${PLUGIN_S3_USE_PATH_STYLE:-} + AWS_ACCESS_KEY: ${PLUGIN_AWS_ACCESS_KEY:-} + AWS_SECRET_KEY: ${PLUGIN_AWS_SECRET_KEY:-} + AWS_REGION: ${PLUGIN_AWS_REGION:-} + AZURE_BLOB_STORAGE_CONNECTION_STRING: ${PLUGIN_AZURE_BLOB_STORAGE_CONNECTION_STRING:-} + AZURE_BLOB_STORAGE_CONTAINER_NAME: ${PLUGIN_AZURE_BLOB_STORAGE_CONTAINER_NAME:-} + TENCENT_COS_SECRET_KEY: ${PLUGIN_TENCENT_COS_SECRET_KEY:-} + TENCENT_COS_SECRET_ID: ${PLUGIN_TENCENT_COS_SECRET_ID:-} + TENCENT_COS_REGION: ${PLUGIN_TENCENT_COS_REGION:-} ports: - "${EXPOSE_PLUGIN_DEBUGGING_PORT:-5003}:${PLUGIN_DEBUGGING_PORT:-5003}" volumes: @@ -804,7 +847,8 @@ services: #
OceanBase vector database oceanbase: - image: quay.io/oceanbase/oceanbase-ce:4.3.3.0-100000142024101215 + image: oceanbase/oceanbase-ce:4.3.5.1-101000042025031818 + container_name: oceanbase profiles: - oceanbase restart: always @@ -817,7 +861,9 @@ services: OB_SYS_PASSWORD: ${OCEANBASE_VECTOR_PASSWORD:-difyai123456} OB_TENANT_PASSWORD: ${OCEANBASE_VECTOR_PASSWORD:-difyai123456} OB_CLUSTER_NAME: ${OCEANBASE_CLUSTER_NAME:-difyai} - OB_SERVER_IP: '127.0.0.1' + MODE: MINI + ports: + - "${OCEANBASE_VECTOR_PORT:-2881}:2881" # Oracle vector database oracle: diff --git a/docker/middleware.env.example b/docker/middleware.env.example index d01f9abe5..eb38526d5 100644 --- a/docker/middleware.env.example +++ b/docker/middleware.env.example @@ -119,4 +119,29 @@ FORCE_VERIFYING_SIGNATURE=true PLUGIN_PYTHON_ENV_INIT_TIMEOUT=120 PLUGIN_MAX_EXECUTION_TIMEOUT=600 # PIP_MIRROR_URL=https://pypi.tuna.tsinghua.edu.cn/simple -PIP_MIRROR_URL= \ No newline at end of file +PIP_MIRROR_URL= + +# https://github.com/langgenius/dify-plugin-daemon/blob/main/.env.example +# Plugin storage type, local aws_s3 tencent_cos azure_blob +PLUGIN_STORAGE_TYPE=local +PLUGIN_STORAGE_LOCAL_ROOT=/app/storage +PLUGIN_WORKING_PATH=/app/storage/cwd +PLUGIN_INSTALLED_PATH=plugin +PLUGIN_PACKAGE_CACHE_PATH=plugin_packages +PLUGIN_MEDIA_CACHE_PATH=assets +# Plugin oss bucket +PLUGIN_STORAGE_OSS_BUCKET= +# Plugin oss s3 credentials +PLUGIN_S3_USE_AWS_MANAGED_IAM= +PLUGIN_S3_ENDPOINT= +PLUGIN_S3_USE_PATH_STYLE= +PLUGIN_AWS_ACCESS_KEY= +PLUGIN_AWS_SECRET_KEY= +PLUGIN_AWS_REGION= +# Plugin oss azure blob +PLUGIN_AZURE_BLOB_STORAGE_CONTAINER_NAME= +PLUGIN_AZURE_BLOB_STORAGE_CONNECTION_STRING= +# Plugin oss tencent cos +PLUGIN_TENCENT_COS_SECRET_KEY= +PLUGIN_TENCENT_COS_SECRET_ID= +PLUGIN_TENCENT_COS_REGION= \ No newline at end of file diff --git a/docker/ssrf_proxy/squid.conf.template b/docker/ssrf_proxy/squid.conf.template index 7e85e84a6..c74c1fb67 100644 --- a/docker/ssrf_proxy/squid.conf.template +++ 
b/docker/ssrf_proxy/squid.conf.template @@ -51,3 +51,6 @@ http_port ${REVERSE_PROXY_PORT} accel vhost cache_peer ${SANDBOX_HOST} parent ${SANDBOX_PORT} 0 no-query originserver acl src_all src all http_access allow src_all + +# Unless the option's size is increased, an error will occur when uploading more than two files. +client_request_buffer_max_size 100 MB \ No newline at end of file diff --git a/images/wechat.png b/images/wechat.png deleted file mode 100644 index 3cc6077ed..000000000 Binary files a/images/wechat.png and /dev/null differ diff --git a/web/.env.example b/web/.env.example index d0e88aa94..386fb8d12 100644 --- a/web/.env.example +++ b/web/.env.example @@ -47,6 +47,9 @@ NEXT_PUBLIC_MAX_TOOLS_NUM=10 # Maximum number of Parallelism branches in the workflow NEXT_PUBLIC_MAX_PARALLEL_LIMIT=10 +# The maximum number of iterations for agent setting +NEXT_PUBLIC_MAX_ITERATIONS_NUM=5 + # Default Domain Extend(二开新增配置) NEXT_PUBLIC_DEFAULT_DOMAIN= diff --git a/web/.vscode/extensions.json b/web/.vscode/extensions.json index d7680d74a..a9afbcc64 100644 --- a/web/.vscode/extensions.json +++ b/web/.vscode/extensions.json @@ -1,6 +1,7 @@ { "recommendations": [ "bradlc.vscode-tailwindcss", - "firsttris.vscode-jest-runner" + "firsttris.vscode-jest-runner", + "kisstkondoros.vscode-codemetrics" ] -} +} \ No newline at end of file diff --git a/web/Dockerfile b/web/Dockerfile index f44d91fdf..d3b6e9220 100644 --- a/web/Dockerfile +++ b/web/Dockerfile @@ -1,12 +1,12 @@ # base image -FROM node:20-alpine3.20 AS base +FROM node:22-alpine3.21 AS base LABEL maintainer="takatost@gmail.com" # if you located in China, you can use aliyun mirror to speed up # RUN sed -i 's/dl-cdn.alpinelinux.org/mirrors.aliyun.com/g' /etc/apk/repositories RUN apk add --no-cache tzdata -RUN npm install -g pnpm@9.12.2 +RUN npm install -g pnpm@10.8.0 ENV PNPM_HOME="/pnpm" ENV PATH="$PNPM_HOME:$PATH" @@ -42,8 +42,8 @@ ENV EDITION=SELF_HOSTED ENV DEPLOY_ENV=PRODUCTION ENV 
CONSOLE_API_URL=http://127.0.0.1:5001 ENV APP_API_URL=http://127.0.0.1:5001 -ENV MARKETPLACE_API_URL=http://127.0.0.1:5001 -ENV MARKETPLACE_URL=http://127.0.0.1:5001 +ENV MARKETPLACE_API_URL=https://marketplace.dify.ai +ENV MARKETPLACE_URL=https://marketplace.dify.ai ENV PORT=3000 ENV NEXT_TELEMETRY_DISABLED=1 ENV PM2_INSTANCES=2 diff --git a/web/README.md b/web/README.md index 900924f34..3236347e8 100644 --- a/web/README.md +++ b/web/README.md @@ -6,7 +6,9 @@ This is a [Next.js](https://nextjs.org/) project bootstrapped with [`create-next ### Run by source code -To start the web frontend service, you will need [Node.js v18.x (LTS)](https://nodejs.org/en) and [pnpm version 9.12.2](https://pnpm.io). +Before starting the web frontend service, please make sure the following environment is ready. +- [Node.js](https://nodejs.org) >= v18.x +- [pnpm](https://pnpm.io) v10.x First, install the dependencies: diff --git a/web/app/(commonLayout)/apps/Apps.tsx b/web/app/(commonLayout)/apps/Apps.tsx index 2bc02ba34..891007efa 100644 --- a/web/app/(commonLayout)/apps/Apps.tsx +++ b/web/app/(commonLayout)/apps/Apps.tsx @@ -1,7 +1,9 @@ 'use client' import { useCallback, useEffect, useRef, useState } from 'react' -import { useRouter } from 'next/navigation' +import { + useRouter, +} from 'next/navigation' import useSWRInfinite from 'swr/infinite' import { useTranslation } from 'react-i18next' import { useDebounceFn } from 'ahooks' diff --git a/web/app/(commonLayout)/apps/page.tsx b/web/app/(commonLayout)/apps/page.tsx index 85fe43344..4a146d9b6 100644 --- a/web/app/(commonLayout)/apps/page.tsx +++ b/web/app/(commonLayout)/apps/page.tsx @@ -7,9 +7,12 @@ import style from '../list.module.css' import Apps from './Apps' import AppContext from '@/context/app-context' import { LicenseStatus } from '@/types/feature' +import { useEducationInit } from '@/app/education-apply/hooks' const AppList = () => { const { t } = useTranslation() + useEducationInit() + const systemFeatures = 
useContextSelector(AppContext, v => v.systemFeatures) return ( diff --git a/web/app/(commonLayout)/datasets/Container.tsx b/web/app/(commonLayout)/datasets/Container.tsx index c1d9950bf..95f8baea1 100644 --- a/web/app/(commonLayout)/datasets/Container.tsx +++ b/web/app/(commonLayout)/datasets/Container.tsx @@ -38,6 +38,8 @@ const Container = () => { const { showExternalApiPanel, setShowExternalApiPanel } = useExternalApiPanel() const [includeAll, { toggle: toggleIncludeAll }] = useBoolean(false) + document.title = `${t('dataset.knowledge')} - Dify` + const options = useMemo(() => { return [ { value: 'dataset', text: t('dataset.datasets') }, diff --git a/web/app/(commonLayout)/datasets/Doc.tsx b/web/app/(commonLayout)/datasets/Doc.tsx index 00ab967dc..57d4b8dfe 100644 --- a/web/app/(commonLayout)/datasets/Doc.tsx +++ b/web/app/(commonLayout)/datasets/Doc.tsx @@ -6,6 +6,7 @@ import { useTranslation } from 'react-i18next' import { RiListUnordered } from '@remixicon/react' import TemplateEn from './template/template.en.mdx' import TemplateZh from './template/template.zh.mdx' +import TemplateJa from './template/template.ja.mdx' import I18n from '@/context/i18n' import { LanguagesSupported } from '@/i18n/language' @@ -106,10 +107,16 @@ const Doc = ({ apiBaseUrl }: DocProps) => { )}
- {locale !== LanguagesSupported[1] - ? - : - } + {(() => { + switch (locale) { + case LanguagesSupported[1]: + return + case LanguagesSupported[7]: + return + default: + return + } + })()}
) diff --git a/web/app/(commonLayout)/datasets/page.tsx b/web/app/(commonLayout)/datasets/page.tsx index 096a1b897..678de47c9 100644 --- a/web/app/(commonLayout)/datasets/page.tsx +++ b/web/app/(commonLayout)/datasets/page.tsx @@ -4,8 +4,4 @@ const AppList = async () => { return } -export const metadata = { - title: 'Datasets - Dify', -} - export default AppList diff --git a/web/app/(commonLayout)/datasets/template/template.en.mdx b/web/app/(commonLayout)/datasets/template/template.en.mdx index d0e1d9fbc..862344659 100644 --- a/web/app/(commonLayout)/datasets/template/template.en.mdx +++ b/web/app/(commonLayout)/datasets/template/template.en.mdx @@ -439,6 +439,195 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
+ + + + ### Query + + + Knowledge Base ID + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request GET '${props.apiBaseUrl}/datasets/{dataset_id}' \ + --header 'Authorization: Bearer {api_key}' + ``` + + + ```json {{ title: 'Response' }} + { + "id": "eaedb485-95ac-4ffd-ab1e-18da6d676a2f", + "name": "Test Knowledge Base", + "description": "", + "provider": "vendor", + "permission": "only_me", + "data_source_type": null, + "indexing_technique": null, + "app_count": 0, + "document_count": 0, + "word_count": 0, + "created_by": "e99a1635-f725-4951-a99a-1daaaa76cfc6", + "created_at": 1735620612, + "updated_by": "e99a1635-f725-4951-a99a-1daaaa76cfc6", + "updated_at": 1735620612, + "embedding_model": null, + "embedding_model_provider": null, + "embedding_available": true, + "retrieval_model_dict": { + "search_method": "semantic_search", + "reranking_enable": false, + "reranking_mode": null, + "reranking_model": { + "reranking_provider_name": "", + "reranking_model_name": "" + }, + "weights": null, + "top_k": 2, + "score_threshold_enabled": false, + "score_threshold": null + }, + "tags": [], + "doc_form": null, + "external_knowledge_info": { + "external_knowledge_id": null, + "external_knowledge_api_id": null, + "external_knowledge_api_name": null, + "external_knowledge_api_endpoint": null + }, + "external_retrieval_model": { + "top_k": 2, + "score_threshold": 0.0, + "score_threshold_enabled": null + } + } + ``` + + + + +
+ + + + + ### Query + + + Knowledge Base ID + + + Index technique (optional) + - high_quality High quality + - economy Economy + + + Permission + - only_me Only me + - all_team_members All team members + - partial_members Partial members + + + Specified embedding model provider, must be set up in the system first, corresponding to the provider field(Optional) + + + Specified embedding model, corresponding to the model field(Optional) + + + Specified retrieval model, corresponding to the model field(Optional) + + + Partial member list(Optional) + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request POST '${props.apiBaseUrl}/datasets/{dataset_id}' \ + --header 'Authorization: Bearer {api_key}' \ + --header 'Content-Type: application/json' \ + --data-raw '{"name": "Test Knowledge Base", "indexing_technique": "high_quality", "permission": "only_me", + "embedding_model_provider": "zhipuai", "embedding_model": "embedding-3", "retrieval_model": "", "partial_member_list": []}' + ``` + + + ```json {{ title: 'Response' }} + { + "id": "eaedb485-95ac-4ffd-ab1e-18da6d676a2f", + "name": "Test Knowledge Base", + "description": "", + "provider": "vendor", + "permission": "only_me", + "data_source_type": null, + "indexing_technique": "high_quality", + "app_count": 0, + "document_count": 0, + "word_count": 0, + "created_by": "e99a1635-f725-4951-a99a-1daaaa76cfc6", + "created_at": 1735620612, + "updated_by": "e99a1635-f725-4951-a99a-1daaaa76cfc6", + "updated_at": 1735622679, + "embedding_model": "embedding-3", + "embedding_model_provider": "zhipuai", + "embedding_available": null, + "retrieval_model_dict": { + "search_method": "semantic_search", + "reranking_enable": false, + "reranking_mode": null, + "reranking_model": { + "reranking_provider_name": "", + "reranking_model_name": "" + }, + "weights": null, + "top_k": 2, + "score_threshold_enabled": false, + "score_threshold": null + }, + "tags": [], + "doc_form": null, + "external_knowledge_info": { +
"external_knowledge_id": null, + "external_knowledge_api_id": null, + "external_knowledge_api_name": null, + "external_knowledge_api_endpoint": null + }, + "external_retrieval_model": { + "top_k": 2, + "score_threshold": 0.0, + "score_threshold_enabled": null + }, + "partial_member_list": [] + } + ``` + + + + +
+ + + + + ### Params + + + Knowledge ID + + + Document ID + + + Segment ID + + + + ### Request Body + + + Child chunk content + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request POST '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks' \ + --header 'Authorization: Bearer {api_key}' \ + --header 'Content-Type: application/json' \ + --data-raw '{ + "content": "Child chunk content" + }' + ``` + + + ```json {{ title: 'Response' }} + { + "data": { + "id": "", + "segment_id": "", + "content": "Child chunk content", + "word_count": 25, + "tokens": 0, + "index_node_id": "", + "index_node_hash": "", + "status": "completed", + "created_by": "", + "created_at": 1695312007, + "indexing_at": 1695312007, + "completed_at": 1695312007, + "error": null, + "stopped_at": null + } + } + ``` + + + + +
+ + + + + ### Params + + + Knowledge ID + + + Document ID + + + Segment ID + + + + ### Query + + + Search keyword (optional) + + + Page number (optional, default: 1) + + + Items per page (optional, default: 20, max: 100) + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks?page=1&limit=20' \ + --header 'Authorization: Bearer {api_key}' + ``` + + + ```json {{ title: 'Response' }} + { + "data": [{ + "id": "", + "segment_id": "", + "content": "Child chunk content", + "word_count": 25, + "tokens": 0, + "index_node_id": "", + "index_node_hash": "", + "status": "completed", + "created_by": "", + "created_at": 1695312007, + "indexing_at": 1695312007, + "completed_at": 1695312007, + "error": null, + "stopped_at": null + }], + "total": 1, + "total_pages": 1, + "page": 1, + "limit": 20 + } + ``` + + + + +
+ + + + + ### Params + + + Knowledge ID + + + Document ID + + + Segment ID + + + Child Chunk ID + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request DELETE '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks/{child_chunk_id}' \ + --header 'Authorization: Bearer {api_key}' + ``` + + + ```json {{ title: 'Response' }} + { + "result": "success" + } + ``` + + + + +
+ + + + + ### Params + + + Knowledge ID + + + Document ID + + + Segment ID + + + Child Chunk ID + + + + ### Request Body + + + Child chunk content + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request PATCH '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks/{child_chunk_id}' \ + --header 'Authorization: Bearer {api_key}' \ + --header 'Content-Type: application/json' \ + --data-raw '{ + "content": "Updated child chunk content" + }' + ``` + + + ```json {{ title: 'Response' }} + { + "data": { + "id": "", + "segment_id": "", + "content": "Updated child chunk content", + "word_count": 25, + "tokens": 0, + "index_node_id": "", + "index_node_hash": "", + "status": "completed", + "created_by": "", + "created_at": 1695312007, + "indexing_at": 1695312007, + "completed_at": 1695312007, + "error": null, + "stopped_at": null + } + } + ``` + + + + +
+ ```bash {{ title: 'cURL' }} ``` @@ -1600,6 +2059,110 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi +
+ + + + + ### Query + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request GET '${props.apiBaseUrl}/workspaces/current/models/model-types/text-embedding' \ + --header 'Authorization: Bearer {api_key}' \ + --header 'Content-Type: application/json' + ``` + + + ```json {{ title: 'Response' }} + { + "data": [ + { + "provider": "zhipuai", + "label": { + "zh_Hans": "智谱 AI", + "en_US": "ZHIPU AI" + }, + "icon_small": { + "zh_Hans": "http://127.0.0.1:5001/console/api/workspaces/current/model-providers/zhipuai/icon_small/zh_Hans", + "en_US": "http://127.0.0.1:5001/console/api/workspaces/current/model-providers/zhipuai/icon_small/en_US" + }, + "icon_large": { + "zh_Hans": "http://127.0.0.1:5001/console/api/workspaces/current/model-providers/zhipuai/icon_large/zh_Hans", + "en_US": "http://127.0.0.1:5001/console/api/workspaces/current/model-providers/zhipuai/icon_large/en_US" + }, + "status": "active", + "models": [ + { + "model": "embedding-3", + "label": { + "zh_Hans": "embedding-3", + "en_US": "embedding-3" + }, + "model_type": "text-embedding", + "features": null, + "fetch_from": "predefined-model", + "model_properties": { + "context_size": 8192 + }, + "deprecated": false, + "status": "active", + "load_balancing_enabled": false + }, + { + "model": "embedding-2", + "label": { + "zh_Hans": "embedding-2", + "en_US": "embedding-2" + }, + "model_type": "text-embedding", + "features": null, + "fetch_from": "predefined-model", + "model_properties": { + "context_size": 8192 + }, + "deprecated": false, + "status": "active", + "load_balancing_enabled": false + }, + { + "model": "text_embedding", + "label": { + "zh_Hans": "text_embedding", + "en_US": "text_embedding" + }, + "model_type": "text-embedding", + "features": null, + "fetch_from": "predefined-model", + "model_properties": { + "context_size": 512 + }, + "deprecated": false, + "status": "active", + "load_balancing_enabled": false + } + ] + } + ] + } + ``` + + + +
@@ -1704,4 +2267,4 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi -
+
\ No newline at end of file diff --git a/web/app/(commonLayout)/datasets/template/template.ja.mdx b/web/app/(commonLayout)/datasets/template/template.ja.mdx new file mode 100644 index 000000000..e2bdd27d8 --- /dev/null +++ b/web/app/(commonLayout)/datasets/template/template.ja.mdx @@ -0,0 +1,1977 @@ +import { CodeGroup } from '@/app/components/develop/code.tsx' +import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstruction, Paragraph } from '@/app/components/develop/md.tsx' + +# ナレッジ API + +
+ ### 認証 + + Dify のサービス API は `API-Key` を使用して認証します。 + + 開発者は、`API-Key` をクライアント側で共有または保存するのではなく、バックエンドに保存することを推奨します。これにより、`API-Key` の漏洩による財産損失を防ぐことができます。 + + すべての API リクエストには、以下のように **`Authorization`** HTTP ヘッダーに `API-Key` を含める必要があります: + + + ```javascript + Authorization: Bearer {API_KEY} + + ``` + +
+ +
+ + + + + この API は既存のナレッジに基づいており、このナレッジを基にテキストを使用して新しいドキュメントを作成します。 + + ### パラメータ + + + ナレッジ ID + + + + ### リクエストボディ + + + ドキュメント名 + + + ドキュメント内容 + + + インデックスモード + - high_quality 高品質: 埋め込みモデルを使用してベクトルデータベースインデックスを構築 + - economy 経済: キーワードテーブルインデックスの反転インデックスを構築 + + + インデックス化された内容の形式 + - text_model テキストドキュメントは直接埋め込まれます; `economy` モードではこの形式がデフォルト + - hierarchical_model 親子モード + - qa_model Q&A モード: 分割されたドキュメントの質問と回答ペアを生成し、質問を埋め込みます + + + Q&A モードでは、ドキュメントの言語を指定します。例: English, Chinese + + + 処理ルール + - mode (string) クリーニング、セグメンテーションモード、自動 / カスタム + - rules (object) カスタムルール (自動モードでは、このフィールドは空) + - pre_processing_rules (array[object]) 前処理ルール + - id (string) 前処理ルールの一意識別子 + - 列挙 + - remove_extra_spaces 連続するスペース、改行、タブを置換 + - remove_urls_emails URL、メールアドレスを削除 + - enabled (bool) このルールを選択するかどうか。ドキュメント ID が渡されない場合、デフォルト値を表します。 + - segmentation (object) セグメンテーションルール + - separator カスタムセグメント識別子。現在は 1 つの区切り文字のみ設定可能。デフォルトは \n + - max_tokens 最大長 (トークン) デフォルトは 1000 + - parent_mode 親チャンクの検索モード: full-doc 全文検索 / paragraph 段落検索 + - subchunk_segmentation (object) 子チャンクルール + - separator セグメンテーション識別子。現在は 1 つの区切り文字のみ許可。デフォルトは *** + - max_tokens 最大長 (トークン) は親チャンクの長さより短いことを検証する必要があります + - chunk_overlap 隣接するチャンク間の重複を定義 (オプション) + + ナレッジベースにパラメータが設定されていない場合、最初のアップロードには以下のパラメータを提供する必要があります。提供されない場合、デフォルトパラメータが使用されます。 + + 検索モデル + - search_method (string) 検索方法 + - hybrid_search ハイブリッド検索 + - semantic_search セマンティック検索 + - full_text_search 全文検索 + - reranking_enable (bool) 再ランキングを有効にするかどうか + - reranking_mode (object) 再ランキングモデル構成 + - reranking_provider_name (string) 再ランキングモデルプロバイダー + - reranking_model_name (string) 再ランキングモデル名 + - top_k (int) 返される結果の数 + - score_threshold_enabled (bool) スコア閾値を有効にするかどうか + - score_threshold (float) スコア閾値 + + + 埋め込みモデル名 + + + 埋め込みモデルプロバイダー + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request POST '${props.apiBaseUrl}/datasets/{dataset_id}/document/create-by-text' \ + --header 'Authorization: Bearer {api_key}' \ + --header 'Content-Type: application/json' \ + --data-raw 
'{ + "name": "text", + "text": "text", + "indexing_technique": "high_quality", + "process_rule": { + "mode": "automatic" + } + }' + ``` + + + ```json {{ title: 'Response' }} + { + "document": { + "id": "", + "position": 1, + "data_source_type": "upload_file", + "data_source_info": { + "upload_file_id": "" + }, + "dataset_process_rule_id": "", + "name": "text.txt", + "created_from": "api", + "created_by": "", + "created_at": 1695690280, + "tokens": 0, + "indexing_status": "waiting", + "error": null, + "enabled": true, + "disabled_at": null, + "disabled_by": null, + "archived": false, + "display_status": "queuing", + "word_count": 0, + "hit_count": 0, + "doc_form": "text_model" + }, + "batch": "" + } + ``` + + + + +
+ + + + + この API は既存のナレッジに基づいており、このナレッジを基にファイルを使用して新しいドキュメントを作成します。 + + ### パラメータ + + + ナレッジ ID + + + + ### リクエストボディ + + + - original_document_id 元のドキュメント ID (オプション) + - ドキュメントを再アップロードまたはクリーニングとセグメンテーション構成を変更するために使用されます。欠落している情報は元のドキュメントからコピーされます。 + - 元のドキュメントはアーカイブされたドキュメントであってはなりません。 + - original_document_id が渡された場合、更新操作が実行されます。process_rule は入力可能な項目です。入力されない場合、元のドキュメントのセグメンテーション方法がデフォルトで使用されます。 + - original_document_id が渡されない場合、新しい操作が実行され、process_rule が必要です。 + + - indexing_technique インデックスモード + - high_quality 高品質: 埋め込みモデルを使用してベクトルデータベースインデックスを構築 + - economy 経済: キーワードテーブルインデックスの反転インデックスを構築 + + - doc_form インデックス化された内容の形式 + - text_model テキストドキュメントは直接埋め込まれます; `economy` モードではこの形式がデフォルト + - hierarchical_model 親子モード + - qa_model Q&A モード: 分割されたドキュメントの質問と回答ペアを生成し、質問を埋め込みます + + - doc_language Q&A モードでは、ドキュメントの言語を指定します。例: English, Chinese + + - process_rule 処理ルール + - mode (string) クリーニング、セグメンテーションモード、自動 / カスタム + - rules (object) カスタムルール (自動モードでは、このフィールドは空) + - pre_processing_rules (array[object]) 前処理ルール + - id (string) 前処理ルールの一意識別子 + - 列挙 + - remove_extra_spaces 連続するスペース、改行、タブを置換 + - remove_urls_emails URL、メールアドレスを削除 + - enabled (bool) このルールを選択するかどうか。ドキュメント ID が渡されない場合、デフォルト値を表します。 + - segmentation (object) セグメンテーションルール + - separator カスタムセグメント識別子。現在は 1 つの区切り文字のみ設定可能。デフォルトは \n + - max_tokens 最大長 (トークン) デフォルトは 1000 + - parent_mode 親チャンクの検索モード: full-doc 全文検索 / paragraph 段落検索 + - subchunk_segmentation (object) 子チャンクルール + - separator セグメンテーション識別子。現在は 1 つの区切り文字のみ許可。デフォルトは *** + - max_tokens 最大長 (トークン) は親チャンクの長さより短いことを検証する必要があります + - chunk_overlap 隣接するチャンク間の重複を定義 (オプション) + + + アップロードする必要があるファイル。 + + ナレッジベースにパラメータが設定されていない場合、最初のアップロードには以下のパラメータを提供する必要があります。提供されない場合、デフォルトパラメータが使用されます。 + + 検索モデル + - search_method (string) 検索方法 + - hybrid_search ハイブリッド検索 + - semantic_search セマンティック検索 + - full_text_search 全文検索 + - reranking_enable (bool) 再ランキングを有効にするかどうか + - reranking_mode (object) 再ランキングモデル構成 + - reranking_provider_name (string) 再ランキングモデルプロバイダー + - reranking_model_name (string) 
再ランキングモデル名 + - top_k (int) 返される結果の数 + - score_threshold_enabled (bool) スコア閾値を有効にするかどうか + - score_threshold (float) スコア閾値 + + + 埋め込みモデル名 + + + 埋め込みモデルプロバイダー + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request POST '${props.apiBaseUrl}/datasets/{dataset_id}/document/create-by-file' \ + --header 'Authorization: Bearer {api_key}' \ + --form 'data="{\"name\":\"Dify\",\"indexing_technique\":\"high_quality\",\"process_rule\":{\"rules\":{\"pre_processing_rules\":[{\"id\":\"remove_extra_spaces\",\"enabled\":true},{\"id\":\"remove_urls_emails\",\"enabled\":true}],\"segmentation\":{\"separator\":\"###\",\"max_tokens\":500}},\"mode\":\"custom\"}}";type=text/plain' \ + --form 'file=@"/path/to/file"' + ``` + + + ```json {{ title: 'Response' }} + { + "document": { + "id": "", + "position": 1, + "data_source_type": "upload_file", + "data_source_info": { + "upload_file_id": "" + }, + "dataset_process_rule_id": "", + "name": "Dify.txt", + "created_from": "api", + "created_by": "", + "created_at": 1695308667, + "tokens": 0, + "indexing_status": "waiting", + "error": null, + "enabled": true, + "disabled_at": null, + "disabled_by": null, + "archived": false, + "display_status": "queuing", + "word_count": 0, + "hit_count": 0, + "doc_form": "text_model" + }, + "batch": "" + } + ``` + + + + +
+ + + + + ### リクエストボディ + + + ナレッジ名 + + + ナレッジの説明 (オプション) + + + インデックス技術 (オプション) + - high_quality 高品質 + - economy 経済 + + + 権限 + - only_me 自分のみ + - all_team_members すべてのチームメンバー + - partial_members 一部のメンバー + + + プロバイダー (オプション、デフォルト: vendor) + - vendor ベンダー + - external 外部ナレッジ + + + 外部ナレッジ API ID (オプション) + + + 外部ナレッジ ID (オプション) + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request POST '${props.apiBaseUrl}/datasets' \ + --header 'Authorization: Bearer {api_key}' \ + --header 'Content-Type: application/json' \ + --data-raw '{ + "name": "name", + "permission": "only_me" + }' + ``` + + + ```json {{ title: 'Response' }} + { + "id": "", + "name": "name", + "description": null, + "provider": "vendor", + "permission": "only_me", + "data_source_type": null, + "indexing_technique": null, + "app_count": 0, + "document_count": 0, + "word_count": 0, + "created_by": "", + "created_at": 1695636173, + "updated_by": "", + "updated_at": 1695636173, + "embedding_model": null, + "embedding_model_provider": null, + "embedding_available": null + } + ``` + + + + +
+ + + + + ### クエリ + + + ページ番号 + + + 返されるアイテム数、デフォルトは 20、範囲は 1-100 + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request GET '${props.apiBaseUrl}/datasets?page=1&limit=20' \ + --header 'Authorization: Bearer {api_key}' + ``` + + + ```json {{ title: 'Response' }} + { + "data": [ + { + "id": "", + "name": "name", + "description": "desc", + "permission": "only_me", + "data_source_type": "upload_file", + "indexing_technique": "", + "app_count": 2, + "document_count": 10, + "word_count": 1200, + "created_by": "", + "created_at": "", + "updated_by": "", + "updated_at": "" + }, + ... + ], + "has_more": true, + "limit": 20, + "total": 50, + "page": 1 + } + ``` + + + + +
+ + + + + ### パラメータ + + + ナレッジ ID + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request DELETE '${props.apiBaseUrl}/datasets/{dataset_id}' \ + --header 'Authorization: Bearer {api_key}' + ``` + + + ```text {{ title: 'Response' }} + 204 No Content + ``` + + + + +
+ + + + + この API は既存のナレッジに基づいており、このナレッジを基にテキストを使用してドキュメントを更新します。 + + ### パラメータ + + + ナレッジ ID + + + ドキュメント ID + + + + ### リクエストボディ + + + ドキュメント名 (オプション) + + + ドキュメント内容 (オプション) + + + 処理ルール + - mode (string) クリーニング、セグメンテーションモード、自動 / カスタム + - rules (object) カスタムルール (自動モードでは、このフィールドは空) + - pre_processing_rules (array[object]) 前処理ルール + - id (string) 前処理ルールの一意識別子 + - 列挙 + - remove_extra_spaces 連続するスペース、改行、タブを置換 + - remove_urls_emails URL、メールアドレスを削除 + - enabled (bool) このルールを選択するかどうか。ドキュメント ID が渡されない場合、デフォルト値を表します。 + - segmentation (object) セグメンテーションルール + - separator カスタムセグメント識別子。現在は 1 つの区切り文字のみ設定可能。デフォルトは \n + - max_tokens 最大長 (トークン) デフォルトは 1000 + - parent_mode 親チャンクの検索モード: full-doc 全文検索 / paragraph 段落検索 + - subchunk_segmentation (object) 子チャンクルール + - separator セグメンテーション識別子。現在は 1 つの区切り文字のみ許可。デフォルトは *** + - max_tokens 最大長 (トークン) は親チャンクの長さより短いことを検証する必要があります + - chunk_overlap 隣接するチャンク間の重複を定義 (オプション) + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request POST '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/update-by-text' \ + --header 'Authorization: Bearer {api_key}' \ + --header 'Content-Type: application/json' \ + --data-raw '{ + "name": "name", + "text": "text" + }' + ``` + + + ```json {{ title: 'Response' }} + { + "document": { + "id": "", + "position": 1, + "data_source_type": "upload_file", + "data_source_info": { + "upload_file_id": "" + }, + "dataset_process_rule_id": "", + "name": "name.txt", + "created_from": "api", + "created_by": "", + "created_at": 1695308667, + "tokens": 0, + "indexing_status": "waiting", + "error": null, + "enabled": true, + "disabled_at": null, + "disabled_by": null, + "archived": false, + "display_status": "queuing", + "word_count": 0, + "hit_count": 0, + "doc_form": "text_model" + }, + "batch": "" + } + ``` + + + + +
+ + + + + この API は既存のナレッジに基づいており、このナレッジを基にファイルを使用してドキュメントを更新します。 + + ### パラメータ + + + ナレッジ ID + + + ドキュメント ID + + + + ### リクエストボディ + + + ドキュメント名 (オプション) + + + アップロードするファイル + + + 処理ルール + - mode (string) クリーニング、セグメンテーションモード、自動 / カスタム + - rules (object) カスタムルール (自動モードでは、このフィールドは空) + - pre_processing_rules (array[object]) 前処理ルール + - id (string) 前処理ルールの一意識別子 + - 列挙 + - remove_extra_spaces 連続するスペース、改行、タブを置換 + - remove_urls_emails URL、メールアドレスを削除 + - enabled (bool) このルールを選択するかどうか。ドキュメント ID が渡されない場合、デフォルト値を表します。 + - segmentation (object) セグメンテーションルール + - separator カスタムセグメント識別子。現在は 1 つの区切り文字のみ設定可能。デフォルトは \n + - max_tokens 最大長 (トークン) デフォルトは 1000 + - parent_mode 親チャンクの検索モード: full-doc 全文検索 / paragraph 段落検索 + - subchunk_segmentation (object) 子チャンクルール + - separator セグメンテーション識別子。現在は 1 つの区切り文字のみ許可。デフォルトは *** + - max_tokens 最大長 (トークン) は親チャンクの長さより短いことを検証する必要があります + - chunk_overlap 隣接するチャンク間の重複を定義 (オプション) + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request POST '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/update-by-file' \ + --header 'Authorization: Bearer {api_key}' \ + --form 'data="{\"name\":\"Dify\",\"indexing_technique\":\"high_quality\",\"process_rule\":{\"rules\":{\"pre_processing_rules\":[{\"id\":\"remove_extra_spaces\",\"enabled\":true},{\"id\":\"remove_urls_emails\",\"enabled\":true}],\"segmentation\":{\"separator\":\"###\",\"max_tokens\":500}},\"mode\":\"custom\"}}";type=text/plain' \ + --form 'file=@"/path/to/file"' + ``` + + + ```json {{ title: 'Response' }} + { + "document": { + "id": "", + "position": 1, + "data_source_type": "upload_file", + "data_source_info": { + "upload_file_id": "" + }, + "dataset_process_rule_id": "", + "name": "Dify.txt", + "created_from": "api", + "created_by": "", + "created_at": 1695308667, + "tokens": 0, + "indexing_status": "waiting", + "error": null, + "enabled": true, + "disabled_at": null, + "disabled_by": null, + "archived": false, + "display_status": "queuing", + "word_count": 0, + "hit_count": 0, + 
"doc_form": "text_model" + }, + "batch": "20230921150427533684" + } + ``` + + + + +
+ + + + + ### パラメータ + + + ナレッジ ID + + + アップロードされたドキュメントのバッチ番号 + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{batch}/indexing-status' \ + --header 'Authorization: Bearer {api_key}' + ``` + + + ```json {{ title: 'Response' }} + { + "data":[{ + "id": "", + "indexing_status": "indexing", + "processing_started_at": 1681623462.0, + "parsing_completed_at": 1681623462.0, + "cleaning_completed_at": 1681623462.0, + "splitting_completed_at": 1681623462.0, + "completed_at": null, + "paused_at": null, + "error": null, + "stopped_at": null, + "completed_segments": 24, + "total_segments": 100 + }] + } + ``` + + + + +
+ + + + + ### パラメータ + + + ナレッジ ID + + + ドキュメント ID + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request DELETE '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \ + --header 'Authorization: Bearer {api_key}' + ``` + + + ```json {{ title: 'Response' }} + { + "result": "success" + } + ``` + + + + +
+ + + + + ### パラメータ + + + ナレッジ ID + + + + ### クエリ + + + 検索キーワード、現在はドキュメント名のみ検索 (オプション) + + + ページ番号 (オプション) + + + 返されるアイテム数、デフォルトは 20、範囲は 1-100 (オプション) + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents' \ + --header 'Authorization: Bearer {api_key}' + ``` + + + ```json {{ title: 'Response' }} + { + "data": [ + { + "id": "", + "position": 1, + "data_source_type": "file_upload", + "data_source_info": null, + "dataset_process_rule_id": null, + "name": "dify", + "created_from": "", + "created_by": "", + "created_at": 1681623639, + "tokens": 0, + "indexing_status": "waiting", + "error": null, + "enabled": true, + "disabled_at": null, + "disabled_by": null, + "archived": false + } + ], + "has_more": false, + "limit": 20, + "total": 9, + "page": 1 + } + ``` + + + + +
+ + + + + ### パラメータ + + + ナレッジ ID + + + ドキュメント ID + + + + ### リクエストボディ + + + - content (text) テキスト内容 / 質問内容、必須 + - answer (text) 回答内容、ナレッジのモードが Q&A モードの場合に値を渡します (オプション) + - keywords (list) キーワード (オプション) + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request POST '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/segments' \ + --header 'Authorization: Bearer {api_key}' \ + --header 'Content-Type: application/json' \ + --data-raw '{ + "segments": [ + { + "content": "1", + "answer": "1", + "keywords": ["a"] + } + ] + }' + ``` + + + ```json {{ title: 'Response' }} + { + "data": [{ + "id": "", + "position": 1, + "document_id": "", + "content": "1", + "answer": "1", + "word_count": 25, + "tokens": 0, + "keywords": [ + "a" + ], + "index_node_id": "", + "index_node_hash": "", + "hit_count": 0, + "enabled": true, + "disabled_at": null, + "disabled_by": null, + "status": "completed", + "created_by": "", + "created_at": 1695312007, + "indexing_at": 1695312007, + "completed_at": 1695312007, + "error": null, + "stopped_at": null + }], + "doc_form": "text_model" + } + ``` + + + + +
+ + + + + ### パス + + + ナレッジ ID + + + ドキュメント ID + + + + ### クエリ + + + キーワード (オプション) + + + 検索ステータス、completed + + + ページ番号 (オプション) + + + 返されるアイテム数、デフォルトは 20、範囲は 1-100 (オプション) + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/segments' \ + --header 'Authorization: Bearer {api_key}' \ + --header 'Content-Type: application/json' + ``` + + + ```json {{ title: 'Response' }} + { + "data": [{ + "id": "", + "position": 1, + "document_id": "", + "content": "1", + "answer": "1", + "word_count": 25, + "tokens": 0, + "keywords": [ + "a" + ], + "index_node_id": "", + "index_node_hash": "", + "hit_count": 0, + "enabled": true, + "disabled_at": null, + "disabled_by": null, + "status": "completed", + "created_by": "", + "created_at": 1695312007, + "indexing_at": 1695312007, + "completed_at": 1695312007, + "error": null, + "stopped_at": null + }], + "doc_form": "text_model", + "has_more": false, + "limit": 20, + "total": 9, + "page": 1 + } + ``` + + + + +
+ + + + + ### パス + + + ナレッジ ID + + + ドキュメント ID + + + ドキュメントセグメント ID + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request DELETE '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}' \ + --header 'Authorization: Bearer {api_key}' \ + --header 'Content-Type: application/json' + ``` + + + ```json {{ title: 'Response' }} + { + "result": "success" + } + ``` + + + + +
+ + + + + ### パス + + + ナレッジ ID + + + ドキュメント ID + + + ドキュメントセグメント ID + + + + ### リクエストボディ + + + - content (text) テキスト内容 / 質問内容、必須 + - answer (text) 回答内容、ナレッジが Q&A モードの場合に値を渡します (オプション) + - keywords (list) キーワード (オプション) + - enabled (bool) false / true (オプション) + - regenerate_child_chunks (bool) 子チャンクを再生成するかどうか (オプション) + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request POST '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}' \ + --header 'Authorization: Bearer {api_key}' \ + --header 'Content-Type: application/json' \ + --data-raw '{ + "segment": { + "content": "1", + "answer": "1", + "keywords": ["a"], + "enabled": false + } + }' + ``` + + + ```json {{ title: 'Response' }} + { + "data": { + "id": "", + "position": 1, + "document_id": "", + "content": "1", + "answer": "1", + "word_count": 25, + "tokens": 0, + "keywords": [ + "a" + ], + "index_node_id": "", + "index_node_hash": "", + "hit_count": 0, + "enabled": true, + "disabled_at": null, + "disabled_by": null, + "status": "completed", + "created_by": "", + "created_at": 1695312007, + "indexing_at": 1695312007, + "completed_at": 1695312007, + "error": null, + "stopped_at": null + }, + "doc_form": "text_model" + } + ``` + + + + +
+ + + + + ### パラメータ + + + ナレッジ ID + + + ドキュメント ID + + + セグメント ID + + + + ### リクエストボディ + + + 子チャンクの内容 + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request POST '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks' \ + --header 'Authorization: Bearer {api_key}' \ + --header 'Content-Type: application/json' \ + --data-raw '{ + "content": "Child chunk content" + }' + ``` + + + ```json {{ title: 'Response' }} + { + "data": { + "id": "", + "segment_id": "", + "content": "Child chunk content", + "word_count": 25, + "tokens": 0, + "index_node_id": "", + "index_node_hash": "", + "status": "completed", + "created_by": "", + "created_at": 1695312007, + "indexing_at": 1695312007, + "completed_at": 1695312007, + "error": null, + "stopped_at": null + } + } + ``` + + + + +
+ + + + + ### パラメータ + + + ナレッジ ID + + + ドキュメント ID + + + セグメント ID + + + + ### クエリ + + + 検索キーワード (オプション) + + + ページ番号 (オプション、デフォルト: 1) + + + ページあたりのアイテム数 (オプション、デフォルト: 20、最大: 100) + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks?page=1&limit=20' \ + --header 'Authorization: Bearer {api_key}' + ``` + + + ```json {{ title: 'Response' }} + { + "data": [{ + "id": "", + "segment_id": "", + "content": "Child chunk content", + "word_count": 25, + "tokens": 0, + "index_node_id": "", + "index_node_hash": "", + "status": "completed", + "created_by": "", + "created_at": 1695312007, + "indexing_at": 1695312007, + "completed_at": 1695312007, + "error": null, + "stopped_at": null + }], + "total": 1, + "total_pages": 1, + "page": 1, + "limit": 20 + } + ``` + + + + +
+ + + + + ### パラメータ + + + ナレッジ ID + + + ドキュメント ID + + + セグメント ID + + + 子チャンク ID + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request DELETE '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks/{child_chunk_id}' \ + --header 'Authorization: Bearer {api_key}' + ``` + + + ```json {{ title: 'Response' }} + { + "result": "success" + } + ``` + + + + +
+ + + + + ### パラメータ + + + ナレッジ ID + + + ドキュメント ID + + + セグメント ID + + + 子チャンク ID + + + + ### リクエストボディ + + + 子チャンクの内容 + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request PATCH '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks/{child_chunk_id}' \ + --header 'Authorization: Bearer {api_key}' \ + --header 'Content-Type: application/json' \ + --data-raw '{ + "content": "Updated child chunk content" + }' + ``` + + + ```json {{ title: 'Response' }} + { + "data": { + "id": "", + "segment_id": "", + "content": "Updated child chunk content", + "word_count": 25, + "tokens": 0, + "index_node_id": "", + "index_node_hash": "", + "status": "completed", + "created_by": "", + "created_at": 1695312007, + "indexing_at": 1695312007, + "completed_at": 1695312007, + "error": null, + "stopped_at": null + } + } + ``` + + + + +
+ + + + + ### パス + + + ナレッジ ID + + + ドキュメント ID + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/upload-file' \ + --header 'Authorization: Bearer {api_key}' \ + --header 'Content-Type: application/json' + ``` + + + ```json {{ title: 'Response' }} + { + "id": "file_id", + "name": "file_name", + "size": 1024, + "extension": "txt", + "url": "preview_url", + "download_url": "download_url", + "mime_type": "text/plain", + "created_by": "user_id", + "created_at": 1728734540 + } + ``` + + + + +
+ + + + + ### パス + + + ナレッジ ID + + + + ### リクエストボディ + + + クエリキーワード + + + 検索モデル (オプション、入力されない場合はデフォルトの方法でリコールされます) + - search_method (text) 検索方法: 以下の 4 つのキーワードのいずれかが必要です + - keyword_search キーワード検索 + - semantic_search セマンティック検索 + - full_text_search 全文検索 + - hybrid_search ハイブリッド検索 + - reranking_enable (bool) 再ランキングを有効にするかどうか、検索モードが semantic_search または hybrid_search の場合に必須 (オプション) + - reranking_mode (object) 再ランキングモデル構成、再ランキングが有効な場合に必須 + - reranking_provider_name (string) 再ランキングモデルプロバイダー + - reranking_model_name (string) 再ランキングモデル名 + - weights (float) ハイブリッド検索モードでのセマンティック検索の重み設定 + - top_k (integer) 返される結果の数 (オプション) + - score_threshold_enabled (bool) スコア閾値を有効にするかどうか + - score_threshold (float) スコア閾値 + + + 未使用フィールド + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request POST '${props.apiBaseUrl}/datasets/{dataset_id}/retrieve' \ + --header 'Authorization: Bearer {api_key}' \ + --header 'Content-Type: application/json' \ + --data-raw '{ + "query": "test", + "retrieval_model": { + "search_method": "keyword_search", + "reranking_enable": false, + "reranking_mode": null, + "reranking_model": { + "reranking_provider_name": "", + "reranking_model_name": "" + }, + "weights": null, + "top_k": 2, + "score_threshold_enabled": false, + "score_threshold": null + } + }' + ``` + + + ```json {{ title: 'Response' }} + { + "query": { + "content": "test" + }, + "records": [ + { + "segment": { + "id": "7fa6f24f-8679-48b3-bc9d-bdf28d73f218", + "position": 1, + "document_id": "a8c6c36f-9f5d-4d7a-8472-f5d7b75d71d2", + "content": "Operation guide", + "answer": null, + "word_count": 847, + "tokens": 280, + "keywords": [ + "install", + "java", + "base", + "scripts", + "jdk", + "manual", + "internal", + "opens", + "add", + "vmoptions" + ], + "index_node_id": "39dd8443-d960-45a8-bb46-7275ad7fbc8e", + "index_node_hash": "0189157697b3c6a418ccf8264a09699f25858975578f3467c76d6bfc94df1d73", + "hit_count": 0, + "enabled": true, + "disabled_at": null, + "disabled_by": null, + "status": 
"completed", + "created_by": "dbcb1ab5-90c8-41a7-8b78-73b235eb6f6f", + "created_at": 1728734540, + "indexing_at": 1728734552, + "completed_at": 1728734584, + "error": null, + "stopped_at": null, + "document": { + "id": "a8c6c36f-9f5d-4d7a-8472-f5d7b75d71d2", + "data_source_type": "upload_file", + "name": "readme.txt", + } + }, + "score": 3.730463140527718e-05, + "tsne_position": null + } + ] + } + ``` + + + + +
+ + + + + ### パラメータ + + + ナレッジ ID + + + + ### リクエストボディ + + + - type (string) メタデータの種類、必須 + - name (string) メタデータの名前、必須 + + + + + + ```bash {{ title: 'cURL' }} + ``` + + + ```json {{ title: 'Response' }} + { + "id": "abc", + "type": "string", + "name": "test" + } + ``` + + + + +
+ + + + + ### パラメータ + + + ナレッジ ID + + + メタデータ ID + + + + ### リクエストボディ + + + - name (string) メタデータの名前、必須 + + + + + + ```bash {{ title: 'cURL' }} + ``` + + + ```json {{ title: 'Response' }} + { + "id": "abc", + "type": "string", + "name": "test" + } + ``` + + + + +
+ + + + + ### パラメータ + + + ナレッジ ID + + + メタデータ ID + + + + + + ```bash {{ title: 'cURL' }} + ``` + + + + +
+ + + + + ### パラメータ + + + ナレッジ ID + + + disable/enable + + + + + + ```bash {{ title: 'cURL' }} + ``` + + + + +
+ + + + + ### パラメータ + + + ナレッジ ID + + + + ### リクエストボディ + + + - document_id (string) ドキュメント ID + - metadata_list (list) メタデータリスト + - id (string) メタデータ ID + - value (string) メタデータの値 + - name (string) メタデータの名前 + + + + + + ```bash {{ title: 'cURL' }} + ``` + + + + +
+ + + + + ### パラメータ + + + ナレッジ ID + + + + + + ```bash {{ title: 'cURL' }} + ``` + + + ```json {{ title: 'Response' }} + { + "doc_metadata": [ + { + "id": "", + "name": "name", + "type": "string", + "use_count": 0 + }, + ... + ], + "built_in_field_enabled": true + } + ``` + + + + +
+ + + + ### エラーメッセージ + + + エラーコード + + + + + エラーステータス + + + + + エラーメッセージ + + + + + + ```json {{ title: 'Response' }} + { + "code": "no_file_uploaded", + "message": "Please upload your file.", + "status": 400 + } + ``` + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
codestatusmessage
no_file_uploaded400Please upload your file.
too_many_files400Only one file is allowed.
file_too_large413File size exceeded.
unsupported_file_type415File type not allowed.
high_quality_dataset_only400Current operation only supports 'high-quality' datasets.
dataset_not_initialized400The dataset is still being initialized or indexing. Please wait a moment.
archived_document_immutable403The archived document is not editable.
dataset_name_duplicate409The dataset name already exists. Please modify your dataset name.
invalid_action400Invalid action.
document_already_finished400The document has been processed. Please refresh the page or go to the document details.
document_indexing400The document is being processed and cannot be edited.
invalid_metadata400The metadata content is incorrect. Please check and verify.
+
diff --git a/web/app/(commonLayout)/datasets/template/template.zh.mdx b/web/app/(commonLayout)/datasets/template/template.zh.mdx index d4ed85233..1d4e5ead6 100644 --- a/web/app/(commonLayout)/datasets/template/template.zh.mdx +++ b/web/app/(commonLayout)/datasets/template/template.zh.mdx @@ -49,7 +49,8 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi 索引方式 - - high_quality 高质量:使用 embedding 模型进行嵌入,构建为向量数据库索引 + - high_quality 高质量:使用 embedding 模型进行嵌入,构建为向量数据库索引 - economy 经济:使用 keyword table index 的倒排索引进行构建 @@ -439,6 +440,195 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
+ + + + ### Query + + + 知识库 ID + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request GET '${props.apiBaseUrl}/datasets/{dataset_id}' \ + --header 'Authorization: Bearer {api_key}' + ``` + + + ```json {{ title: 'Response' }} + { + "id": "eaedb485-95ac-4ffd-ab1e-18da6d676a2f", + "name": "Test Knowledge Base", + "description": "", + "provider": "vendor", + "permission": "only_me", + "data_source_type": null, + "indexing_technique": null, + "app_count": 0, + "document_count": 0, + "word_count": 0, + "created_by": "e99a1635-f725-4951-a99a-1daaaa76cfc6", + "created_at": 1735620612, + "updated_by": "e99a1635-f725-4951-a99a-1daaaa76cfc6", + "updated_at": 1735620612, + "embedding_model": null, + "embedding_model_provider": null, + "embedding_available": true, + "retrieval_model_dict": { + "search_method": "semantic_search", + "reranking_enable": false, + "reranking_mode": null, + "reranking_model": { + "reranking_provider_name": "", + "reranking_model_name": "" + }, + "weights": null, + "top_k": 2, + "score_threshold_enabled": false, + "score_threshold": null + }, + "tags": [], + "doc_form": null, + "external_knowledge_info": { + "external_knowledge_id": null, + "external_knowledge_api_id": null, + "external_knowledge_api_name": null, + "external_knowledge_api_endpoint": null + }, + "external_retrieval_model": { + "top_k": 2, + "score_threshold": 0.0, + "score_threshold_enabled": null + } + } + ``` + + + + +
+ + + + + ### Query + + + 知识库 ID + + + 索引模式(选填,建议填写) + - high_quality 高质量 + - economy 经济 + + + 权限(选填,默认 only_me) + - only_me 仅自己 + - all_team_members 所有团队成员 + - partial_members 部分团队成员 + + + 嵌入模型提供商(选填), 必须先在系统内设定好接入的模型,对应的是provider字段 + + + 嵌入模型(选填) + + + 检索模型(选填) + + + 部分团队成员 ID 列表(选填) + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request POST '${props.apiBaseUrl}/datasets/{dataset_id}' \ + --header 'Authorization: Bearer {api_key}' \ + --header 'Content-Type: application/json' \ + --data-raw '{"name": "Test Knowledge Base", "indexing_technique": "high_quality", "permission": "only_me", + "embedding_model_provider": "zhipuai", "embedding_model": "embedding-3", "retrieval_model": "", "partial_member_list": []}' + ``` + + + ```json {{ title: 'Response' }} + { + "id": "eaedb485-95ac-4ffd-ab1e-18da6d676a2f", + "name": "Test Knowledge Base", + "description": "", + "provider": "vendor", + "permission": "only_me", + "data_source_type": null, + "indexing_technique": "high_quality", + "app_count": 0, + "document_count": 0, + "word_count": 0, + "created_by": "e99a1635-f725-4951-a99a-1daaaa76cfc6", + "created_at": 1735620612, + "updated_by": "e99a1635-f725-4951-a99a-1daaaa76cfc6", + "updated_at": 1735622679, + "embedding_model": "embedding-3", + "embedding_model_provider": "zhipuai", + "embedding_available": null, + "retrieval_model_dict": { + "search_method": "semantic_search", + "reranking_enable": false, + "reranking_mode": null, + "reranking_model": { + "reranking_provider_name": "", + "reranking_model_name": "" + }, + "weights": null, + "top_k": 2, + "score_threshold_enabled": false, + "score_threshold": null + }, + "tags": [], + "doc_form": null, + "external_knowledge_info": { + "external_knowledge_id": null, + "external_knowledge_api_id": null, + "external_knowledge_api_name": null, + "external_knowledge_api_endpoint": null + }, + "external_retrieval_model": { + "top_k": 2, + "score_threshold": 0.0, + "score_threshold_enabled": null + }, + 
"partial_member_list": [] + } + ``` + + + + +
+ + + + + ### Path + + + 知识库 ID + + + 文档 ID + + + 分段 ID + + + + ### Request Body + + + 子分段内容 + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request POST '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks' \ + --header 'Authorization: Bearer {api_key}' \ + --header 'Content-Type: application/json' \ + --data-raw '{ + "content": "子分段内容" + }' + ``` + + + ```json {{ title: 'Response' }} + { + "data": { + "id": "", + "segment_id": "", + "content": "子分段内容", + "word_count": 25, + "tokens": 0, + "index_node_id": "", + "index_node_hash": "", + "status": "completed", + "created_by": "", + "created_at": 1695312007, + "indexing_at": 1695312007, + "completed_at": 1695312007, + "error": null, + "stopped_at": null + } + } + ``` + + + + +
+ + + + + ### Path + + + 知识库 ID + + + 文档 ID + + + 分段 ID + + + + ### Query + + + 搜索关键词(选填) + + + 页码(选填,默认1) + + + 每页数量(选填,默认20,最大100) + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks?page=1&limit=20' \ + --header 'Authorization: Bearer {api_key}' + ``` + + + ```json {{ title: 'Response' }} + { + "data": [{ + "id": "", + "segment_id": "", + "content": "子分段内容", + "word_count": 25, + "tokens": 0, + "index_node_id": "", + "index_node_hash": "", + "status": "completed", + "created_by": "", + "created_at": 1695312007, + "indexing_at": 1695312007, + "completed_at": 1695312007, + "error": null, + "stopped_at": null + }], + "total": 1, + "total_pages": 1, + "page": 1, + "limit": 20 + } + ``` + + + + +
+ + + + + ### Path + + + 知识库 ID + + + 文档 ID + + + 分段 ID + + + 子分段 ID + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request DELETE '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks/{child_chunk_id}' \ + --header 'Authorization: Bearer {api_key}' + ``` + + + ```json {{ title: 'Response' }} + { + "result": "success" + } + ``` + + + + +
+ + + + ### 错误信息 + + + 返回的错误代码 + + + + + 返回的错误状态 + + + + + 返回的错误信息 + + + + + + ```json {{ title: 'Response' }} + { + "code": "no_file_uploaded", + "message": "Please upload your file.", + "status": 400 + } + ``` + + + + +
+ + + + + ### Path + + + 知识库 ID + + + 文档 ID + + + 分段 ID + + + 子分段 ID + + + + ### Request Body + + + 子分段内容 + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request PATCH '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks/{child_chunk_id}' \ + --header 'Authorization: Bearer {api_key}' \ + --header 'Content-Type: application/json' \ + --data-raw '{ + "content": "更新的子分段内容" + }' + ``` + + + ```json {{ title: 'Response' }} + { + "data": { + "id": "", + "segment_id": "", + "content": "更新的子分段内容", + "word_count": 25, + "tokens": 0, + "index_node_id": "", + "index_node_hash": "", + "status": "completed", + "created_by": "", + "created_at": 1695312007, + "indexing_at": 1695312007, + "completed_at": 1695312007, + "error": null, + "stopped_at": null + } + } + ``` + + + + +
+ ```bash {{ title: 'cURL' }} ``` @@ -1601,6 +2095,110 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi +
+ + + + + ### Query + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request GET '${props.apiBaseUrl}/workspaces/current/models/model-types/text-embedding' \ + --header 'Authorization: Bearer {api_key}' \ + --header 'Content-Type: application/json' + ``` + + + ```json {{ title: 'Response' }} + { + "data": [ + { + "provider": "zhipuai", + "label": { + "zh_Hans": "智谱 AI", + "en_US": "ZHIPU AI" + }, + "icon_small": { + "zh_Hans": "http://127.0.0.1:5001/console/api/workspaces/current/model-providers/zhipuai/icon_small/zh_Hans", + "en_US": "http://127.0.0.1:5001/console/api/workspaces/current/model-providers/zhipuai/icon_small/en_US" + }, + "icon_large": { + "zh_Hans": "http://127.0.0.1:5001/console/api/workspaces/current/model-providers/zhipuai/icon_large/zh_Hans", + "en_US": "http://127.0.0.1:5001/console/api/workspaces/current/model-providers/zhipuai/icon_large/en_US" + }, + "status": "active", + "models": [ + { + "model": "embedding-3", + "label": { + "zh_Hans": "embedding-3", + "en_US": "embedding-3" + }, + "model_type": "text-embedding", + "features": null, + "fetch_from": "predefined-model", + "model_properties": { + "context_size": 8192 + }, + "deprecated": false, + "status": "active", + "load_balancing_enabled": false + }, + { + "model": "embedding-2", + "label": { + "zh_Hans": "embedding-2", + "en_US": "embedding-2" + }, + "model_type": "text-embedding", + "features": null, + "fetch_from": "predefined-model", + "model_properties": { + "context_size": 8192 + }, + "deprecated": false, + "status": "active", + "load_balancing_enabled": false + }, + { + "model": "text_embedding", + "label": { + "zh_Hans": "text_embedding", + "en_US": "text_embedding" + }, + "model_type": "text-embedding", + "features": null, + "fetch_from": "predefined-model", + "model_properties": { + "context_size": 512 + }, + "deprecated": false, + "status": "active", + "load_balancing_enabled": false + } + ] + } + ] + } + ``` + + + +
diff --git a/web/app/(commonLayout)/education-apply/page.tsx b/web/app/(commonLayout)/education-apply/page.tsx new file mode 100644 index 000000000..873034452 --- /dev/null +++ b/web/app/(commonLayout)/education-apply/page.tsx @@ -0,0 +1,29 @@ +'use client' + +import { + useEffect, + useMemo, +} from 'react' +import { + useRouter, + useSearchParams, +} from 'next/navigation' +import EducationApplyPage from '@/app/education-apply/education-apply-page' +import { useProviderContext } from '@/context/provider-context' + +export default function EducationApply() { + const router = useRouter() + const { enableEducationPlan, isEducationAccount } = useProviderContext() + const searchParams = useSearchParams() + const token = searchParams.get('token') + const showEducationApplyPage = useMemo(() => { + return enableEducationPlan && !isEducationAccount && token + }, [enableEducationPlan, isEducationAccount, token]) + + useEffect(() => { + if (!showEducationApplyPage) + router.replace('/') + }, [showEducationApplyPage, router]) + + return +} diff --git a/web/app/(commonLayout)/plugins/page.tsx b/web/app/(commonLayout)/plugins/page.tsx index a3066311b..cc525992f 100644 --- a/web/app/(commonLayout)/plugins/page.tsx +++ b/web/app/(commonLayout)/plugins/page.tsx @@ -13,8 +13,4 @@ const PluginList = async () => { ) } -export const metadata = { - title: 'Plugins - Dify', -} - export default PluginList diff --git a/web/app/account/account-page/index.tsx b/web/app/account/account-page/index.tsx index 6176fe58a..72d2648c2 100644 --- a/web/app/account/account-page/index.tsx +++ b/web/app/account/account-page/index.tsx @@ -1,7 +1,9 @@ 'use client' import { useState } from 'react' import { useTranslation } from 'react-i18next' - +import { + RiGraduationCapFill, +} from '@remixicon/react' import { useContext } from 'use-context-selector' import DeleteAccount from '../delete-account' import s from './index.module.css' @@ -12,10 +14,12 @@ import Modal from '@/app/components/base/modal' 
import Button from '@/app/components/base/button' import { updateUserProfile } from '@/service/common' import { useAppContext } from '@/context/app-context' +import { useProviderContext } from '@/context/provider-context' import { ToastContext } from '@/app/components/base/toast' import AppIcon from '@/app/components/base/app-icon' import { IS_CE_EDITION } from '@/config' import Input from '@/app/components/base/input' +import PremiumBadge from '@/app/components/base/premium-badge' const titleClassName = ` system-sm-semibold text-text-secondary @@ -30,6 +34,7 @@ export default function AccountPage() { const { t } = useTranslation() const { systemFeatures } = useAppContext() const { mutateUserProfile, userProfile, apps } = useAppContext() + const { isEducationAccount } = useProviderContext() const { notify } = useContext(ToastContext) const [editNameModalVisible, setEditNameModalVisible] = useState(false) const [editName, setEditName] = useState('') @@ -135,7 +140,15 @@ export default function AccountPage() {
-

{userProfile.name}

+

+ {userProfile.name} + {isEducationAccount && ( + + + EDU + + )} +

{userProfile.email}

diff --git a/web/app/account/avatar.tsx b/web/app/account/avatar.tsx index e37d15c6a..ea897e639 100644 --- a/web/app/account/avatar.tsx +++ b/web/app/account/avatar.tsx @@ -2,11 +2,16 @@ import { useTranslation } from 'react-i18next' import { Fragment } from 'react' import { useRouter } from 'next/navigation' +import { + RiGraduationCapFill, +} from '@remixicon/react' import { Menu, MenuButton, MenuItem, MenuItems, Transition } from '@headlessui/react' import Avatar from '@/app/components/base/avatar' import { logout } from '@/service/common' import { useAppContext } from '@/context/app-context' +import { useProviderContext } from '@/context/provider-context' import { LogOut01 } from '@/app/components/base/icons/src/vender/line/general' +import PremiumBadge from '@/app/components/base/premium-badge' export type IAppSelector = { isMobile: boolean @@ -16,6 +21,7 @@ export default function AppSelector() { const router = useRouter() const { t } = useTranslation() const { userProfile } = useAppContext() + const { isEducationAccount } = useProviderContext() const handleLogout = async () => { await logout({ @@ -68,7 +74,15 @@ export default function AppSelector() {
-
{userProfile.name}
+
+ {userProfile.name} + {isEducationAccount && ( + + + EDU + + )} +
{userProfile.email}
diff --git a/web/app/activate/activateForm.tsx b/web/app/activate/activateForm.tsx index aef5b5a29..782b24be6 100644 --- a/web/app/activate/activateForm.tsx +++ b/web/app/activate/activateForm.tsx @@ -50,8 +50,8 @@ const ActivateForm = () => { {checkRes && !checkRes.is_valid && (
-
🤷‍♂️
-

{t('login.invalid')}

+
🤷‍♂️
+

{t('login.invalid')}

diff --git a/web/app/components/app/log/list.tsx b/web/app/components/app/log/list.tsx index 0ca80c3ec..c4e46af10 100644 --- a/web/app/components/app/log/list.tsx +++ b/web/app/components/app/log/list.tsx @@ -41,6 +41,7 @@ import { CopyIcon } from '@/app/components/base/copy-icon' import { buildChatItemTree, getThreadMessages } from '@/app/components/base/chat/utils' import { getProcessedFilesFromResponse } from '@/app/components/base/file-uploader/utils' import cn from '@/utils/classnames' +import { noop } from 'lodash-es' dayjs.extend(utc) dayjs.extend(timezone) @@ -411,7 +412,7 @@ function DetailPanel({ detail, onFeedback }: IDetailPanel) { content={detail.message.answer} messageId={detail.message.id} isError={false} - onRetry={() => { }} + onRetry={noop} isInstalledApp={false} supportFeedback feedback={detail.message.feedbacks.find((item: any) => item.from_source === 'admin')} diff --git a/web/app/components/app/overview/appChart.tsx b/web/app/components/app/overview/appChart.tsx index fdb5d1dba..957699087 100644 --- a/web/app/components/app/overview/appChart.tsx +++ b/web/app/components/app/overview/appChart.tsx @@ -238,7 +238,7 @@ const Chart: React.FC = ({
{t('appOverview.analysis.tokenUsage.consumed')} Tokens @@ -351,6 +351,7 @@ export const TokenPerSecond: FC = ({ id, period }) => { isAvg unit={t('appOverview.analysis.tokenPS') as string} {...(noDataFlag && { yMax: 100 })} + className="min-w-0" /> } diff --git a/web/app/components/app/overview/embedded/index.tsx b/web/app/components/app/overview/embedded/index.tsx index 0d545aaf3..cb00c9835 100644 --- a/web/app/components/app/overview/embedded/index.tsx +++ b/web/app/components/app/overview/embedded/index.tsx @@ -44,7 +44,10 @@ const OPTION_MAP = { : ''}${IS_CE_EDITION ? `, baseUrl: '${url}'` - : ''} + : ''}, + systemVariables: { + // user_id: 'YOU CAN DEFINE USER ID HERE', + }, }