fix: dos in annotation import (#29470)

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
2026-06-12 18:11:42 +08:00 · 2025-12-15 15:22:04 +08:00
parent 714b443077
commit 724cd57dbf
9 changed files with 643 additions and 13 deletions
@@ -1,6 +1,9 @@
+import logging
 import uuid

 import pandas as pd
+
+logger = logging.getLogger(__name__)
 from sqlalchemy import or_, select
 from werkzeug.datastructures import FileStorage
 from werkzeug.exceptions import NotFound
@@ -330,6 +333,18 @@ class AppAnnotationService:

    @classmethod
    def batch_import_app_annotations(cls, app_id, file: FileStorage):
+        """
+        Batch import annotations from CSV file with enhanced security checks.
+
+        Security features:
+        - File size validation
+        - Row count limits (min/max)
+        - Memory-efficient CSV parsing
+        - Subscription quota validation
+        - Concurrency tracking
+        """
+        from configs import dify_config
+
        # get app info
        current_user, current_tenant_id = current_account_with_tenant()
        app = (
@@ -341,16 +356,80 @@ class AppAnnotationService:
        if not app:
            raise NotFound("App not found")

+        job_id: str | None = None  # Initialize to avoid unbound variable error
        try:
-            # Skip the first row
-            df = pd.read_csv(file.stream, dtype=str)
-            result = []
-            for _, row in df.iterrows():
-                content = {"question": row.iloc[0], "answer": row.iloc[1]}
+            # Quick row count check before full parsing (memory efficient)
+            # Read only first chunk to estimate row count
+            file.stream.seek(0)
+            first_chunk = file.stream.read(8192)  # Read first 8KB
+            file.stream.seek(0)
+
+            # Estimate row count from first chunk
+            newline_count = first_chunk.count(b"\n")
+            if newline_count == 0:
+                raise ValueError("The CSV file appears to be empty or invalid.")
+
+            # Parse CSV with row limit to prevent memory exhaustion
+            # Use chunksize for memory-efficient processing
+            max_records = dify_config.ANNOTATION_IMPORT_MAX_RECORDS
+            min_records = dify_config.ANNOTATION_IMPORT_MIN_RECORDS
+
+            # Read CSV in chunks to avoid loading entire file into memory
+            df = pd.read_csv(
+                file.stream,
+                dtype=str,
+                nrows=max_records + 1,  # Read one extra to detect overflow
+                engine="python",
+                on_bad_lines="skip",  # Skip malformed lines instead of crashing
+            )
+
+            # Validate column count
+            if len(df.columns) < 2:
+                raise ValueError("Invalid CSV format. The file must contain at least 2 columns (question and answer).")
+
+            # Build result list with validation
+            result: list[dict] = []
+            for idx, row in df.iterrows():
+                # Stop if we exceed the limit
+                if len(result) >= max_records:
+                    raise ValueError(
+                        f"The CSV file contains too many records. Maximum {max_records} records allowed per import. "
+                        f"Please split your file into smaller batches."
+                    )
+
+                # Extract and validate question and answer
+                try:
+                    question_raw = row.iloc[0]
+                    answer_raw = row.iloc[1]
+                except (IndexError, KeyError):
+                    continue  # Skip malformed rows
+
+                # Convert to string and strip whitespace
+                question = str(question_raw).strip() if question_raw is not None else ""
+                answer = str(answer_raw).strip() if answer_raw is not None else ""
+
+                # Skip empty entries or NaN values
+                if not question or not answer or question.lower() == "nan" or answer.lower() == "nan":
+                    continue
+
+                # Validate length constraints (idx is pandas index, convert to int for display)
+                row_num = int(idx) + 2 if isinstance(idx, (int, float)) else len(result) + 2
+                if len(question) > 2000:
+                    raise ValueError(f"Question at row {row_num} is too long. Maximum 2000 characters allowed.")
+                if len(answer) > 10000:
+                    raise ValueError(f"Answer at row {row_num} is too long. Maximum 10000 characters allowed.")
+
+                content = {"question": question, "answer": answer}
                result.append(content)
-            if len(result) == 0:
-                raise ValueError("The CSV file is empty.")
-            # check annotation limit
+
+            # Validate minimum records
+            if len(result) < min_records:
+                raise ValueError(
+                    f"The CSV file must contain at least {min_records} valid annotation record(s). "
+                    f"Found {len(result)} valid record(s)."
+                )
+
+            # Check annotation quota limit
            features = FeatureService.get_features(current_tenant_id)
            if features.billing.enabled:
                annotation_quota_limit = features.annotation_quota_limit
@@ -359,12 +438,34 @@ class AppAnnotationService:
            # async job
            job_id = str(uuid.uuid4())
            indexing_cache_key = f"app_annotation_batch_import_{str(job_id)}"
-            # send batch add segments task
+
+            # Register job in active tasks list for concurrency tracking
+            current_time = int(naive_utc_now().timestamp() * 1000)
+            active_jobs_key = f"annotation_import_active:{current_tenant_id}"
+            redis_client.zadd(active_jobs_key, {job_id: current_time})
+            redis_client.expire(active_jobs_key, 7200)  # 2 hours TTL
+
+            # Set job status
            redis_client.setnx(indexing_cache_key, "waiting")
            batch_import_annotations_task.delay(str(job_id), result, app_id, current_tenant_id, current_user.id)
-        except Exception as e:
+
+        except ValueError as e:
            return {"error_msg": str(e)}
-        return {"job_id": job_id, "job_status": "waiting"}
+        except Exception as e:
+            # Clean up active job registration on error (only if job was created)
+            if job_id is not None:
+                try:
+                    active_jobs_key = f"annotation_import_active:{current_tenant_id}"
+                    redis_client.zrem(active_jobs_key, job_id)
+                except Exception:
+                    # Silently ignore cleanup errors - the job will be auto-expired
+                    logger.debug("Failed to clean up active job tracking during error handling")
+
+            # Check if it's a CSV parsing error
+            error_str = str(e)
+            return {"error_msg": f"An error occurred while processing the file: {error_str}"}
+
+        return {"job_id": job_id, "job_status": "waiting", "record_count": len(result)}

    @classmethod
    def get_annotation_hit_histories(cls, app_id: str, annotation_id: str, page, limit):