mirror of
https://github.com/YFGaia/dify-plus.git
synced 2026-06-14 20:41:21 +08:00
This commit is contained in:
@@ -68,22 +68,17 @@ class MarkdownExtractor(BaseExtractor):
|
|||||||
continue
|
continue
|
||||||
header_match = re.match(r"^#+\s", line)
|
header_match = re.match(r"^#+\s", line)
|
||||||
if header_match:
|
if header_match:
|
||||||
if current_header is not None:
|
markdown_tups.append((current_header, current_text))
|
||||||
markdown_tups.append((current_header, current_text))
|
|
||||||
|
|
||||||
current_header = line
|
current_header = line
|
||||||
current_text = ""
|
current_text = ""
|
||||||
else:
|
else:
|
||||||
current_text += line + "\n"
|
current_text += line + "\n"
|
||||||
markdown_tups.append((current_header, current_text))
|
markdown_tups.append((current_header, current_text))
|
||||||
|
|
||||||
if current_header is not None:
|
markdown_tups = [
|
||||||
# pass linting, assert keys are defined
|
(re.sub(r"#", "", cast(str, key)).strip() if key else None, re.sub(r"<.*?>", "", value))
|
||||||
markdown_tups = [
|
for key, value in markdown_tups
|
||||||
(re.sub(r"#", "", cast(str, key)).strip(), re.sub(r"<.*?>", "", value)) for key, value in markdown_tups
|
]
|
||||||
]
|
|
||||||
else:
|
|
||||||
markdown_tups = [(key, re.sub("\n", "", value)) for key, value in markdown_tups]
|
|
||||||
|
|
||||||
return markdown_tups
|
return markdown_tups
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,22 @@
|
|||||||
|
from core.rag.extractor.markdown_extractor import MarkdownExtractor
|
||||||
|
|
||||||
|
|
||||||
|
def test_markdown_to_tups():
    """Check that markdown_to_tups splits markdown into (header, text) tuples.

    The sample document has one leading paragraph with no header followed by
    two headed sections, so three tuples are expected. The leading paragraph
    must be reported with a ``None`` header instead of being dropped.
    """
    # Headers are matched with r"^#+\s", so the sample text must start at
    # column 0 inside the literal.
    markdown = """
this is some text without header

# title 1
this is balabala text

## title 2
this is more specific text.
"""
    extractor = MarkdownExtractor(file_path="dummy_path")
    updated_output = extractor.markdown_to_tups(markdown)
    assert len(updated_output) == 3

    # Text that precedes the first header has no key.
    key, header_value = updated_output[0]
    assert key is None
    assert header_value.strip() == "this is some text without header"

    # The '#' markers are removed from the returned header.
    title_1, value = updated_output[1]
    assert title_1.strip() == "title 1"
    assert value.strip() == "this is balabala text"

    # The final section is flushed after the loop ends.
    title_2, value = updated_output[2]
    assert title_2.strip() == "title 2"
    assert value.strip() == "this is more specific text."
|
||||||
Reference in New Issue
Block a user