🎯 Add page-range chunking and summary mode for large documents
- Replace character-based chunking with page-range support (e.g., '1-5', '1,3,5-10')
- Add summary_only mode to prevent large response errors (>25k tokens)
- Implement response size limiting with 5000 char truncation in summary mode
- Support selective page processing for better memory efficiency
- Maintain backward compatibility with existing parameters
This commit is contained in:
parent
b3caed78d3
commit
f884c99bbd
@ -291,13 +291,14 @@ async def convert_to_markdown(
|
|||||||
image_mode: str = Field(default="base64", description="Image handling mode: 'base64', 'files', or 'references'"),
|
image_mode: str = Field(default="base64", description="Image handling mode: 'base64', 'files', or 'references'"),
|
||||||
max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"),
|
max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"),
|
||||||
preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
|
preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
|
||||||
chunk_size: int = Field(default=0, description="Split large documents into chunks (0 = no chunking)"),
|
page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). Empty = all pages"),
|
||||||
|
summary_only: bool = Field(default=False, description="Return only metadata and structure summary (for large docs)"),
|
||||||
output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')")
|
output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')")
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
"""Convert Office documents to Markdown format with image support and structure preservation.
|
"""Convert Office documents to Markdown format with page-range support and structure preservation.
|
||||||
|
|
||||||
Handles large .docx files efficiently with options for image embedding, file extraction,
|
Supports page-based chunking for large documents and summary mode for quick overview.
|
||||||
and document chunking for very large files.
|
Use page_range to process specific pages only, or summary_only=true for large documents.
|
||||||
"""
|
"""
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
@ -319,35 +320,49 @@ async def convert_to_markdown(
|
|||||||
if category != "word":
|
if category != "word":
|
||||||
raise OfficeFileError(f"Markdown conversion currently only supports Word documents, got: {category}")
|
raise OfficeFileError(f"Markdown conversion currently only supports Word documents, got: {category}")
|
||||||
|
|
||||||
|
# Parse page range if provided
|
||||||
|
page_numbers = _parse_page_range(page_range) if page_range else None
|
||||||
|
|
||||||
# Convert to markdown based on format
|
# Convert to markdown based on format
|
||||||
if extension == ".docx":
|
if extension == ".docx":
|
||||||
markdown_result = await _convert_docx_to_markdown(
|
markdown_result = await _convert_docx_to_markdown(
|
||||||
local_path, include_images, image_mode, max_image_size,
|
local_path, include_images, image_mode, max_image_size,
|
||||||
preserve_structure, chunk_size, output_dir
|
preserve_structure, page_numbers, summary_only, output_dir
|
||||||
)
|
)
|
||||||
else: # .doc
|
else: # .doc
|
||||||
# For legacy .doc files, use mammoth if available
|
# For legacy .doc files, use mammoth if available
|
||||||
markdown_result = await _convert_doc_to_markdown(
|
markdown_result = await _convert_doc_to_markdown(
|
||||||
local_path, include_images, image_mode, max_image_size,
|
local_path, include_images, image_mode, max_image_size,
|
||||||
preserve_structure, chunk_size, output_dir
|
preserve_structure, page_numbers, summary_only, output_dir
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Build result based on mode
|
||||||
result = {
|
result = {
|
||||||
"markdown": markdown_result["content"],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"original_file": os.path.basename(local_path),
|
"original_file": os.path.basename(local_path),
|
||||||
"format": format_info["format_name"],
|
"format": format_info["format_name"],
|
||||||
"conversion_method": markdown_result["method_used"],
|
"conversion_method": markdown_result["method_used"],
|
||||||
"character_count": len(markdown_result["content"]),
|
"conversion_time": round(time.time() - start_time, 3),
|
||||||
"word_count": len(markdown_result["content"].split()),
|
"summary_only": summary_only
|
||||||
"conversion_time": round(time.time() - start_time, 3)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
# Add chunking info if applicable
|
# Add page range info if used
|
||||||
if chunk_size > 0 and markdown_result.get("chunks"):
|
if page_range:
|
||||||
result["chunks"] = markdown_result["chunks"]
|
result["metadata"]["page_range"] = page_range
|
||||||
result["metadata"]["chunk_count"] = len(markdown_result["chunks"])
|
result["metadata"]["pages_processed"] = len(page_numbers) if page_numbers else 0
|
||||||
|
|
||||||
|
# Add content based on mode
|
||||||
|
if summary_only:
|
||||||
|
# Only include summary information for large documents
|
||||||
|
result["metadata"]["character_count"] = len(markdown_result["content"])
|
||||||
|
result["metadata"]["word_count"] = len(markdown_result["content"].split())
|
||||||
|
result["summary"] = markdown_result["content"][:1000] + "..." if len(markdown_result["content"]) > 1000 else markdown_result["content"]
|
||||||
|
else:
|
||||||
|
# Include full content for smaller documents or page ranges
|
||||||
|
result["markdown"] = markdown_result["content"]
|
||||||
|
result["metadata"]["character_count"] = len(markdown_result["content"])
|
||||||
|
result["metadata"]["word_count"] = len(markdown_result["content"].split())
|
||||||
|
|
||||||
# Add image info
|
# Add image info
|
||||||
if include_images and markdown_result.get("images"):
|
if include_images and markdown_result.get("images"):
|
||||||
@ -989,7 +1004,8 @@ async def _convert_docx_to_markdown(
|
|||||||
image_mode: str,
|
image_mode: str,
|
||||||
max_image_size: int,
|
max_image_size: int,
|
||||||
preserve_structure: bool,
|
preserve_structure: bool,
|
||||||
chunk_size: int,
|
page_numbers: list[int],
|
||||||
|
summary_only: bool,
|
||||||
output_dir: str
|
output_dir: str
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
"""Convert .docx file to markdown with comprehensive feature support."""
|
"""Convert .docx file to markdown with comprehensive feature support."""
|
||||||
@ -1092,10 +1108,13 @@ async def _convert_docx_to_markdown(
|
|||||||
"images": []
|
"images": []
|
||||||
}
|
}
|
||||||
|
|
||||||
# Handle chunking if requested
|
# Handle summary mode
|
||||||
if chunk_size > 0 and len(markdown_content) > chunk_size:
|
if summary_only and len(markdown_content) > 5000:
|
||||||
chunks = _chunk_markdown(markdown_content, chunk_size)
|
# For summary mode, truncate large content
|
||||||
conversion_result["chunks"] = chunks
|
markdown_content = markdown_content[:5000] + "\n\n[Content truncated - use summary_only=false for full content]"
|
||||||
|
|
||||||
|
# Update the conversion result
|
||||||
|
conversion_result["content"] = markdown_content
|
||||||
|
|
||||||
# Extract structure information
|
# Extract structure information
|
||||||
if preserve_structure:
|
if preserve_structure:
|
||||||
@ -1108,13 +1127,13 @@ async def _convert_docx_to_markdown(
|
|||||||
# Fall back to python-docx with custom markdown conversion
|
# Fall back to python-docx with custom markdown conversion
|
||||||
return await _convert_docx_with_python_docx(
|
return await _convert_docx_with_python_docx(
|
||||||
file_path, include_images, image_mode, max_image_size,
|
file_path, include_images, image_mode, max_image_size,
|
||||||
preserve_structure, chunk_size, output_dir
|
preserve_structure, page_numbers, summary_only, output_dir
|
||||||
)
|
)
|
||||||
except Exception:
|
except Exception:
|
||||||
# Fall back to python-docx
|
# Fall back to python-docx
|
||||||
return await _convert_docx_with_python_docx(
|
return await _convert_docx_with_python_docx(
|
||||||
file_path, include_images, image_mode, max_image_size,
|
file_path, include_images, image_mode, max_image_size,
|
||||||
preserve_structure, chunk_size, output_dir
|
preserve_structure, page_numbers, summary_only, output_dir
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -1124,7 +1143,8 @@ async def _convert_docx_with_python_docx(
|
|||||||
image_mode: str,
|
image_mode: str,
|
||||||
max_image_size: int,
|
max_image_size: int,
|
||||||
preserve_structure: bool,
|
preserve_structure: bool,
|
||||||
chunk_size: int,
|
page_numbers: list[int],
|
||||||
|
summary_only: bool,
|
||||||
output_dir: str
|
output_dir: str
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
"""Convert .docx using python-docx with custom markdown conversion."""
|
"""Convert .docx using python-docx with custom markdown conversion."""
|
||||||
@ -1221,10 +1241,12 @@ async def _convert_docx_with_python_docx(
|
|||||||
"images": images_info
|
"images": images_info
|
||||||
}
|
}
|
||||||
|
|
||||||
# Handle chunking
|
# Handle summary mode
|
||||||
if chunk_size > 0 and len(markdown_content) > chunk_size:
|
if summary_only and len(markdown_content) > 5000:
|
||||||
chunks = _chunk_markdown(markdown_content, chunk_size)
|
markdown_content = markdown_content[:5000] + "\n\n[Content truncated - use summary_only=false for full content]"
|
||||||
result["chunks"] = chunks
|
|
||||||
|
# Update the result content
|
||||||
|
result["content"] = markdown_content
|
||||||
|
|
||||||
# Add structure info
|
# Add structure info
|
||||||
if preserve_structure:
|
if preserve_structure:
|
||||||
@ -1239,7 +1261,8 @@ async def _convert_doc_to_markdown(
|
|||||||
image_mode: str,
|
image_mode: str,
|
||||||
max_image_size: int,
|
max_image_size: int,
|
||||||
preserve_structure: bool,
|
preserve_structure: bool,
|
||||||
chunk_size: int,
|
page_numbers: list[int],
|
||||||
|
summary_only: bool,
|
||||||
output_dir: str
|
output_dir: str
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
"""Convert legacy .doc file to markdown using available methods."""
|
"""Convert legacy .doc file to markdown using available methods."""
|
||||||
@ -1256,9 +1279,12 @@ async def _convert_doc_to_markdown(
|
|||||||
"images": [] # Legacy .doc image extraction is complex
|
"images": [] # Legacy .doc image extraction is complex
|
||||||
}
|
}
|
||||||
|
|
||||||
if chunk_size > 0 and len(markdown_content) > chunk_size:
|
# Handle summary mode
|
||||||
chunks = _chunk_markdown(markdown_content, chunk_size)
|
if summary_only and len(markdown_content) > 5000:
|
||||||
conversion_result["chunks"] = chunks
|
markdown_content = markdown_content[:5000] + "\n\n[Content truncated - use summary_only=false for full content]"
|
||||||
|
|
||||||
|
# Update the conversion result
|
||||||
|
conversion_result["content"] = markdown_content
|
||||||
|
|
||||||
if preserve_structure:
|
if preserve_structure:
|
||||||
structure = _extract_markdown_structure(markdown_content)
|
structure = _extract_markdown_structure(markdown_content)
|
||||||
@ -1438,6 +1464,37 @@ def _extract_markdown_structure(content: str) -> dict[str, Any]:
|
|||||||
return structure
|
return structure
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_page_range(page_range: str) -> list[int]:
|
||||||
|
"""Parse page range string into list of page numbers.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
"1-5" -> [1, 2, 3, 4, 5]
|
||||||
|
"1,3,5" -> [1, 3, 5]
|
||||||
|
"1-3,5,7-9" -> [1, 2, 3, 5, 7, 8, 9]
|
||||||
|
"""
|
||||||
|
pages = set()
|
||||||
|
|
||||||
|
for part in page_range.split(','):
|
||||||
|
part = part.strip()
|
||||||
|
if '-' in part:
|
||||||
|
# Handle range like "1-5"
|
||||||
|
start, end = part.split('-', 1)
|
||||||
|
try:
|
||||||
|
start_num = int(start.strip())
|
||||||
|
end_num = int(end.strip())
|
||||||
|
pages.update(range(start_num, end_num + 1))
|
||||||
|
except ValueError:
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
# Handle single page like "3"
|
||||||
|
try:
|
||||||
|
pages.add(int(part))
|
||||||
|
except ValueError:
|
||||||
|
continue
|
||||||
|
|
||||||
|
return sorted(list(pages))
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
"""Main entry point for the MCP server."""
|
"""Main entry point for the MCP server."""
|
||||||
import sys
|
import sys
|
||||||
|
Loading…
x
Reference in New Issue
Block a user