Add chapter-based extraction for documents without bookmarks
- Add chapter_name parameter to convert_to_markdown tool
- Implement _find_chapter_content_range() for heading-based navigation
- Add _get_available_headings() to help users find chapter names
- Include chapter extraction metadata in results
- Enhance ultra-fast summary with available headings
- Provide an alternative to bookmark extraction when bookmarks are unavailable
This commit is contained in:
parent
6484036b69
commit
778ef3a2d4
@ -293,6 +293,7 @@ async def convert_to_markdown(
|
|||||||
preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
|
preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
|
||||||
page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"),
|
page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"),
|
||||||
bookmark_name: str = Field(default="", description="Extract content for a specific bookmark/chapter (e.g., 'Chapter1_Start'). More reliable than page ranges."),
|
bookmark_name: str = Field(default="", description="Extract content for a specific bookmark/chapter (e.g., 'Chapter1_Start'). More reliable than page ranges."),
|
||||||
|
chapter_name: str = Field(default="", description="Extract content for a chapter by heading text (e.g., 'Chapter 1', 'Introduction'). Works when bookmarks aren't available."),
|
||||||
summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"),
|
summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"),
|
||||||
output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')")
|
output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')")
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
@ -334,15 +335,15 @@ async def convert_to_markdown(
|
|||||||
# Parse page range if provided
|
# Parse page range if provided
|
||||||
page_numbers = _parse_page_range(page_range) if page_range else None
|
page_numbers = _parse_page_range(page_range) if page_range else None
|
||||||
|
|
||||||
# Prioritize bookmark extraction over page ranges
|
# Prioritize bookmark/chapter extraction over page ranges
|
||||||
if bookmark_name:
|
if bookmark_name or chapter_name:
|
||||||
page_numbers = None # Ignore page ranges when bookmark is specified
|
page_numbers = None # Ignore page ranges when bookmark or chapter is specified
|
||||||
|
|
||||||
# Convert to markdown based on format
|
# Convert to markdown based on format
|
||||||
if extension == ".docx":
|
if extension == ".docx":
|
||||||
markdown_result = await _convert_docx_to_markdown(
|
markdown_result = await _convert_docx_to_markdown(
|
||||||
local_path, include_images, image_mode, max_image_size,
|
local_path, include_images, image_mode, max_image_size,
|
||||||
preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
|
preserve_structure, page_numbers, summary_only, output_dir, bookmark_name, chapter_name
|
||||||
)
|
)
|
||||||
else: # .doc
|
else: # .doc
|
||||||
# For legacy .doc files, use mammoth if available
|
# For legacy .doc files, use mammoth if available
|
||||||
@ -1059,7 +1060,8 @@ async def _convert_docx_to_markdown(
|
|||||||
page_numbers: list[int],
|
page_numbers: list[int],
|
||||||
summary_only: bool,
|
summary_only: bool,
|
||||||
output_dir: str,
|
output_dir: str,
|
||||||
bookmark_name: str = ""
|
bookmark_name: str = "",
|
||||||
|
chapter_name: str = ""
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
"""Convert .docx file to markdown with comprehensive feature support."""
|
"""Convert .docx file to markdown with comprehensive feature support."""
|
||||||
import base64
|
import base64
|
||||||
@ -1068,12 +1070,12 @@ async def _convert_docx_to_markdown(
|
|||||||
if summary_only:
|
if summary_only:
|
||||||
return await _get_ultra_fast_summary(file_path)
|
return await _get_ultra_fast_summary(file_path)
|
||||||
|
|
||||||
# If page_numbers or bookmark_name is specified, we need to use python-docx for targeted extraction
|
# If page_numbers, bookmark_name, or chapter_name is specified, we need to use python-docx for targeted extraction
|
||||||
# as mammoth processes the entire document
|
# as mammoth processes the entire document
|
||||||
if page_numbers or bookmark_name:
|
if page_numbers or bookmark_name or chapter_name:
|
||||||
return await _convert_docx_with_python_docx(
|
return await _convert_docx_with_python_docx(
|
||||||
file_path, include_images, image_mode, max_image_size,
|
file_path, include_images, image_mode, max_image_size,
|
||||||
preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
|
preserve_structure, page_numbers, summary_only, output_dir, bookmark_name, chapter_name
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -1192,13 +1194,13 @@ async def _convert_docx_to_markdown(
|
|||||||
# Fall back to python-docx with custom markdown conversion
|
# Fall back to python-docx with custom markdown conversion
|
||||||
return await _convert_docx_with_python_docx(
|
return await _convert_docx_with_python_docx(
|
||||||
file_path, include_images, image_mode, max_image_size,
|
file_path, include_images, image_mode, max_image_size,
|
||||||
preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
|
preserve_structure, page_numbers, summary_only, output_dir, bookmark_name, chapter_name
|
||||||
)
|
)
|
||||||
except Exception:
|
except Exception:
|
||||||
# Fall back to python-docx
|
# Fall back to python-docx
|
||||||
return await _convert_docx_with_python_docx(
|
return await _convert_docx_with_python_docx(
|
||||||
file_path, include_images, image_mode, max_image_size,
|
file_path, include_images, image_mode, max_image_size,
|
||||||
preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
|
preserve_structure, page_numbers, summary_only, output_dir, bookmark_name, chapter_name
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -1211,7 +1213,8 @@ async def _convert_docx_with_python_docx(
|
|||||||
page_numbers: list[int],
|
page_numbers: list[int],
|
||||||
summary_only: bool,
|
summary_only: bool,
|
||||||
output_dir: str,
|
output_dir: str,
|
||||||
bookmark_name: str = ""
|
bookmark_name: str = "",
|
||||||
|
chapter_name: str = ""
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
"""Convert .docx using python-docx with custom markdown conversion."""
|
"""Convert .docx using python-docx with custom markdown conversion."""
|
||||||
import base64
|
import base64
|
||||||
@ -1267,7 +1270,7 @@ async def _convert_docx_with_python_docx(
|
|||||||
"markdown_ref": f""
|
"markdown_ref": f""
|
||||||
})
|
})
|
||||||
|
|
||||||
# Handle bookmark-based extraction vs page-based vs full document
|
# Handle bookmark-based, chapter-based, or page-based extraction vs full document
|
||||||
if bookmark_name:
|
if bookmark_name:
|
||||||
# For bookmark extraction, find the bookmark boundaries
|
# For bookmark extraction, find the bookmark boundaries
|
||||||
bookmark_range = await _find_bookmark_content_range(doc, bookmark_name)
|
bookmark_range = await _find_bookmark_content_range(doc, bookmark_name)
|
||||||
@ -1280,6 +1283,21 @@ async def _convert_docx_with_python_docx(
|
|||||||
}
|
}
|
||||||
max_paragraphs = 500 # Generous limit for bookmark sections
|
max_paragraphs = 500 # Generous limit for bookmark sections
|
||||||
max_chars = 100000
|
max_chars = 100000
|
||||||
|
chapter_range = None
|
||||||
|
elif chapter_name:
|
||||||
|
# For chapter extraction, find the heading boundaries
|
||||||
|
chapter_range = await _find_chapter_content_range(doc, chapter_name)
|
||||||
|
if not chapter_range:
|
||||||
|
return {
|
||||||
|
"content": f"Chapter '{chapter_name}' not found in document. Available headings will be listed in processing_limits.",
|
||||||
|
"method_used": "python-docx-chapter-not-found",
|
||||||
|
"images": [],
|
||||||
|
"chapter_error": True,
|
||||||
|
"available_headings": await _get_available_headings(doc)
|
||||||
|
}
|
||||||
|
max_paragraphs = 500 # Generous limit for chapter sections
|
||||||
|
max_chars = 100000
|
||||||
|
bookmark_range = None
|
||||||
elif page_numbers:
|
elif page_numbers:
|
||||||
# For page ranges, severely limit content extraction
|
# For page ranges, severely limit content extraction
|
||||||
max_pages_requested = max(page_numbers) if page_numbers else 1
|
max_pages_requested = max(page_numbers) if page_numbers else 1
|
||||||
@ -1287,10 +1305,12 @@ async def _convert_docx_with_python_docx(
|
|||||||
max_paragraphs = min(max_pages_requested * 25, 100) # Cap at 100 paragraphs max
|
max_paragraphs = min(max_pages_requested * 25, 100) # Cap at 100 paragraphs max
|
||||||
max_chars = min(max_pages_requested * 8000, 40000) # Cap at 40k chars max
|
max_chars = min(max_pages_requested * 8000, 40000) # Cap at 40k chars max
|
||||||
bookmark_range = None
|
bookmark_range = None
|
||||||
|
chapter_range = None
|
||||||
else:
|
else:
|
||||||
max_paragraphs = 1000 # Large limit for full document
|
max_paragraphs = 1000 # Large limit for full document
|
||||||
max_chars = 200000
|
max_chars = 200000
|
||||||
bookmark_range = None
|
bookmark_range = None
|
||||||
|
chapter_range = None
|
||||||
|
|
||||||
current_page = 1
|
current_page = 1
|
||||||
processed_paragraphs = 0
|
processed_paragraphs = 0
|
||||||
@ -1303,9 +1323,11 @@ async def _convert_docx_with_python_docx(
|
|||||||
if processed_paragraphs >= max_paragraphs or total_chars >= max_chars:
|
if processed_paragraphs >= max_paragraphs or total_chars >= max_chars:
|
||||||
break
|
break
|
||||||
|
|
||||||
# Skip elements outside bookmark range if bookmark extraction is used
|
# Skip elements outside bookmark/chapter range if targeted extraction is used
|
||||||
if bookmark_range and not (bookmark_range['start_idx'] <= element_idx <= bookmark_range['end_idx']):
|
if bookmark_range and not (bookmark_range['start_idx'] <= element_idx <= bookmark_range['end_idx']):
|
||||||
continue
|
continue
|
||||||
|
if chapter_range and not (chapter_range['start_idx'] <= element_idx <= chapter_range['end_idx']):
|
||||||
|
continue
|
||||||
|
|
||||||
if isinstance(element, CT_P):
|
if isinstance(element, CT_P):
|
||||||
paragraph = Paragraph(element, doc)
|
paragraph = Paragraph(element, doc)
|
||||||
@ -1398,6 +1420,12 @@ async def _convert_docx_with_python_docx(
|
|||||||
"elements_range": f"{bookmark_range['start_idx']}-{bookmark_range['end_idx']}",
|
"elements_range": f"{bookmark_range['start_idx']}-{bookmark_range['end_idx']}",
|
||||||
"extraction_note": bookmark_range["note"]
|
"extraction_note": bookmark_range["note"]
|
||||||
}
|
}
|
||||||
|
elif chapter_name and chapter_range:
|
||||||
|
result["chapter_extraction"] = {
|
||||||
|
"chapter_name": chapter_name,
|
||||||
|
"elements_range": f"{chapter_range['start_idx']}-{chapter_range['end_idx']}",
|
||||||
|
"extraction_note": chapter_range["note"]
|
||||||
|
}
|
||||||
elif page_numbers:
|
elif page_numbers:
|
||||||
result["pages_processed"] = page_numbers
|
result["pages_processed"] = page_numbers
|
||||||
result["total_pages_in_range"] = len(page_numbers)
|
result["total_pages_in_range"] = len(page_numbers)
|
||||||
@ -1667,6 +1695,103 @@ async def _find_bookmark_content_range(doc, bookmark_name: str) -> dict[str, Any
|
|||||||
return None # Error finding bookmark
|
return None # Error finding bookmark
|
||||||
|
|
||||||
|
|
||||||
|
async def _find_chapter_content_range(doc, chapter_name: str) -> dict[str, Any]:
    """Find the body-element index range of a chapter identified by heading text.

    The chapter starts at the first paragraph whose text contains
    ``chapter_name`` (case-insensitive substring match) and that either has a
    paragraph style id containing "heading" or "title", or has no paragraph
    style at all and is shorter than 100 characters.  It ends just before the
    next paragraph styled as a title or as a heading of the same or a higher
    level, or at the last body element when no such paragraph follows.

    Args:
        doc: Document object exposing ``doc.element.body`` as an indexable
            sequence of WordprocessingML elements.
        chapter_name: Heading text (or a fragment of it) to search for.

    Returns:
        A dict with ``start_idx``, ``end_idx`` (both inclusive indexes into
        ``doc.element.body``), ``chapter_name`` and a human-readable ``note``;
        or ``None`` when the name is blank, no matching heading exists, or the
        document cannot be inspected.
    """
    import re  # function-scope import, like the other helpers in this file

    w_ns = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    text_tag = w_ns + 't'
    style_tag = w_ns + 'pStyle'
    val_attr = w_ns + 'val'

    # NOTE: tag-based iter() is used instead of xpath(..., namespaces=...):
    # python-docx's BaseOxmlElement.xpath() accepts only the expression string
    # (it binds the namespace map itself), so passing namespaces= raises
    # TypeError, which the previous per-element handler silently swallowed.
    def is_paragraph(element) -> bool:
        tag = getattr(element, 'tag', None)
        # Comment / processing-instruction nodes have a non-string tag.
        return isinstance(tag, str) and tag.endswith('}p')

    def paragraph_text(element) -> str:
        return ''.join(node.text or '' for node in element.iter(text_tag))

    def style_id(element):
        """Return the lower-cased paragraph style id, or None when unstyled."""
        node = next(element.iter(style_tag), None)
        return None if node is None else node.get(val_attr, '').lower()

    def heading_level(style: str):
        """Return the numeric level of a 'HeadingN' style id, or None."""
        found = re.search(r'heading\s*(\d+)', style)
        return int(found.group(1)) if found else None

    wanted = (chapter_name or '').strip().lower()
    if not wanted:
        # An empty needle would match every paragraph; treat it as "not found".
        return None

    try:
        body = doc.element.body
        total = len(body)

        chapter_start_idx = None
        start_level = 1  # title / unstyled starts behave like a top-level heading
        for elem_idx, element in enumerate(body):
            if not is_paragraph(element):
                continue
            text = paragraph_text(element).strip()
            if not text or wanted not in text.lower():
                continue

            style = style_id(element)
            if style is None:
                # NOTE(review): unstyled short lines are accepted as headings;
                # this assumes the original fallback applied to paragraphs
                # without any pStyle -- confirm against the intended behavior.
                if len(text) < 100:
                    chapter_start_idx = elem_idx
                    break
            elif 'heading' in style or 'title' in style:
                chapter_start_idx = elem_idx
                start_level = heading_level(style) or 1
                break

        if chapter_start_idx is None:
            return None  # Chapter heading not found

        # The chapter runs to the end of the document unless a title or a
        # heading of the same or a higher level (smaller number) follows.
        chapter_end_idx = total - 1
        for elem_idx in range(chapter_start_idx + 1, total):
            element = body[elem_idx]
            if not is_paragraph(element):
                continue
            style = style_id(element)
            if style is None:
                continue
            level = heading_level(style)
            if 'title' in style or (level is not None and level <= start_level):
                chapter_end_idx = elem_idx - 1
                break

        return {
            'start_idx': chapter_start_idx,
            'end_idx': chapter_end_idx,
            'chapter_name': chapter_name,
            'note': f"Extracting content for chapter '{chapter_name}' (elements {chapter_start_idx}-{chapter_end_idx})"
        }

    except Exception:
        # Deliberate best-effort: callers treat None as "chapter not found".
        return None
|
||||||
|
|
||||||
|
|
||||||
|
async def _get_available_headings(doc, max_elements: int = 100, max_headings: int = 20) -> list[str]:
    """List heading texts found near the start of the document.

    A paragraph counts as a heading when its paragraph style id contains
    "heading" or "title" (text truncated to 100 characters), or when it has no
    paragraph style, is shorter than 100 characters and contains one of the
    keywords "chapter", "section", "part", "introduction" or "conclusion".

    Args:
        doc: Document object exposing ``doc.element.body`` as an iterable of
            WordprocessingML elements.
        max_elements: Number of leading body elements to scan; ``None`` scans
            the whole body.  The default keeps the scan (and response) small.
        max_headings: Maximum number of headings returned.

    Returns:
        Heading texts in document order; an empty list when none are found or
        the document cannot be inspected.
    """
    from itertools import islice  # function-scope import, like other helpers here

    w_ns = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    text_tag = w_ns + 't'
    style_tag = w_ns + 'pStyle'
    val_attr = w_ns + 'val'
    heading_keywords = ('chapter', 'section', 'part', 'introduction', 'conclusion')

    headings: list[str] = []
    try:
        for element in islice(doc.element.body, max_elements):
            if len(headings) >= max_headings:
                break

            tag = getattr(element, 'tag', None)
            # Comment / processing-instruction nodes have a non-string tag.
            if not isinstance(tag, str) or not tag.endswith('}p'):
                continue

            # NOTE: tag-based iter() is used instead of
            # xpath(..., namespaces=...): python-docx's BaseOxmlElement.xpath()
            # accepts only the expression string, so the keyword argument
            # raised TypeError and every paragraph was silently skipped.
            text = ''.join(node.text or '' for node in element.iter(text_tag)).strip()
            if not text:
                continue

            style_node = next(element.iter(style_tag), None)
            if style_node is not None:
                style = style_node.get(val_attr, '').lower()
                if 'heading' in style or 'title' in style:
                    headings.append(text[:100])  # Limit heading length
            elif len(text) < 100:
                # NOTE(review): keyword fallback is applied to unstyled
                # paragraphs only; this assumes the original `elif` paired
                # with the style-presence check -- confirm.
                lowered = text.lower()
                if any(word in lowered for word in heading_keywords):
                    headings.append(text)

        return headings[:max_headings]

    except Exception:
        # Deliberate best-effort: heading hints are optional metadata.
        return []
|
||||||
|
|
||||||
|
|
||||||
async def _get_ultra_fast_summary(file_path: str) -> dict[str, Any]:
|
async def _get_ultra_fast_summary(file_path: str) -> dict[str, Any]:
|
||||||
"""Ultra-fast summary that extracts minimal data to prevent MCP token limits."""
|
"""Ultra-fast summary that extracts minimal data to prevent MCP token limits."""
|
||||||
try:
|
try:
|
||||||
@ -1719,6 +1844,9 @@ async def _get_ultra_fast_summary(file_path: str) -> dict[str, Any]:
|
|||||||
# Create very basic summary
|
# Create very basic summary
|
||||||
summary_content = "\n\n".join(content_parts)
|
summary_content = "\n\n".join(content_parts)
|
||||||
|
|
||||||
|
# Extract available headings for chapter navigation
|
||||||
|
available_headings = await _get_available_headings(doc)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"content": summary_content,
|
"content": summary_content,
|
||||||
"method_used": "ultra-fast-summary",
|
"method_used": "ultra-fast-summary",
|
||||||
@ -1727,7 +1855,10 @@ async def _get_ultra_fast_summary(file_path: str) -> dict[str, Any]:
|
|||||||
"basic_info": f"Document has ~{total_paragraphs} paragraphs, {total_tables} tables, {heading_count} headings found in first scan",
|
"basic_info": f"Document has ~{total_paragraphs} paragraphs, {total_tables} tables, {heading_count} headings found in first scan",
|
||||||
"bookmarks": bookmarks[:20] if bookmarks else [], # Limit to first 20 bookmarks
|
"bookmarks": bookmarks[:20] if bookmarks else [], # Limit to first 20 bookmarks
|
||||||
"bookmark_count": len(bookmarks),
|
"bookmark_count": len(bookmarks),
|
||||||
"bookmark_note": "Bookmarks often indicate chapter starts. Use these as navigation hints for page_range extraction."
|
"bookmark_note": "Bookmarks often indicate chapter starts. Use these as navigation hints for page_range extraction.",
|
||||||
|
"available_headings": available_headings[:10] if available_headings else [], # Limit to first 10 headings
|
||||||
|
"heading_count": len(available_headings),
|
||||||
|
"heading_note": "Use these headings with chapter_name parameter for chapter-based extraction when bookmarks are not available."
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user