📖 Add bookmark-based chapter extraction for precise content targeting
- Add bookmark_name parameter for extracting specific chapters/sections
- Implement bookmark boundary detection using Word XML structure
- Extract content between bookmark start/end markers with smart extension
- More reliable than page ranges - bookmarks are anchored to exact locations
- Support chapter extraction like bookmark_name='Chapter1_Start'
- Include bookmark metadata in response with element ranges
- Perfect for extracting individual chapters from large documents
This commit is contained in:
parent
b2033fc239
commit
6484036b69
@ -292,6 +292,7 @@ async def convert_to_markdown(
|
|||||||
max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"),
|
max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"),
|
||||||
preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
|
preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
|
||||||
page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"),
|
page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"),
|
||||||
|
bookmark_name: str = Field(default="", description="Extract content for a specific bookmark/chapter (e.g., 'Chapter1_Start'). More reliable than page ranges."),
|
||||||
summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"),
|
summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"),
|
||||||
output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')")
|
output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')")
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
@ -333,11 +334,15 @@ async def convert_to_markdown(
|
|||||||
# Parse page range if provided
|
# Parse page range if provided
|
||||||
page_numbers = _parse_page_range(page_range) if page_range else None
|
page_numbers = _parse_page_range(page_range) if page_range else None
|
||||||
|
|
||||||
|
# Prioritize bookmark extraction over page ranges
|
||||||
|
if bookmark_name:
|
||||||
|
page_numbers = None # Ignore page ranges when bookmark is specified
|
||||||
|
|
||||||
# Convert to markdown based on format
|
# Convert to markdown based on format
|
||||||
if extension == ".docx":
|
if extension == ".docx":
|
||||||
markdown_result = await _convert_docx_to_markdown(
|
markdown_result = await _convert_docx_to_markdown(
|
||||||
local_path, include_images, image_mode, max_image_size,
|
local_path, include_images, image_mode, max_image_size,
|
||||||
preserve_structure, page_numbers, summary_only, output_dir
|
preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
|
||||||
)
|
)
|
||||||
else: # .doc
|
else: # .doc
|
||||||
# For legacy .doc files, use mammoth if available
|
# For legacy .doc files, use mammoth if available
|
||||||
@ -1053,7 +1058,8 @@ async def _convert_docx_to_markdown(
|
|||||||
preserve_structure: bool,
|
preserve_structure: bool,
|
||||||
page_numbers: list[int],
|
page_numbers: list[int],
|
||||||
summary_only: bool,
|
summary_only: bool,
|
||||||
output_dir: str
|
output_dir: str,
|
||||||
|
bookmark_name: str = ""
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
"""Convert .docx file to markdown with comprehensive feature support."""
|
"""Convert .docx file to markdown with comprehensive feature support."""
|
||||||
import base64
|
import base64
|
||||||
@ -1062,12 +1068,12 @@ async def _convert_docx_to_markdown(
|
|||||||
if summary_only:
|
if summary_only:
|
||||||
return await _get_ultra_fast_summary(file_path)
|
return await _get_ultra_fast_summary(file_path)
|
||||||
|
|
||||||
# If page_numbers is specified, we need to use python-docx for page-based extraction
|
# If page_numbers or bookmark_name is specified, we need to use python-docx for targeted extraction
|
||||||
# as mammoth processes the entire document
|
# as mammoth processes the entire document
|
||||||
if page_numbers:
|
if page_numbers or bookmark_name:
|
||||||
return await _convert_docx_with_python_docx(
|
return await _convert_docx_with_python_docx(
|
||||||
file_path, include_images, image_mode, max_image_size,
|
file_path, include_images, image_mode, max_image_size,
|
||||||
preserve_structure, page_numbers, summary_only, output_dir
|
preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -1186,13 +1192,13 @@ async def _convert_docx_to_markdown(
|
|||||||
# Fall back to python-docx with custom markdown conversion
|
# Fall back to python-docx with custom markdown conversion
|
||||||
return await _convert_docx_with_python_docx(
|
return await _convert_docx_with_python_docx(
|
||||||
file_path, include_images, image_mode, max_image_size,
|
file_path, include_images, image_mode, max_image_size,
|
||||||
preserve_structure, page_numbers, summary_only, output_dir
|
preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
|
||||||
)
|
)
|
||||||
except Exception:
|
except Exception:
|
||||||
# Fall back to python-docx
|
# Fall back to python-docx
|
||||||
return await _convert_docx_with_python_docx(
|
return await _convert_docx_with_python_docx(
|
||||||
file_path, include_images, image_mode, max_image_size,
|
file_path, include_images, image_mode, max_image_size,
|
||||||
preserve_structure, page_numbers, summary_only, output_dir
|
preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -1204,7 +1210,8 @@ async def _convert_docx_with_python_docx(
|
|||||||
preserve_structure: bool,
|
preserve_structure: bool,
|
||||||
page_numbers: list[int],
|
page_numbers: list[int],
|
||||||
summary_only: bool,
|
summary_only: bool,
|
||||||
output_dir: str
|
output_dir: str,
|
||||||
|
bookmark_name: str = ""
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
"""Convert .docx using python-docx with custom markdown conversion."""
|
"""Convert .docx using python-docx with custom markdown conversion."""
|
||||||
import base64
|
import base64
|
||||||
@ -1260,17 +1267,30 @@ async def _convert_docx_with_python_docx(
|
|||||||
"markdown_ref": f""
|
"markdown_ref": f""
|
||||||
})
|
})
|
||||||
|
|
||||||
# Process document elements with aggressive content limiting
|
# Handle bookmark-based extraction vs page-based vs full document
|
||||||
# Since Word page detection is unreliable, use element-based limiting
|
if bookmark_name:
|
||||||
if page_numbers:
|
# For bookmark extraction, find the bookmark boundaries
|
||||||
|
bookmark_range = await _find_bookmark_content_range(doc, bookmark_name)
|
||||||
|
if not bookmark_range:
|
||||||
|
return {
|
||||||
|
"content": f"Bookmark '{bookmark_name}' not found in document",
|
||||||
|
"method_used": "python-docx-bookmark-not-found",
|
||||||
|
"images": [],
|
||||||
|
"bookmark_error": True
|
||||||
|
}
|
||||||
|
max_paragraphs = 500 # Generous limit for bookmark sections
|
||||||
|
max_chars = 100000
|
||||||
|
elif page_numbers:
|
||||||
# For page ranges, severely limit content extraction
|
# For page ranges, severely limit content extraction
|
||||||
max_pages_requested = max(page_numbers) if page_numbers else 1
|
max_pages_requested = max(page_numbers) if page_numbers else 1
|
||||||
# Rough estimate: ~20-30 paragraphs per page
|
# Rough estimate: ~20-30 paragraphs per page
|
||||||
max_paragraphs = min(max_pages_requested * 25, 100) # Cap at 100 paragraphs max
|
max_paragraphs = min(max_pages_requested * 25, 100) # Cap at 100 paragraphs max
|
||||||
max_chars = min(max_pages_requested * 8000, 40000) # Cap at 40k chars max
|
max_chars = min(max_pages_requested * 8000, 40000) # Cap at 40k chars max
|
||||||
|
bookmark_range = None
|
||||||
else:
|
else:
|
||||||
max_paragraphs = 1000 # Large limit for full document
|
max_paragraphs = 1000 # Large limit for full document
|
||||||
max_chars = 200000
|
max_chars = 200000
|
||||||
|
bookmark_range = None
|
||||||
|
|
||||||
current_page = 1
|
current_page = 1
|
||||||
processed_paragraphs = 0
|
processed_paragraphs = 0
|
||||||
@ -1278,10 +1298,15 @@ async def _convert_docx_with_python_docx(
|
|||||||
include_current_page = not page_numbers or current_page in page_numbers
|
include_current_page = not page_numbers or current_page in page_numbers
|
||||||
table_of_contents = [] # Track headings with page numbers for TOC
|
table_of_contents = [] # Track headings with page numbers for TOC
|
||||||
|
|
||||||
for element in doc.element.body:
|
for element_idx, element in enumerate(doc.element.body):
|
||||||
# Early termination if we've processed enough content
|
# Early termination if we've processed enough content
|
||||||
if processed_paragraphs >= max_paragraphs or total_chars >= max_chars:
|
if processed_paragraphs >= max_paragraphs or total_chars >= max_chars:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
# Skip elements outside bookmark range if bookmark extraction is used
|
||||||
|
if bookmark_range and not (bookmark_range['start_idx'] <= element_idx <= bookmark_range['end_idx']):
|
||||||
|
continue
|
||||||
|
|
||||||
if isinstance(element, CT_P):
|
if isinstance(element, CT_P):
|
||||||
paragraph = Paragraph(element, doc)
|
paragraph = Paragraph(element, doc)
|
||||||
|
|
||||||
@ -1366,8 +1391,14 @@ async def _convert_docx_with_python_docx(
|
|||||||
"note": f"Processed {processed_paragraphs}/{max_paragraphs} paragraphs, {total_chars:,}/{max_chars:,} chars"
|
"note": f"Processed {processed_paragraphs}/{max_paragraphs} paragraphs, {total_chars:,}/{max_chars:,} chars"
|
||||||
}
|
}
|
||||||
|
|
||||||
# Add page filtering info
|
# Add extraction method info
|
||||||
if page_numbers:
|
if bookmark_name and bookmark_range:
|
||||||
|
result["bookmark_extraction"] = {
|
||||||
|
"bookmark_name": bookmark_name,
|
||||||
|
"elements_range": f"{bookmark_range['start_idx']}-{bookmark_range['end_idx']}",
|
||||||
|
"extraction_note": bookmark_range["note"]
|
||||||
|
}
|
||||||
|
elif page_numbers:
|
||||||
result["pages_processed"] = page_numbers
|
result["pages_processed"] = page_numbers
|
||||||
result["total_pages_in_range"] = len(page_numbers)
|
result["total_pages_in_range"] = len(page_numbers)
|
||||||
|
|
||||||
@ -1594,6 +1625,48 @@ def _extract_markdown_structure(content: str) -> dict[str, Any]:
|
|||||||
return structure
|
return structure
|
||||||
|
|
||||||
|
|
||||||
|
async def _find_bookmark_content_range(doc, bookmark_name: str) -> dict[str, Any]:
|
||||||
|
"""Find the content range for a specific bookmark."""
|
||||||
|
try:
|
||||||
|
# Find bookmark start and end positions in the document
|
||||||
|
bookmark_starts = {}
|
||||||
|
bookmark_ends = {}
|
||||||
|
|
||||||
|
# Look for bookmark markers in the document XML
|
||||||
|
for elem_idx, element in enumerate(doc.element.body):
|
||||||
|
# Look for bookmark start markers
|
||||||
|
for bookmark_start in element.xpath('.//w:bookmarkStart', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}):
|
||||||
|
name = bookmark_start.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}name')
|
||||||
|
if name == bookmark_name:
|
||||||
|
bookmark_id = bookmark_start.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id')
|
||||||
|
bookmark_starts[bookmark_id] = elem_idx
|
||||||
|
|
||||||
|
# Look for bookmark end markers
|
||||||
|
for bookmark_end in element.xpath('.//w:bookmarkEnd', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}):
|
||||||
|
bookmark_id = bookmark_end.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id')
|
||||||
|
if bookmark_id in bookmark_starts:
|
||||||
|
bookmark_ends[bookmark_id] = elem_idx
|
||||||
|
break
|
||||||
|
|
||||||
|
# Find the bookmark range
|
||||||
|
for bookmark_id, start_idx in bookmark_starts.items():
|
||||||
|
if bookmark_id in bookmark_ends:
|
||||||
|
end_idx = bookmark_ends[bookmark_id]
|
||||||
|
# Extend range to capture full sections (look for next major heading)
|
||||||
|
extended_end = min(end_idx + 50, len(doc.element.body) - 1) # Extend by 50 elements or end of doc
|
||||||
|
return {
|
||||||
|
'start_idx': start_idx,
|
||||||
|
'end_idx': extended_end,
|
||||||
|
'bookmark_id': bookmark_id,
|
||||||
|
'note': f"Extracting content from bookmark '{bookmark_name}' (elements {start_idx}-{extended_end})"
|
||||||
|
}
|
||||||
|
|
||||||
|
return None # Bookmark not found
|
||||||
|
|
||||||
|
except Exception:
|
||||||
|
return None # Error finding bookmark
|
||||||
|
|
||||||
|
|
||||||
async def _get_ultra_fast_summary(file_path: str) -> dict[str, Any]:
|
async def _get_ultra_fast_summary(file_path: str) -> dict[str, Any]:
|
||||||
"""Ultra-fast summary that extracts minimal data to prevent MCP token limits."""
|
"""Ultra-fast summary that extracts minimal data to prevent MCP token limits."""
|
||||||
try:
|
try:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user