Complete architecture cleanup - eliminated duplicate server files:

- Deleted server_monolithic.py (2249 lines)
- Deleted server_legacy.py (2209 lines)

New utility modules created:

- utils/word_processing.py - Word extraction/conversion (preserves page range fixes)
- utils/excel_processing.py - Excel extraction
- utils/powerpoint_processing.py - PowerPoint extraction
- utils/processing.py - Universal helpers (parse_page_range, health checks, etc.)

Updated mixins to import from utils instead of server_monolithic (sketched below). The entry point remains server.py (48 lines), using the mixin architecture. All 53 tests pass; coverage improved from 11% to 22% by removing duplicate code.
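For illustration only, the per-mixin import change looks roughly like this (the mixin class and method names are hypothetical; the helper names are the real functions defined in utils/processing.py below):

    # Before (removed): helpers were imported from the deleted monolith
    # from server_monolithic import _parse_page_range, _smart_truncate_content

    # After: helpers come from the shared utils modules
    from utils.processing import _parse_page_range, _smart_truncate_content

    class WordToolsMixin:  # hypothetical mixin name
        def _resolve_pages(self, page_range: str) -> list[int]:
            # Delegate page-range parsing to the shared helper
            return _parse_page_range(page_range)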
"""Universal processing helper functions for Office documents.
|
|
|
|
This module contains helper functions used across different document processing
|
|
operations including metadata extraction, health scoring, content truncation,
|
|
and page range parsing.
|
|
"""
|
|
|
|
import os
|
|
import tempfile
|
|
from typing import Any
|
|
|
|
# Configuration
|
|
TEMP_DIR = os.environ.get("OFFICE_TEMP_DIR", tempfile.gettempdir())
|
|
DEBUG = os.environ.get("DEBUG", "false").lower() == "true"
|
|
|
|
|
|
async def _extract_basic_metadata(file_path: str, extension: str, category: str) -> dict[str, Any]:
    """Extract basic metadata from Office documents."""
    metadata = {"category": category, "extension": extension}

    try:
        if extension in [".docx", ".xlsx", ".pptx"] and category in ["word", "excel", "powerpoint"]:
            import zipfile

            with zipfile.ZipFile(file_path, 'r') as zip_file:
                # Core properties: the read/decode only verifies the entry is
                # readable; the XML content itself is not parsed here.
                if 'docProps/core.xml' in zip_file.namelist():
                    zip_file.read('docProps/core.xml').decode('utf-8')
                    metadata["has_core_properties"] = True

                # App properties: same readability check as above.
                if 'docProps/app.xml' in zip_file.namelist():
                    zip_file.read('docProps/app.xml').decode('utf-8')
                    metadata["has_app_properties"] = True

    except Exception:
        # Metadata extraction is best-effort: corrupt or unreadable files
        # still return the basic category/extension metadata.
        pass

    return metadata

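
# Example result (illustrative) for a well-formed .docx file:
#   {"category": "word", "extension": ".docx",
#    "has_core_properties": True, "has_app_properties": True}
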
def _calculate_health_score(validation: dict[str, Any], format_info: dict[str, Any]) -> int:
    """Calculate document health score (1-10)."""
    score = 10

    # Deduct for validation errors
    if not validation["is_valid"]:
        score -= 5

    if validation["errors"]:
        score -= len(validation["errors"]) * 2

    if validation["warnings"]:
        score -= len(validation["warnings"])

    # Deduct for problematic characteristics
    if validation.get("password_protected"):
        score -= 1

    if format_info.get("is_legacy"):
        score -= 1

    structure = format_info.get("structure", {})
    if structure.get("estimated_complexity") == "complex":
        score -= 1

    return max(1, min(10, score))

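
# Worked example for _calculate_health_score (illustrative): a document that
# validates cleanly but carries one warning and uses a legacy format scores
# 10 - 1 (warning) - 1 (legacy) = 8; results are always clamped to 1-10.
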
def _get_health_recommendations(validation: dict[str, Any], format_info: dict[str, Any]) -> list[str]:
    """Get health improvement recommendations."""
    recommendations = []

    if validation["errors"]:
        recommendations.append("Fix validation errors before processing")

    if validation.get("password_protected"):
        recommendations.append("Remove password protection if possible")

    if format_info.get("is_legacy"):
        recommendations.append("Consider converting to modern format (.docx, .xlsx, .pptx)")

    structure = format_info.get("structure", {})
    if structure.get("estimated_complexity") == "complex":
        recommendations.append("Complex document may require specialized processing")

    if not recommendations:
        recommendations.append("Document appears healthy and ready for processing")

    return recommendations

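
# Example (illustrative): a password-protected legacy file with no validation
# errors yields:
#   ["Remove password protection if possible",
#    "Consider converting to modern format (.docx, .xlsx, .pptx)"]
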
def _smart_truncate_content(content: str, max_chars: int) -> str:
    """Intelligently truncate content while preserving structure and readability."""
    if len(content) <= max_chars:
        return content

    lines = content.split('\n')
    truncated_lines = []
    current_length = 0

    # Try to preserve structure by stopping at a natural break point
    for line in lines:
        line_length = len(line) + 1  # +1 for newline

        # If adding this line would exceed the limit, stop here
        if current_length + line_length > max_chars:
            if truncated_lines:
                # If we stopped mid-paragraph, drop the incomplete paragraph:
                # pop lines until the tail is a blank line, heading, table row,
                # or list item (a natural break point).
                if not (line.strip() == '' or line.startswith('#') or line.startswith('|')):
                    while truncated_lines and not (
                        truncated_lines[-1].strip() == '' or
                        truncated_lines[-1].startswith('#') or
                        truncated_lines[-1].startswith('|') or
                        truncated_lines[-1].startswith('-') or
                        truncated_lines[-1].startswith('*')
                    ):
                        truncated_lines.pop()
            break

        truncated_lines.append(line)
        current_length += line_length

    # Append a truncation notice; the character count reflects the kept portion
    result = '\n'.join(truncated_lines)
    result += (
        f"\n\n---\n**[CONTENT TRUNCATED]**\n"
        f"Showing {len(result):,} of {len(content):,} characters.\n"
        f"Use smaller page ranges (e.g., 3-5 pages) for full content without truncation.\n---"
    )

    return result

def _parse_page_range(page_range: str) -> list[int]:
    """Parse page range string into a sorted list of page numbers.

    Malformed parts are skipped silently.

    Examples:
        "1-5" -> [1, 2, 3, 4, 5]
        "1,3,5" -> [1, 3, 5]
        "1-3,5,7-9" -> [1, 2, 3, 5, 7, 8, 9]
    """
    pages = set()

    for part in page_range.split(','):
        part = part.strip()
        if '-' in part:
            # Handle a range like "1-5"
            start, end = part.split('-', 1)
            try:
                start_num = int(start.strip())
                end_num = int(end.strip())
                pages.update(range(start_num, end_num + 1))
            except ValueError:
                continue
        else:
            # Handle a single page like "3"
            try:
                pages.add(int(part))
            except ValueError:
                continue

    return sorted(pages)

def _get_processing_recommendation(
    doc_analysis: dict[str, Any],
    page_range: str,
    summary_only: bool
) -> dict[str, Any]:
    """Generate intelligent processing recommendations based on document analysis."""

    estimated_pages = doc_analysis["estimated_pages"]
    content_size = doc_analysis["estimated_content_size"]

    recommendation = {
        "status": "optimal",
        "message": "",
        "suggested_workflow": [],
        "warnings": []
    }

    # Large document recommendations
    if content_size in ["large", "very_large"] and not page_range and not summary_only:
        recommendation["status"] = "suboptimal"
        recommendation["message"] = (
            f"⚠️ Large document detected ({estimated_pages} estimated pages). "
            "Consider using recommended workflow for better performance."
        )
        recommendation["suggested_workflow"] = [
            "1. First: Call with summary_only=true to get document overview and TOC",
            "2. Then: Use page_range to process specific sections (e.g., '1-5', '6-10', '15-20')",
            "3. Recommended: Use 3-8 page chunks to stay under 25k token MCP limit",
            "4. The tool auto-truncates if content is too large, but smaller ranges work better"
        ]
        recommendation["warnings"] = [
            "Page ranges >8 pages may hit 25k token response limit and get truncated",
            "Use smaller page ranges (3-5 pages) for dense content documents",
            "Auto-truncation preserves structure but loses content completeness"
        ]

    # Medium document recommendations
    elif content_size == "medium" and not page_range and not summary_only:
        recommendation["status"] = "caution"
        recommendation["message"] = (
            f"Medium document detected ({estimated_pages} estimated pages). "
            "Consider summary_only=true first if you encounter response size issues."
        )
        recommendation["suggested_workflow"] = [
            "Option 1: Try full processing (current approach)",
            "Option 2: Use summary_only=true first, then page_range if needed"
        ]

    # Optimal usage patterns
    elif summary_only:
        recommendation["message"] = "✅ Excellent! Using summary mode for initial document analysis."
        recommendation["suggested_workflow"] = [
            "After reviewing summary, use page_range to extract specific sections of interest"
        ]

    elif page_range and content_size in ["large", "very_large"]:
        recommendation["message"] = "✅ Perfect! Using page-range processing for efficient extraction."

    elif content_size == "small":
        recommendation["message"] = "✅ Small document - full processing is optimal."

    return recommendation
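

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the original module); run
# directly, e.g. `python utils/processing.py`.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Page-range parsing: mixed ranges and single pages, duplicates collapsed
    assert _parse_page_range("1-3,5,7-9") == [1, 2, 3, 5, 7, 8, 9]
    assert _parse_page_range("2,2,1-2") == [1, 2]

    # Smart truncation: long content is cut at a structural break and a
    # truncation notice is appended
    sample = "\n".join(["# Heading", "", "para one " * 50, "", "para two " * 50])
    truncated = _smart_truncate_content(sample, max_chars=200)
    assert "[CONTENT TRUNCATED]" in truncated
    print("processing helpers OK")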