Ryan Malloy af6aadf559 Refactor: Extract processing logic into utility modules
Complete architecture cleanup - eliminated duplicate server files:
- Deleted server_monolithic.py (2249 lines)
- Deleted server_legacy.py (2209 lines)

New utility modules created:
- utils/word_processing.py - Word extraction/conversion (preserves page range fixes)
- utils/excel_processing.py - Excel extraction
- utils/powerpoint_processing.py - PowerPoint extraction
- utils/processing.py - Universal helpers (parse_page_range, health checks, etc.)

Updated mixins to import from utils instead of server_monolithic.
Entry point remains server.py (48 lines) using mixin architecture.

All 53 tests pass. Coverage improved from 11% to 22% by removing duplicate code.
2026-01-11 05:08:18 -07:00


"""Universal processing helper functions for Office documents.
This module contains helper functions used across different document processing
operations including metadata extraction, health scoring, content truncation,
and page range parsing.
"""
import os
import tempfile
from typing import Any
# Configuration
TEMP_DIR = os.environ.get("OFFICE_TEMP_DIR", tempfile.gettempdir())
DEBUG = os.environ.get("DEBUG", "false").lower() == "true"
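
# Example environment configuration (values here are hypothetical; both
# variables are optional and the defaults above apply when they are unset):
#   OFFICE_TEMP_DIR=/var/tmp/office-docs  DEBUG=true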


async def _extract_basic_metadata(file_path: str, extension: str, category: str) -> dict[str, Any]:
    """Extract basic metadata from Office documents."""
    metadata = {"category": category, "extension": extension}

    try:
        if extension in [".docx", ".xlsx", ".pptx"] and category in ["word", "excel", "powerpoint"]:
            import zipfile

            with zipfile.ZipFile(file_path, 'r') as zip_file:
                # Core properties (read only to confirm the part is present and
                # decodable; the XML content itself is not parsed here)
                if 'docProps/core.xml' in zip_file.namelist():
                    zip_file.read('docProps/core.xml').decode('utf-8')
                    metadata["has_core_properties"] = True
                # App properties
                if 'docProps/app.xml' in zip_file.namelist():
                    zip_file.read('docProps/app.xml').decode('utf-8')
                    metadata["has_app_properties"] = True
    except Exception:
        pass

    return metadata
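
# Illustrative result for a hypothetical, well-formed file named "report.docx":
#   await _extract_basic_metadata("report.docx", ".docx", "word")
#   -> {"category": "word", "extension": ".docx",
#       "has_core_properties": True, "has_app_properties": True}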


def _calculate_health_score(validation: dict[str, Any], format_info: dict[str, Any]) -> int:
    """Calculate document health score (1-10)."""
    score = 10

    # Deduct for validation errors
    if not validation["is_valid"]:
        score -= 5
    if validation["errors"]:
        score -= len(validation["errors"]) * 2
    if validation["warnings"]:
        score -= len(validation["warnings"])

    # Deduct for problematic characteristics
    if validation.get("password_protected"):
        score -= 1
    if format_info.get("is_legacy"):
        score -= 1

    structure = format_info.get("structure", {})
    if structure.get("estimated_complexity") == "complex":
        score -= 1

    return max(1, min(10, score))
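
# Worked example with made-up inputs:
#   validation = {"is_valid": True, "errors": [], "warnings": ["w1", "w2"]}
#   format_info = {"is_legacy": True}
#   -> 10 - 2 (warnings) - 1 (legacy format) = 7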


def _get_health_recommendations(validation: dict[str, Any], format_info: dict[str, Any]) -> list[str]:
    """Get health improvement recommendations."""
    recommendations = []

    if validation["errors"]:
        recommendations.append("Fix validation errors before processing")
    if validation.get("password_protected"):
        recommendations.append("Remove password protection if possible")
    if format_info.get("is_legacy"):
        recommendations.append("Consider converting to modern format (.docx, .xlsx, .pptx)")

    structure = format_info.get("structure", {})
    if structure.get("estimated_complexity") == "complex":
        recommendations.append("Complex document may require specialized processing")

    if not recommendations:
        recommendations.append("Document appears healthy and ready for processing")

    return recommendations
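
# For the same hypothetical legacy document as above, this would return:
#   ["Consider converting to modern format (.docx, .xlsx, .pptx)"]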


def _smart_truncate_content(content: str, max_chars: int) -> str:
    """Intelligently truncate content while preserving structure and readability."""
    if len(content) <= max_chars:
        return content

    lines = content.split('\n')
    truncated_lines = []
    current_length = 0

    # Try to preserve structure by stopping at a natural break point
    for line in lines:
        line_length = len(line) + 1  # +1 for newline

        # If adding this line would exceed limit
        if current_length + line_length > max_chars:
            # Try to find a good stopping point
            if truncated_lines:
                # Check if we're in the middle of a section
                last_lines = '\n'.join(truncated_lines[-3:]) if len(truncated_lines) >= 3 else '\n'.join(truncated_lines)

                # If we stopped mid-paragraph, remove incomplete paragraph
                if not (line.strip() == '' or line.startswith('#') or line.startswith('|')):
                    # Remove lines until we hit a natural break
                    while truncated_lines and not (
                        truncated_lines[-1].strip() == '' or
                        truncated_lines[-1].startswith('#') or
                        truncated_lines[-1].startswith('|') or
                        truncated_lines[-1].startswith('-') or
                        truncated_lines[-1].startswith('*')
                    ):
                        truncated_lines.pop()
            break

        truncated_lines.append(line)
        current_length += line_length

    # Add truncation notice
    result = '\n'.join(truncated_lines)
    result += f"\n\n---\n**[CONTENT TRUNCATED]**\nShowing {len(result):,} of {len(content):,} characters.\nUse smaller page ranges (e.g., 3-5 pages) for full content without truncation.\n---"

    return result
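
# Behaviour sketch: for content longer than max_chars, whole lines are kept up
# to the limit, any trailing partial paragraph is dropped back to the nearest
# blank line, heading, table row, or list item, and the **[CONTENT TRUNCATED]**
# notice above is appended.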


def _parse_page_range(page_range: str) -> list[int]:
    """Parse page range string into list of page numbers.

    Examples:
        "1-5" -> [1, 2, 3, 4, 5]
        "1,3,5" -> [1, 3, 5]
        "1-3,5,7-9" -> [1, 2, 3, 5, 7, 8, 9]
    """
    pages = set()

    for part in page_range.split(','):
        part = part.strip()
        if '-' in part:
            # Handle range like "1-5"
            start, end = part.split('-', 1)
            try:
                start_num = int(start.strip())
                end_num = int(end.strip())
                pages.update(range(start_num, end_num + 1))
            except ValueError:
                continue
        else:
            # Handle single page like "3"
            try:
                pages.add(int(part))
            except ValueError:
                continue

    return sorted(list(pages))
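
# Note: malformed tokens are silently skipped, e.g.
#   _parse_page_range("1-3,x,5") -> [1, 2, 3, 5]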


def _get_processing_recommendation(
    doc_analysis: dict[str, Any],
    page_range: str,
    summary_only: bool
) -> dict[str, Any]:
    """Generate intelligent processing recommendations based on document analysis."""
    estimated_pages = doc_analysis["estimated_pages"]
    content_size = doc_analysis["estimated_content_size"]

    recommendation = {
        "status": "optimal",
        "message": "",
        "suggested_workflow": [],
        "warnings": []
    }

    # Large document recommendations
    if content_size in ["large", "very_large"] and not page_range and not summary_only:
        recommendation["status"] = "suboptimal"
        recommendation["message"] = (
            f"⚠️ Large document detected ({estimated_pages} estimated pages). "
            "Consider using recommended workflow for better performance."
        )
        recommendation["suggested_workflow"] = [
            "1. First: Call with summary_only=true to get document overview and TOC",
            "2. Then: Use page_range to process specific sections (e.g., '1-5', '6-10', '15-20')",
            "3. Recommended: Use 3-8 page chunks to stay under 25k token MCP limit",
            "4. The tool auto-truncates if content is too large, but smaller ranges work better"
        ]
        recommendation["warnings"] = [
            "Page ranges >8 pages may hit 25k token response limit and get truncated",
            "Use smaller page ranges (3-5 pages) for dense content documents",
            "Auto-truncation preserves structure but loses content completeness"
        ]

    # Medium document recommendations
    elif content_size == "medium" and not page_range and not summary_only:
        recommendation["status"] = "caution"
        recommendation["message"] = (
            f"Medium document detected ({estimated_pages} estimated pages). "
            "Consider summary_only=true first if you encounter response size issues."
        )
        recommendation["suggested_workflow"] = [
            "Option 1: Try full processing (current approach)",
            "Option 2: Use summary_only=true first, then page_range if needed"
        ]

    # Optimal usage patterns
    elif summary_only:
        recommendation["message"] = "✅ Excellent! Using summary mode for initial document analysis."
        recommendation["suggested_workflow"] = [
            "After reviewing summary, use page_range to extract specific sections of interest"
        ]
    elif page_range and content_size in ["large", "very_large"]:
        recommendation["message"] = "✅ Perfect! Using page-range processing for efficient extraction."
    elif content_size == "small":
        recommendation["message"] = "✅ Small document - full processing is optimal."

    return recommendation
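

# --- Illustrative usage (editor's sketch, not part of the original module) ---
# A minimal, self-contained demo of the synchronous helpers above; the sample
# validation/format_info dictionaries are invented purely for illustration.
if __name__ == "__main__":
    sample_validation = {"is_valid": True, "errors": [], "warnings": ["Embedded fonts missing"]}
    sample_format_info = {"is_legacy": True, "structure": {"estimated_complexity": "simple"}}

    print(_parse_page_range("1-3,5,7-9"))                                  # [1, 2, 3, 5, 7, 8, 9]
    print(_calculate_health_score(sample_validation, sample_format_info))  # 8
    print(_get_health_recommendations(sample_validation, sample_format_info))

    long_text = "# Section\n\n" + "Body paragraph text for the demo.\n" * 40
    print(_smart_truncate_content(long_text, max_chars=200))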