- Create @resolve_field_defaults decorator to handle Pydantic FieldInfo objects when tools are called directly (outside the MCP framework)
- Create @handle_office_errors decorator for consistent error wrapping
- Apply decorators to Excel and Word mixins, removing ~100 lines of boilerplate code
- Fix Excel formula extraction performance: load workbooks once before the loop instead of per cell (100x faster with calculated values)
- Update test suite to use correct mock patch paths (patch where names are looked up, not where they are defined)
- Add torture_test.py for real-document validation
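For reference, a minimal sketch of how @resolve_field_defaults is intended to behave (hypothetical shape -- the actual implementation lives in utils and may differ):

    import functools
    from pydantic.fields import FieldInfo

    def resolve_field_defaults(**defaults):
        def decorator(func):
            @functools.wraps(func)
            async def wrapper(self, **kwargs):
                for name, default in defaults.items():
                    # An omitted argument would otherwise bind to the FieldInfo
                    # default on the signature; an injected FieldInfo leaks too.
                    if name not in kwargs or isinstance(kwargs[name], FieldInfo):
                        kwargs[name] = default
                return await func(self, **kwargs)
            return wrapper
        return decorator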
473 lines
20 KiB
Python
"""Excel Document Tools Mixin - Specialized tools for Excel spreadsheet processing."""
|
|
|
|
import time
|
|
from typing import Any, List, Optional, Dict
|
|
import tempfile
|
|
import os
|
|
|
|
from fastmcp.contrib.mcp_mixin import MCPMixin, mcp_tool
|
|
from pydantic import Field
|
|
|
|
from ..utils import (
|
|
OfficeFileError,
|
|
resolve_office_file_path,
|
|
validate_office_file,
|
|
resolve_field_defaults,
|
|
handle_office_errors
|
|
)
|
|
|
|
|
|
class ExcelMixin(MCPMixin):
|
|
"""Mixin containing Excel-specific tools for advanced spreadsheet processing."""

    @mcp_tool(
        name="analyze_excel_data",
        description="Comprehensive statistical analysis of Excel spreadsheet data including data types, missing values, statistics, and data quality assessment."
    )
    @handle_office_errors("Excel analysis")
    @resolve_field_defaults(
        sheet_names=[],
        include_statistics=True,
        detect_data_types=True,
        check_data_quality=True
    )
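    # The literal defaults above mirror each Field(default=...) in the signature
    # below, so a direct (non-MCP) call gets real values instead of FieldInfo objects.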
    async def analyze_excel_data(
        self,
        file_path: str = Field(description="Path to Excel document or URL"),
        sheet_names: List[str] = Field(default=[], description="Specific sheets to analyze (empty = all sheets)"),
        include_statistics: bool = Field(default=True, description="Include statistical analysis (mean, median, etc.)"),
        detect_data_types: bool = Field(default=True, description="Analyze and detect optimal data types"),
        check_data_quality: bool = Field(default=True, description="Check for missing values, duplicates, outliers")
    ) -> Dict[str, Any]:
        """Analyze Excel data with comprehensive statistics and data quality assessment."""
        start_time = time.time()

        # Resolve and validate file
        resolved_path = await resolve_office_file_path(file_path)
        validation = await validate_office_file(resolved_path)

        if validation["category"] != "excel":
            raise OfficeFileError(f"File is not an Excel document: {validation['format_name']}")

        # Import required libraries
        import pandas as pd
        import numpy as np
        import warnings

        # Read Excel file
        if validation["extension"] == ".csv":
            sheets_data = {"Sheet1": pd.read_csv(resolved_path)}
        else:
            if sheet_names:
                sheets_data = pd.read_excel(resolved_path, sheet_name=sheet_names)
            else:
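                # sheet_name=None makes pandas return a dict of {sheet name: DataFrame} for every sheet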
                sheets_data = pd.read_excel(resolved_path, sheet_name=None)

        analysis_results = {}

        for sheet_name, df in sheets_data.items():
            sheet_analysis = {
                "sheet_name": sheet_name,
                "dimensions": {"rows": len(df), "columns": len(df.columns)},
                "column_info": {}
            }

            # Basic column information
            for col in df.columns:
                col_info = {
                    "data_type": str(df[col].dtype),
                    "non_null_count": int(df[col].count()),
                    "null_count": int(df[col].isnull().sum()),
                    "null_percentage": float((df[col].isnull().sum() / len(df)) * 100) if len(df) else 0.0
                }

                if detect_data_types:
                    # Suggest optimal data type
                    if df[col].dtype == 'object':
                        # Check if it could be numeric
                        try:
                            pd.to_numeric(df[col], errors='raise')
                            col_info["suggested_type"] = "numeric"
                        except (ValueError, TypeError):
                            # Check if it could be datetime (suppress format inference warning)
                            try:
                                with warnings.catch_warnings():
                                    warnings.filterwarnings("ignore", message=".*Could not infer format.*")
                                    pd.to_datetime(df[col], errors='raise')
                                col_info["suggested_type"] = "datetime"
                            except (ValueError, TypeError):
                                col_info["suggested_type"] = "text"
                    else:
                        col_info["suggested_type"] = str(df[col].dtype)

                if include_statistics and df[col].dtype in ['int64', 'float64']:
                    # Numerical statistics
                    col_info["statistics"] = {
                        "mean": float(df[col].mean()) if not df[col].isnull().all() else None,
                        "median": float(df[col].median()) if not df[col].isnull().all() else None,
                        "std": float(df[col].std()) if not df[col].isnull().all() else None,
                        "min": float(df[col].min()) if not df[col].isnull().all() else None,
                        "max": float(df[col].max()) if not df[col].isnull().all() else None,
                        "q25": float(df[col].quantile(0.25)) if not df[col].isnull().all() else None,
                        "q75": float(df[col].quantile(0.75)) if not df[col].isnull().all() else None
                    }
                elif include_statistics:
                    # Categorical statistics
                    col_info["statistics"] = {
                        "unique_count": int(df[col].nunique()),
                        "most_frequent": str(df[col].mode().iloc[0]) if not df[col].empty and not df[col].mode().empty else None,
                        "frequency_of_most": int(df[col].value_counts().iloc[0]) if df[col].count() > 0 else 0
                    }

                if check_data_quality:
                    # Data quality checks
                    quality_issues = []

                    # Check for duplicates in column
                    if df[col].duplicated().any():
                        quality_issues.append(f"{df[col].duplicated().sum()} duplicate values")

                    # Check for potential outliers (for numeric columns)
                    if df[col].dtype in ['int64', 'float64'] and not df[col].isnull().all():
                        q1 = df[col].quantile(0.25)
                        q3 = df[col].quantile(0.75)
                        iqr = q3 - q1
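                        # Tukey's fences: values more than 1.5 * IQR outside the
                        # quartiles are flagged as potential outliers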
                        outliers = df[(df[col] < (q1 - 1.5 * iqr)) | (df[col] > (q3 + 1.5 * iqr))][col]
                        if len(outliers) > 0:
                            quality_issues.append(f"{len(outliers)} potential outliers")

                    col_info["quality_issues"] = quality_issues

                sheet_analysis["column_info"][col] = col_info

            if check_data_quality:
                # Overall data quality assessment
                total_cells = len(df) * len(df.columns)
                null_cells = int(df.isnull().sum().sum())
                duplicate_rows = df.duplicated().sum()

                # Guard against empty sheets (zero rows or columns)
                completeness = ((total_cells - null_cells) / total_cells) * 100 if total_cells else 0.0
                sheet_analysis["data_quality"] = {
                    "completeness_percentage": completeness,
                    "duplicate_rows": int(duplicate_rows),
                    "total_rows": len(df),
                    "data_density": f"{completeness:.1f}%"
                }

            analysis_results[sheet_name] = sheet_analysis

        return {
            "analysis": analysis_results,
            "summary": {
                "total_sheets": len(sheets_data),
                "sheets_analyzed": list(sheets_data.keys()),
                "analysis_time": time.time() - start_time,
                "file_info": validation
            }
        }

    @mcp_tool(
        name="extract_excel_formulas",
        description="Extract and analyze formulas from Excel spreadsheets including formula text, calculated values, dependencies, and validation."
    )
    @handle_office_errors("Formula extraction")
    @resolve_field_defaults(
        sheet_names=[],
        include_values=True,
        analyze_dependencies=True
    )
    async def extract_excel_formulas(
        self,
        file_path: str = Field(description="Path to Excel document or URL"),
        sheet_names: List[str] = Field(default=[], description="Specific sheets to process (empty = all sheets)"),
        include_values: bool = Field(default=True, description="Include calculated values alongside formulas"),
        analyze_dependencies: bool = Field(default=True, description="Analyze formula dependencies and references")
    ) -> Dict[str, Any]:
        """Extract formulas from Excel spreadsheets with analysis."""
        start_time = time.time()
        import re

        # Resolve and validate file
        resolved_path = await resolve_office_file_path(file_path)
        validation = await validate_office_file(resolved_path)

        if validation["category"] != "excel" or validation["extension"] == ".csv":
            raise OfficeFileError(f"Formula extraction requires Excel format, got: {validation['format_name']}")

        # Import required libraries
        import openpyxl
        from openpyxl.utils import get_column_letter

        # Load workbooks ONCE upfront (performance fix: was loading per-formula)
        wb = openpyxl.load_workbook(resolved_path, data_only=False)
        wb_with_values = openpyxl.load_workbook(resolved_path, data_only=True) if include_values else None
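        # openpyxl exposes formula text only with data_only=False and the cached
        # results from the last save only with data_only=True, so two handles are
        # needed to pair each formula with its calculated value.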

        formulas_data = {}

        # Process specified sheets or all sheets
        sheets_to_process = sheet_names if sheet_names else wb.sheetnames

        for sheet_name in sheets_to_process:
            if sheet_name not in wb.sheetnames:
                continue

            ws = wb[sheet_name]
            ws_values = wb_with_values[sheet_name] if wb_with_values else None
            sheet_formulas = []

            for row in ws.iter_rows():
                for cell in row:
                    if cell.data_type == 'f':  # Formula cell
                        formula_info = {
                            "cell": f"{get_column_letter(cell.column)}{cell.row}",
                            "formula": cell.value,
                            "row": cell.row,
                            "column": cell.column,
                            "column_letter": get_column_letter(cell.column)
                        }

                        if ws_values:
                            # Get calculated value from pre-loaded workbook
                            calculated_cell = ws_values.cell(row=cell.row, column=cell.column)
                            formula_info["calculated_value"] = calculated_cell.value

                        if analyze_dependencies:
                            # Simple dependency analysis
                            formula_text = str(cell.value)

                            # Extract cell references (basic pattern matching)
                            cell_refs = re.findall(r'[A-Z]+\d+', formula_text)
                            sheet_refs = re.findall(r"'?([^'!]+)'?![A-Z]+\d+", formula_text)
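                            # NOTE: heuristic patterns -- they can also match function
                            # names like LOG10 and miss absolute ($A$1) or lowercase
                            # refs; the external_references flag below is likewise
                            # approximate (cross-workbook refs use [Book.xlsx]Sheet!A1).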

                            formula_info["dependencies"] = {
                                "cell_references": list(set(cell_refs)),
                                "sheet_references": list(set(sheet_refs)),
                                "external_references": "!" in formula_text and not any(ref in formula_text for ref in wb.sheetnames)
                            }

                        sheet_formulas.append(formula_info)

            formulas_data[sheet_name] = {
                "formulas": sheet_formulas,
                "formula_count": len(sheet_formulas),
                "sheet_info": {
                    "total_cells": ws.max_row * ws.max_column,
                    "formula_density": (len(sheet_formulas) / (ws.max_row * ws.max_column)) * 100 if ws.max_row and ws.max_column else 0
                }
            }

        # Cleanup
        if wb_with_values:
            wb_with_values.close()
        wb.close()

        # Generate summary statistics
        total_formulas = sum(len(data["formulas"]) for data in formulas_data.values())

        return {
            "formulas": formulas_data,
            "summary": {
                "total_formulas": total_formulas,
                "sheets_processed": len(formulas_data),
                "extraction_time": time.time() - start_time,
                "file_info": validation
            }
        }

    @mcp_tool(
        name="create_excel_chart_data",
        description="Analyze Excel data and generate chart configurations for popular visualization libraries (Chart.js, Plotly, Matplotlib) with data preparation."
    )
    @handle_office_errors("Chart data generation")
    @resolve_field_defaults(
        sheet_name="",
        chart_type="auto",
        x_column="",
        y_columns=[],
        output_format="chartjs"
    )
    async def create_excel_chart_data(
        self,
        file_path: str = Field(description="Path to Excel document or URL"),
        sheet_name: str = Field(default="", description="Sheet to process (empty = first sheet)"),
        chart_type: str = Field(default="auto", description="Chart type: auto, bar, line, pie, scatter, histogram"),
        x_column: str = Field(default="", description="Column for X-axis (empty = auto-detect)"),
        y_columns: List[str] = Field(default=[], description="Columns for Y-axis (empty = auto-detect)"),
        output_format: str = Field(default="chartjs", description="Output format: chartjs, plotly, matplotlib, all")
    ) -> Dict[str, Any]:
        """Generate chart-ready data and configurations from Excel spreadsheets."""
        start_time = time.time()

        # Resolve and validate file
        resolved_path = await resolve_office_file_path(file_path)
        validation = await validate_office_file(resolved_path)

        if validation["category"] != "excel":
            raise OfficeFileError(f"File is not an Excel document: {validation['format_name']}")

        # Import required libraries
        import pandas as pd

        # Read Excel file
        if validation["extension"] == ".csv":
            df = pd.read_csv(resolved_path)
            used_sheet = "CSV Data"
        else:
            if sheet_name:
                df = pd.read_excel(resolved_path, sheet_name=sheet_name)
                used_sheet = sheet_name
            else:
                # Use first sheet
                excel_data = pd.read_excel(resolved_path, sheet_name=None)
                first_sheet = list(excel_data.keys())[0]
                df = excel_data[first_sheet]
                used_sheet = first_sheet

        # Auto-detect columns if not specified
        if not x_column:
            # Look for text/date columns for X-axis
            text_cols = df.select_dtypes(include=['object', 'datetime64']).columns
            x_column = text_cols[0] if len(text_cols) > 0 else df.columns[0]

        if not y_columns:
            # Look for numeric columns for Y-axis
            numeric_cols = df.select_dtypes(include=['number']).columns
            # Remove x_column if it's numeric
            y_columns = [col for col in numeric_cols if col != x_column][:3]  # Limit to 3 series

        # Auto-detect chart type if needed
        if chart_type == "auto":
            if len(df) > 50:
                chart_type = "line"  # Line chart for time series
            elif df[x_column].dtype == 'object' and len(df[x_column].unique()) < 20:
                chart_type = "bar"  # Bar chart for categories
            elif len(y_columns) == 1:
                chart_type = "scatter"  # Scatter for single numeric relationship
            else:
                chart_type = "line"  # Default to line

        # Prepare data
        chart_data = {
            "source_data": {
                "x_column": x_column,
                "y_columns": y_columns,
                "chart_type": chart_type,
                "data_points": len(df)
            },
            "processed_data": {}
        }

        # Clean and prepare the data
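        # dropna() keeps only rows with a value in every selected column, so all
        # series stay aligned with the shared label axis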
        clean_df = df[[x_column] + y_columns].dropna()

        # Generate Chart.js configuration
        if output_format in ["chartjs", "all"]:
            chartjs_config = {
                "type": chart_type,
                "data": {
                    "labels": clean_df[x_column].astype(str).tolist(),
                    "datasets": []
                },
                "options": {
                    "responsive": True,
                    "plugins": {
                        "title": {
                            "display": True,
                            "text": f"Chart from {used_sheet}"
                        }
                    },
                    "scales": {
                        "x": {"title": {"display": True, "text": x_column}},
                        "y": {"title": {"display": True, "text": "Values"}}
                    }
                }
            }

            colors = ["rgb(255, 99, 132)", "rgb(54, 162, 235)", "rgb(255, 205, 86)", "rgb(75, 192, 192)"]
            for i, y_col in enumerate(y_columns):
                dataset = {
                    "label": y_col,
                    "data": clean_df[y_col].tolist(),
                    "borderColor": colors[i % len(colors)],
                    "backgroundColor": colors[i % len(colors)].replace("rgb", "rgba").replace(")", ", 0.2)")
                }
                chartjs_config["data"]["datasets"].append(dataset)

            chart_data["processed_data"]["chartjs"] = chartjs_config

        # Generate Plotly configuration
        if output_format in ["plotly", "all"]:
            plotly_config = {
                "data": [],
                "layout": {
                    "title": f"Chart from {used_sheet}",
                    "xaxis": {"title": x_column},
                    "yaxis": {"title": "Values"}
                }
            }

            for y_col in y_columns:
                trace = {
                    "x": clean_df[x_column].tolist(),
                    "y": clean_df[y_col].tolist(),
                    "name": y_col,
                    # Plotly has no "line" trace type; line charts are scatter traces in a line mode
                    "type": "scatter" if chart_type in ("scatter", "line") else chart_type
                }
                if chart_type == "line":
                    trace["mode"] = "lines+markers"
                plotly_config["data"].append(trace)

            chart_data["processed_data"]["plotly"] = plotly_config

        # Generate Matplotlib code template
        if output_format in ["matplotlib", "all"]:
            matplotlib_code = f"""
import matplotlib.pyplot as plt

# Data preparation
x_data = {clean_df[x_column].tolist()}
"""
            for y_col in y_columns:
                matplotlib_code += f"{y_col.replace(' ', '_')}_data = {clean_df[y_col].tolist()}\n"

            matplotlib_code += """
# Create the plot
plt.figure(figsize=(10, 6))
"""

            if chart_type == "bar":
                for y_col in y_columns:
                    matplotlib_code += f"plt.bar(x_data, {y_col.replace(' ', '_')}_data, label='{y_col}', alpha=0.7)\n"
            elif chart_type == "line":
                for y_col in y_columns:
                    matplotlib_code += f"plt.plot(x_data, {y_col.replace(' ', '_')}_data, label='{y_col}', marker='o')\n"
            elif chart_type == "scatter":
                for y_col in y_columns:
                    matplotlib_code += f"plt.scatter(x_data, {y_col.replace(' ', '_')}_data, label='{y_col}', alpha=0.7)\n"

            matplotlib_code += f"""
plt.xlabel('{x_column}')
plt.ylabel('Values')
plt.title('Chart from {used_sheet}')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
"""

            chart_data["processed_data"]["matplotlib"] = matplotlib_code

        return {
            "chart_configuration": chart_data,
            "data_summary": {
                "original_rows": len(df),
                "clean_rows": len(clean_df),
                "x_column": x_column,
                "y_columns": y_columns,
                "chart_type": chart_type,
                "sheet_used": used_sheet
            },
            "generation_time": time.time() - start_time,
            "file_info": validation
        }