import re
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from typing import Dict
from scipy.stats import linregress
from scipy.stats import spearmanr, theilslopes
from config.constants import NEGATIVE_MARKERS, MODEL_NAME_REPLACEMENTS


def create_side_by_side_score_charts(run_data: Dict, judge_model: str, samples_data: Dict):
    """
    Produces two figures:
      • Figure #1 with three subplots side-by-side:
        (1) Raw Scores bar chart (+ 95% CI)
        (2) Calibrated Scores bar chart (+ 95% CI)
        (3) Heatmap of all per-criterion scores across each model (10 - score for negative markers).
      • Figure #2: A 4×4 grid of mini scatter plots, one per model (up to 16),
        showing item length (chars) vs. aggregated_score_raw. A linear regression
        line and correlation stats are included for each model if enough points exist.
    """
    # -------------------------------------------------------------------
    # 1) The main (raw / calibrated / heatmap) figure
    # -------------------------------------------------------------------
    raw_stats = run_data["raw_model_stats"]
    cal_stats = run_data["calibrated_model_stats"]

    if judge_model in MODEL_NAME_REPLACEMENTS:
        judge_model = MODEL_NAME_REPLACEMENTS[judge_model]
    
    # All model names in raw_stats
    model_names = list(raw_stats.keys())

    # Convert to arrays for sorting
    raw_means = [raw_stats[m]["mean"] for m in model_names]
    cal_means = [cal_stats[m]["mean"] for m in model_names]
    raw_cis   = [raw_stats[m]["ci95"] for m in model_names]
    cal_cis   = [cal_stats[m]["ci95"] for m in model_names]
    
    # Sort by calibrated score descending
    sorted_indices = np.argsort(cal_means)[::-1]
    model_names = [model_names[i] for i in sorted_indices]
    raw_means   = [raw_means[i]   for i in sorted_indices]
    cal_means   = [cal_means[i]   for i in sorted_indices]
    raw_cis     = [raw_cis[i]     for i in sorted_indices]
    cal_cis     = [cal_cis[i]     for i in sorted_indices]
    
    # 1.A) Build data for the heatmap: per-criterion scores (with negative flips)
    #     We gather them from run_data "parsed_scores"
    #     Then we convert them to 0..10 bins and store percentage distribution.
    all_scores_by_model = {m: [] for m in model_names}
    results = run_data.get("results", {})
    
    for model_name in model_names:
        iter_dict = results.get(model_name, {})
        for iteration_key, item_dict in iter_dict.items():
            if not isinstance(item_dict, dict):
                continue
            for item_id, item_info in item_dict.items():
                if not isinstance(item_info, dict):
                    continue
                parsed_scores = item_info.get("parsed_scores", {})
                if not isinstance(parsed_scores, dict):
                    continue
                for crit_name, val in parsed_scores.items():
                    if isinstance(val, (int, float)) and 0 <= val <= 10:
                        crit_lower = crit_name.strip().lower()
                        # Flip negative
                        if any(nm in crit_lower for nm in NEGATIVE_MARKERS):
                            final_val = 10 - val
                        else:
                            final_val = val
                        all_scores_by_model[model_name].append(final_val)
    
    # Convert to a 2D array for the heatmap (rows = models, columns = bins)
    bins = np.linspace(0, 10, 11)
    heatmap_rows = []
    for m in model_names:
        scores = all_scores_by_model[m]
        if scores:
            counts, _ = np.histogram(scores, bins=bins)
            pct = (counts / len(scores)) * 100.0
        else:
            pct = np.zeros(len(bins)-1, dtype=float)
        heatmap_rows.append(pct)
    heatmap_data = np.array(heatmap_rows, dtype=float)
    
    # 1.B) Plot the main figure with 3 subplots
    fig1, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 7))  # Increased height

    # Increase the base font size for all text elements
    plt.rcParams.update({'font.size': 14})
    
    # (A) Raw bar chart    
    y_pos = np.arange(len(model_names))
    ax1.barh(y_pos, raw_means, color='skyblue', alpha=0.7)
    for i, (mean_val, ci95) in enumerate(zip(raw_means, raw_cis)):
        ax1.errorbar(mean_val, i, xerr=ci95, color='red', capsize=5)
    ax1.set_yticks(y_pos)
    ax1.set_yticklabels(model_names, fontsize=12)
    ax1.invert_yaxis()
    ax1.set_xlabel("Raw Scores", fontsize=14)
    ax1.set_title("Raw Model Scores (95% CI)", fontsize=16)
    ax1.grid(True, alpha=0.3)
    ax1.tick_params(axis='y', pad=10)

    # (B) Calibrated bar chart
    ax2.barh(y_pos, cal_means, color='lightgreen', alpha=0.7)
    for i, (mean_val, ci95) in enumerate(zip(cal_means, cal_cis)):
        ax2.errorbar(mean_val, i, xerr=ci95, color='red', capsize=5)
    ax2.set_yticks(y_pos)
    ax2.set_yticklabels(model_names, fontsize=12)
    ax2.invert_yaxis()
    ax2.set_xlabel("Calibrated Scores", fontsize=14)
    ax2.set_title("Calibrated Model Scores (95% CI)", fontsize=16)
    ax2.grid(True, alpha=0.3)
    ax2.tick_params(axis='y', pad=10)

    # (C) Heatmap
    bin_edges = bins[:-1]
    ax3.set_xticks(np.arange(len(bin_edges)))
    ax3.set_xticklabels([str(int(be)) for be in bin_edges])
    im = ax3.imshow(heatmap_data, aspect='auto', origin='upper', cmap='plasma')
    #ax3.set_xticks(np.arange(len(bin_centers)))
    #ax3.set_xticklabels([f"{bc:.0f}" for bc in bin_centers], fontsize=12)
    ax3.set_yticks(np.arange(len(model_names)))
    ax3.set_yticklabels(model_names, fontsize=12)
    ax3.set_xlabel("Score Bin (0–10)", fontsize=14)
    ax3.set_title("Per-Criterion Score Distribution (Heatmap)", fontsize=16)
    ax3.tick_params(axis='y', pad=10)
    
    cbar = plt.colorbar(im, ax=ax3)
    cbar.set_label("% of Criteria in Bin", fontsize=14)
    cbar.ax.tick_params(labelsize=12)
    cbar.formatter.set_powerlimits((0, 0))
    cbar.formatter = ticker.PercentFormatter(decimals=1)
    cbar.update_ticks()

    # Overall title
    
    sanitized_judge = re.sub(r"[^\w\-]", "-", judge_model.replace("/", "__"))
    fig1.suptitle(f"Judgemark: Raw/Calibrated/Heatmap - Judge: {judge_model}", fontsize=20)
    
    # Adjust layout with more space
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    
    plt.savefig(f"results/charts/judgemark_3chart_{sanitized_judge}.png", 
                bbox_inches='tight', 
                dpi=150,
                pad_inches=0.5)
    plt.close(fig1)

    # -------------------------------------------------------------------
    # 2) Second Figure: A 4×4 grid of scatter plots (per-model), 
    #    where X = item length, Y = aggregated_score_raw.
    #    We'll gather text straight from samples_data, compute len(...).
    # -------------------------------------------------------------------
    # If you want exactly 16 models, skip any extra or exclude some.
    # Example: exclude "gemini-1.5-pro-001" 
    excluded_models = {"gemini-1.5-pro-001"}
    model_list_for_scatter = [m for m in model_names if m not in excluded_models]
    
    # If it's still longer than 16, slice it to 16
    if len(model_list_for_scatter) > 16:
        model_list_for_scatter = model_list_for_scatter[:16]

    # Build figure and subplots: 4x4
    fig2, axes2 = plt.subplots(4, 4, figsize=(20, 20))  # each cell is a scatter
    fig2.suptitle(f"Judgemark: Per-Model Length vs. Score - Judge: {judge_model}", fontsize=18)
    
    # We might have fewer than 16 models. We'll track them by row & col.
    for idx, mname in enumerate(model_list_for_scatter):
        row = idx // 4
        col = idx % 4
        ax = axes2[row, col]
        
        # Collect all (length, raw_score) for this model
        length_vals = []
        score_vals  = []
        
        # For each (iteration_key, item_id), find the text in samples_data,
        # find aggregated_score_raw in run_data, then store pairs
        model_res = run_data["results"].get(mname, {})
        for it_key, it_dict in model_res.items():
            if not isinstance(it_dict, dict):
                continue
            for item_id, item_info in it_dict.items():
                if not isinstance(item_info, dict):
                    continue
                raw_score = item_info.get("aggregated_score_raw", None)
                if not isinstance(raw_score, (int, float)):
                    continue

                # Look up the text in samples_data:
                text = (samples_data
                        .get(mname, {})
                        .get("samples", {})
                        .get(it_key, {})
                        .get(item_id, "")) 
                text_len = len(text)

                # If it's non-empty text
                if text_len > 0:
                    length_vals.append(text_len)
                    score_vals.append(raw_score)
        
        ax.set_title(mname, fontsize=12)
        ax.set_xlabel("Length")
        ax.set_ylabel("Raw Score")
        
        if len(length_vals) > 1:
            ax.scatter(length_vals, score_vals, alpha=0.4, color='blue')
            
            # -- Rank-based correlation (Spearman) --
            rho, p_value = spearmanr(length_vals, score_vals)
            
            # -- Robust linear fit (Theil-Sen) --
            # returns slope, intercept, lower_slope, upper_slope
            slope, intercept, lo_slope, hi_slope = theilslopes(score_vals, length_vals, alpha=0.95)
            
            # Build the line
            xline = np.linspace(min(length_vals), max(length_vals), 200)
            yline = slope * xline + intercept
            ax.plot(xline, yline, color='red', linewidth=2,
                    label=f"Spearman ρ={rho:.2f}, p={p_value:.2g}")
            ax.legend(loc="best")
        else:
            ax.text(0.5, 0.5, "Not enough data",
                    ha='center', va='center', color='red', transform=ax.transAxes)
    
    # If we have fewer than 16 models, blank out remaining subplots
    total_subplots = 16
    for i in range(len(model_list_for_scatter), total_subplots):
        row = i // 4
        col = i % 4
        axes2[row, col].axis("off")
    
    plt.tight_layout()
    plt.savefig(f"results/charts/judgemark_scattergrid_{sanitized_judge}.png", bbox_inches='tight', dpi=200)
    plt.close(fig2)