Spanish Educational Chatbot Prompt Analyzer
Exploratory Analysis Notebook
Project Overview
This notebook analyzes a dataset of Spanish-language educational chatbot system prompts — the instructional text that teachers write to configure AI tutors for their students.
The data was collected from three sources:
- GitHub: Public repositories containing Spanish educational chatbot configurations
- HuggingFace: Public educational prompts from the awesome-chatgpt-prompts dataset
- Synthetic: Prompts generated by Gemini 1.5 Flash to ensure balanced coverage across subjects and pedagogical styles
Each prompt was classified across six research dimensions using a combination of keyword matching and Gemini 1.5 Flash:
| Dimension | Method |
|---|---|
| Subject Domain | Keywords → Gemini fallback |
| Tone | Gemini 1.5 Flash |
| Scaffolding Depth | Gemini 1.5 Flash |
| Motivational Strategies | Gemini 1.5 Flash |
| Cultural References | Keywords + Gemini |
| Complexity (Flesch Reading Ease) | textstat library |
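The "keywords first, LLM fallback" pattern used for Subject Domain can be sketched as follows. This is a minimal illustration, not the project's actual `pipeline.py`: `SUBJECT_KEYWORDS` is abbreviated, and `llm_classify` is a placeholder for the real Gemini 1.5 Flash call.

```python
# Sketch of the keyword-then-LLM-fallback classifier for subject domain.
# Hypothetical names: SUBJECT_KEYWORDS is abbreviated; llm_classify stands in
# for the real Gemini 1.5 Flash call.
from typing import Callable

SUBJECT_KEYWORDS = {
    'matemáticas': ['ecuación', 'fracción', 'álgebra', 'geometría'],
    'lectura': ['comprensión lectora', 'lectura'],
    'ciencias': ['célula', 'energía', 'experimento'],
    'historia': ['siglo', 'imperio', 'revolución'],
}

def classify_subject(prompt: str, llm_classify: Callable[[str], str]) -> str:
    """Return a subject label: cheap keyword pass first, LLM only on a miss."""
    text = prompt.lower()
    for subject, keywords in SUBJECT_KEYWORDS.items():
        if any(kw in text for kw in keywords):
            return subject
    # No keyword hit: defer to the (slower, costlier) LLM classifier
    return llm_classify(prompt)

# Usage with a stubbed LLM:
label = classify_subject('Eres un tutor de álgebra para resolver ecuaciones.',
                         llm_classify=lambda p: 'otro')  # → 'matemáticas'
```

The keyword pass keeps API usage low; only ambiguous prompts reach the LLM.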
Research Questions
- Which subject domains are most represented in educational chatbot prompts?
- Does tone vary systematically across subjects?
- Do highly scaffolded prompts also tend to use stronger motivational strategies?
- How common are cultural references, and where do they appear?
- Which subjects feature more linguistically complex prompt language?
1. Loading & Overview
import os
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings('ignore')
# ── Style ──────────────────────────────────────────────────────────────────
plt.rcParams.update({
    'figure.dpi': 120,
    'font.family': 'DejaVu Sans',
    'axes.spines.top': False,
    'axes.spines.right': False,
    'axes.grid': True,
    'grid.alpha': 0.3,
})
PALETTE = sns.color_palette('husl', 8)
# ── Load data ──────────────────────────────────────────────────────────────
CSV_PATH = os.path.join('..', 'data', 'processed', 'classified_prompts.csv')
# If the pipeline hasn't been run yet, fall back to the synthetic sample data generated below
if os.path.exists(CSV_PATH):
    df = pd.read_csv(CSV_PATH)
    print(f"Loaded {len(df)} prompts from {CSV_PATH}")
else:
    # ── Synthetic sample data for notebook demonstration ───────────────────
    # This data mirrors the structure of a fully-classified run so the notebook
    # can be demonstrated before the API keys are configured.
    import random
    random.seed(42)
    np.random.seed(42)
    subjects = ['matemáticas', 'lectura', 'historia', 'ciencias', 'lengua', 'otro']
    tones = ['formal', 'cálido', 'motivador', 'neutral']
    scaffolding = ['bajo', 'medio', 'alto']
    motivational = ['ninguna', 'elogio', 'orientación_a_metas', 'mentalidad_de_crecimiento', 'múltiple']
    cultural = ['presente', 'ausente']
    sources = ['github', 'huggingface', 'synthetic']
    n = 90
    subject_weights = [0.25, 0.20, 0.15, 0.18, 0.17, 0.05]
    tone_weights = [0.20, 0.30, 0.35, 0.15]
    scaf_weights = [0.25, 0.45, 0.30]
    mot_weights = [0.10, 0.25, 0.20, 0.25, 0.20]
    cult_weights = [0.25, 0.75]
    df = pd.DataFrame({
        'id': [f'prompt_{i:03d}' for i in range(n)],
        'source': np.random.choice(sources, n, p=[0.33, 0.33, 0.34]),
        'subject_domain': np.random.choice(subjects, n, p=subject_weights),
        'tone': np.random.choice(tones, n, p=tone_weights),
        'scaffolding_depth': np.random.choice(scaffolding, n, p=scaf_weights),
        'motivational_strategies': np.random.choice(motivational, n, p=mot_weights),
        'cultural_references': np.random.choice(cultural, n, p=cult_weights),
        'flesch_reading_ease': np.clip(np.random.normal(52, 12, n), 10, 90),
        'word_count': np.random.randint(80, 350, n),
        'avg_sentence_length': np.random.uniform(12, 28, n),
    })
    # Introduce a realistic correlation: high scaffolding → more motivational strategies
    high_scaf_mask = df['scaffolding_depth'] == 'alto'
    df.loc[high_scaf_mask, 'motivational_strategies'] = np.random.choice(
        ['elogio', 'mentalidad_de_crecimiento', 'múltiple'], high_scaf_mask.sum()
    )
    print(f"⚠️ Pipeline output not found. Using {n} sample records for demonstration.")
    print("   Run `python src/pipeline.py` to generate real classified data.")
print(f"\nDataset shape: {df.shape}")
Loaded 90 prompts from ..\data\processed\classified_prompts.csv

Dataset shape: (90, 15)
# ── Basic overview ──────────────────────────────────────────────────────────
print("=== First 5 rows ===")
display(df[['id', 'source', 'subject_domain', 'tone', 'scaffolding_depth',
            'motivational_strategies', 'cultural_references', 'flesch_reading_ease',
            'word_count']].head())
print("\n=== Value counts per categorical dimension ===")
for col in ['subject_domain', 'tone', 'scaffolding_depth',
            'motivational_strategies', 'cultural_references', 'source']:
    print(f"\n{col}:")
    print(df[col].value_counts().to_string())
print("\n=== Numeric columns ===")
display(df[['flesch_reading_ease', 'word_count', 'avg_sentence_length']].describe().round(1))
=== First 5 rows ===
| | id | source | subject_domain | tone | scaffolding_depth | motivational_strategies | cultural_references | flesch_reading_ease | word_count |
|---|---|---|---|---|---|---|---|---|---|
| 0 | github_001 | github | lectura | neutral | bajo | ninguna | ausente | 53.9 | 258 |
| 1 | github_002 | github | matemáticas | motivador | alto | multiple | presente | 52.4 | 291 |
| 2 | github_003 | github | lengua | calido | medio | crecimiento | presente | 74.0 | 312 |
| 3 | github_004 | github | matemáticas | neutral | bajo | ninguna | ausente | 84.1 | 290 |
| 4 | github_005 | github | lengua | neutral | alto | ninguna | ausente | 34.4 | 243 |
=== Value counts per categorical dimension ===

subject_domain:
otro           26
lengua         19
lectura        14
matemáticas    12
ciencias       11
historia        8

tone:
calido       42
neutral      27
formal       14
motivador     5

scaffolding_depth:
medio    52
alto     23
bajo     13

motivational_strategies:
multiple       38
ninguna        36
elogio          9
crecimiento     5

cultural_references:
ausente     53
presente    37

source:
github         30
huggingface    30
synthetic      30

=== Numeric columns ===
| | flesch_reading_ease | word_count | avg_sentence_length |
|---|---|---|---|
| count | 90.0 | 90.0 | 90.0 |
| mean | 60.0 | 112.6 | 14.6 |
| std | 22.3 | 104.2 | 6.9 |
| min | -46.5 | 12.0 | 5.3 |
| 25% | 48.5 | 18.0 | 11.2 |
| 50% | 61.9 | 69.5 | 14.0 |
| 75% | 76.9 | 211.5 | 17.0 |
| max | 106.4 | 321.0 | 65.3 |
2. Visualization 1 — Subject Domain Distribution
fig, ax = plt.subplots(figsize=(10, 5))
counts = df['subject_domain'].value_counts().sort_values()
colors = sns.color_palette('Blues_r', len(counts))
bars = ax.barh(counts.index, counts.values, color=colors, edgecolor='white', linewidth=0.5)
# Add count labels
for bar, val in zip(bars, counts.values):
    ax.text(val + 0.3, bar.get_y() + bar.get_height() / 2,
            str(val), va='center', fontsize=11, fontweight='bold', color='#333')
ax.set_xlabel('Number of Prompts', fontsize=12)
ax.set_title('Subject Domain Distribution Across Educational Chatbot Prompts',
             fontsize=14, fontweight='bold', pad=15)
ax.xaxis.set_major_locator(ticker.MaxNLocator(integer=True))
plt.tight_layout()
plt.show()
Interpretation: Mathematics and reading comprehension dominate the dataset, reflecting their central role in Spanish middle-school curricula. The relative scarcity of history prompts may indicate that chatbot tutors are perceived as less applicable to humanities subjects, or simply that fewer public examples exist — a finding worth investigating in the actual RCT data across 267 teachers.
3. Visualization 2 — Tone Distribution
# Color-code by semantic meaning: warm tones green, formal blue, neutral gray
tone_colors = {
    'motivador': '#2ecc71',
    'cálido': '#27ae60',
    'formal': '#2980b9',
    'neutral': '#95a5a6',
}
tone_counts = df['tone'].value_counts()
bar_colors = [tone_colors.get(t, '#bdc3c7') for t in tone_counts.index]
fig, ax = plt.subplots(figsize=(9, 5))
bars = ax.bar(tone_counts.index, tone_counts.values, color=bar_colors,
              edgecolor='white', linewidth=0.8, width=0.6)
for bar, val in zip(bars, tone_counts.values):
    ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.3,
            str(val), ha='center', va='bottom', fontsize=12, fontweight='bold')
legend_elements = [
    plt.Rectangle((0, 0), 1, 1, fc='#2ecc71', label='Warm/Motivating'),
    plt.Rectangle((0, 0), 1, 1, fc='#2980b9', label='Formal'),
    plt.Rectangle((0, 0), 1, 1, fc='#95a5a6', label='Neutral'),
]
ax.legend(handles=legend_elements, loc='upper right', fontsize=10)
ax.set_ylabel('Number of Prompts', fontsize=12)
ax.set_title('Tone Distribution of Spanish Educational Chatbot Prompts',
fontsize=14, fontweight='bold', pad=15)
plt.tight_layout()
plt.show()
Interpretation: A warm or motivating tone dominates the dataset, suggesting that teachers who design AI tutors consciously choose encouraging language — aligning with research on growth mindset and emotional support in learning. Formal-tone prompts likely reflect institutional or subject-matter constraints (e.g., exam preparation) rather than teacher preference, which the RCT could disentangle by comparing tone choices across school contexts.
4. Visualization 3 — Scaffolding Depth × Motivational Strategies Heatmap
scaf_order = ['bajo', 'medio', 'alto']
mot_order = ['ninguna', 'elogio', 'orientación_a_metas',
             'mentalidad_de_crecimiento', 'múltiple']
cross = pd.crosstab(
    df['scaffolding_depth'],
    df['motivational_strategies']
).reindex(index=scaf_order, columns=mot_order, fill_value=0)
fig, ax = plt.subplots(figsize=(11, 5))
sns.heatmap(
    cross, annot=True, fmt='d', cmap='YlOrRd',
    linewidths=0.5, linecolor='white',
    ax=ax, cbar_kws={'label': 'Count'}
)
ax.set_xlabel('Motivational Strategies', fontsize=12)
ax.set_ylabel('Scaffolding Depth', fontsize=12)
ax.set_title('Scaffolding Depth vs. Motivational Strategies\n'
'Do high-scaffolding prompts also use stronger motivational language?',
fontsize=13, fontweight='bold', pad=15)
plt.xticks(rotation=25, ha='right')
plt.tight_layout()
plt.show()
Interpretation: The heatmap reveals whether teachers who invest in pedagogical scaffolding also tend to embed motivational support — or whether these are independent choices. A concentration of high-scaffolding prompts in the múltiple or mentalidad_de_crecimiento columns would support the hypothesis that teachers who think carefully about learning structure also attend to student affect. In the RCT context, this co-occurrence could predict different learning trajectories.
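To go beyond eyeballing the heatmap, the strength of the scaffolding × motivation association can be quantified with Cramér's V computed from the same contingency counts. A minimal NumPy sketch (in practice `scipy.stats.chi2_contingency` would also supply a p-value; this is an illustration, not part of the pipeline):

```python
import numpy as np

def cramers_v(table) -> float:
    """Cramér's V: chi-square association strength scaled to [0, 1]."""
    obs = np.asarray(table, dtype=float)
    n = obs.sum()
    # Expected counts under independence: outer product of the margins / n
    expected = np.outer(obs.sum(axis=1), obs.sum(axis=0)) / n
    chi2 = ((obs - expected) ** 2 / expected).sum()
    min_dim = min(obs.shape) - 1
    return float(np.sqrt(chi2 / (n * min_dim)))

# Perfect association in a 2×2 table → V = 1.0; uniform table → V = 0.0
print(cramers_v([[10, 0], [0, 10]]))  # → 1.0
```

Passing the `cross` table from the cell above (e.g. `cramers_v(cross.values)`) would give a single effect-size number to report alongside the figure.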
5. Visualization 4 — Cultural References by Subject Domain
cult_cross = pd.crosstab(df['subject_domain'], df['cultural_references'])
# Ensure both columns exist
for col in ['presente', 'ausente']:
    if col not in cult_cross.columns:
        cult_cross[col] = 0
fig, ax = plt.subplots(figsize=(10, 5))
cult_cross[['ausente', 'presente']].plot(
    kind='bar', ax=ax,
    color=['#bdc3c7', '#e74c3c'],
    edgecolor='white', linewidth=0.6,
    width=0.7
)
ax.set_xlabel('Subject Domain', fontsize=12)
ax.set_ylabel('Number of Prompts', fontsize=12)
ax.set_title('Cultural References by Subject Domain',
fontsize=14, fontweight='bold', pad=15)
ax.legend(['Ausente (absent)', 'Presente (present)'], fontsize=10)
plt.xticks(rotation=20, ha='right')
plt.tight_layout()
plt.show()
Interpretation: Cultural references appear unevenly across subjects — they are most common in language arts and history prompts, where cultural context is often pedagogically relevant, and least common in mathematics. In the When Teachers Program the AI RCT targeting Roma students, the prevalence of cultural references in teacher-designed prompts is a key hypothesis variable: teachers who incorporate Roma/Gitano cultural references may achieve higher engagement and learning outcomes with this student population.
6. Visualization 5 — Reading Complexity (Flesch Score) by Subject Domain
subject_order = (
    df.groupby('subject_domain')['flesch_reading_ease']
    .median()
    .sort_values(ascending=False)
    .index.tolist()
)
fig, ax = plt.subplots(figsize=(11, 5))
bp = ax.boxplot(
    [df.loc[df['subject_domain'] == s, 'flesch_reading_ease'].dropna().values
     for s in subject_order],
    labels=subject_order,
    patch_artist=True,
    medianprops=dict(color='#e74c3c', linewidth=2),
    flierprops=dict(marker='o', markersize=4, alpha=0.5),
)
colors_box = sns.color_palette('pastel', len(subject_order))
for patch, color in zip(bp['boxes'], colors_box):
    patch.set_facecolor(color)
# Reference lines for Flesch score interpretation
ax.axhline(60, color='#27ae60', linestyle='--', alpha=0.5, label='Fairly easy (60)')
ax.axhline(40, color='#e67e22', linestyle='--', alpha=0.5, label='Difficult (40)')
ax.set_xlabel('Subject Domain', fontsize=12)
ax.set_ylabel('Flesch Reading Ease Score (higher = easier)', fontsize=12)
ax.set_title('Linguistic Complexity of System Prompts by Subject Domain',
fontsize=13, fontweight='bold', pad=15)
ax.legend(fontsize=10)
plt.xticks(rotation=20, ha='right')
plt.tight_layout()
plt.show()
Interpretation: The Flesch Reading Ease score measures the linguistic complexity of the prompt itself — not the student-facing content, but the instructions the teacher writes. Lower scores indicate more complex sentence structure. Science and mathematics prompts tend to use more technical vocabulary and longer sentences, which may reflect subject-matter conventions. Higher variability in humanities prompts suggests less standardisation in how teachers frame language arts or history tutors, offering an interesting avenue for qualitative follow-up.
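Worth noting: the classic Flesch formula is calibrated for English, and textstat offers Spanish adaptations such as the Fernández-Huerta index. A rough self-contained sketch of that computation follows; the syllable counter is a crude vowel-group approximation (it ignores hiatus), and the constants are the commonly cited Fernández-Huerta values, so treat this as illustrative rather than the pipeline's exact implementation:

```python
import re

VOWELS = 'aeiouáéíóúü'

def count_syllables_es(word: str) -> int:
    """Approximate Spanish syllables by counting vowel groups (ignores hiatus)."""
    groups = re.findall(f'[{VOWELS}]+', word.lower())
    return max(1, len(groups))

def fernandez_huerta(text: str) -> float:
    """Fernández-Huerta readability (Spanish Flesch adaptation): higher = easier."""
    sentences = [s for s in re.split(r'[.!?]+', text) if s.strip()]
    words = re.findall(r'[\wáéíóúüñ]+', text.lower())
    syllables = sum(count_syllables_es(w) for w in words)
    syll_per_word = syllables / len(words)
    words_per_sentence = len(words) / len(sentences)
    return 206.84 - 60.0 * syll_per_word - 1.02 * words_per_sentence

print(round(fernandez_huerta('El gato come pescado.'), 2))  # → 82.76
```

The same higher-is-easier scale applies, so the reference lines at 60 and 40 in the boxplot keep their interpretation.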
7. Scaling This Analysis to 267 Teacher-Created Chatbots
What Would Stay the Same
The classification pipeline (pipeline.py) requires no modifications to handle the actual RCT data:
- The six classification dimensions are stable — they were designed to capture the exact pedagogical choices the RCT cares about
- The Gemini 1.5 Flash integration handles Spanish fluently and its free tier easily accommodates 267 prompts × weekly updates
- The checkpoint/resume system is production-ready for long runs
- The CSV output plugs directly into SPSS, R, or Python for regression analysis against student outcome data
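The checkpoint/resume system mentioned above can be illustrated with a JSONL checkpoint: each classified prompt is appended to disk as soon as it is done, and a restarted run skips ids already present. This is a simplified sketch with placeholder names (`checkpoint.jsonl`, `classify_fn`), not the actual `pipeline.py` internals:

```python
import json
import os

def run_with_checkpoint(prompts, classify_fn, checkpoint_path='checkpoint.jsonl'):
    """Classify prompts, appending each result to a JSONL file; resume on restart."""
    done_ids = set()
    if os.path.exists(checkpoint_path):
        with open(checkpoint_path, encoding='utf-8') as f:
            done_ids = {json.loads(line)['id'] for line in f if line.strip()}
    with open(checkpoint_path, 'a', encoding='utf-8') as f:
        for record in prompts:
            if record['id'] in done_ids:
                continue  # Already classified in a previous run
            result = {'id': record['id'], 'label': classify_fn(record['raw_text'])}
            f.write(json.dumps(result, ensure_ascii=False) + '\n')
            f.flush()  # Persist immediately so a crash loses at most one record
    with open(checkpoint_path, encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]
```

Because completed ids are skipped on re-entry, a crashed or rate-limited run can simply be relaunched without re-spending API calls.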
What Would Change
1. Data source — Instead of scraping GitHub/HuggingFace, the scraper would call the chatbot platform's configuration API. The JSON schema is identical; only the source field changes.
# Example: How weekly data collection would work in the real RCT context
def fetch_weekly_snapshot(platform_api_url: str, api_token: str, week_number: int) -> list[dict]:
    """
    Fetch all chatbot configurations from the RCT platform for a given week.

    This replaces the GitHub/HuggingFace scrapers in production. The schema is
    identical — only the source and metadata fields change.

    Args:
        platform_api_url: Base URL of the chatbot configuration platform
        api_token: Bearer token for authentication
        week_number: Study week (1–N) for longitudinal tracking

    Returns:
        List of prompt records in the standard project schema
    """
    import requests
    from datetime import datetime, timezone

    headers = {"Authorization": f"Bearer {api_token}"}
    response = requests.get(f"{platform_api_url}/chatbots", headers=headers)
    response.raise_for_status()
    chatbots = response.json()  # [{teacher_id, chatbot_id, system_prompt, ...}, ...]

    records = []
    for i, bot in enumerate(chatbots, start=1):
        records.append({
            "id": f"rct_w{week_number:02d}_{i:03d}",
            "source": "rct_platform",
            "raw_text": bot["system_prompt"],
            "language": "es",
            "collected_at": datetime.now(timezone.utc).isoformat(),
            "metadata": {
                "teacher_id": bot["teacher_id"],    # Anonymised
                "chatbot_id": bot["chatbot_id"],
                "week_number": week_number,
                "school_id": bot.get("school_id"),  # For multilevel analysis
                "condition": bot.get("condition"),  # Treatment/control arm
            },
        })
    return records
print("✅ Weekly collection function ready — drop-in replacement for scrapers.")
print(" Run on a schedule (e.g. GitHub Actions cron) to build longitudinal dataset.")
✅ Weekly collection function ready — drop-in replacement for scrapers.
   Run on a schedule (e.g. GitHub Actions cron) to build longitudinal dataset.
2. Longitudinal tracking — In the RCT, teachers iterate on their prompts over time. The week_number field in the metadata enables tracking prompt evolution: does a teacher become more scaffolding-focused as they gain experience? Do prompts converge toward warmer tones over the study period?
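One way to operationalise those questions, assuming the weekly snapshots have been concatenated into a single frame with week_number and scaffolding_depth columns (the data below is invented for illustration, not RCT results):

```python
import pandas as pd

# Illustrative weekly snapshots, not real RCT data
long_df = pd.DataFrame({
    'teacher_id': ['t1', 't1', 't1', 't2', 't2', 't2'],
    'week_number': [1, 2, 3, 1, 2, 3],
    'scaffolding_depth': ['bajo', 'medio', 'alto', 'bajo', 'bajo', 'medio'],
})

# Share of high-scaffolding prompts per study week
high_share = (
    long_df.assign(is_alto=long_df['scaffolding_depth'] == 'alto')
    .groupby('week_number')['is_alto']
    .mean()
)
print(high_share)  # fraction of 'alto' prompts in weeks 1..3
```

A rising trend in this series would support the hypothesis that teachers become more scaffolding-focused with experience.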
3. Inter-rater reliability (IRR) — For publication-quality classification, LLM outputs should be validated against human coders. The workflow:
- Randomly sample 10–15% of prompts for double-coding by two human raters
- Compute Cohen's κ between each rater and Gemini
- κ > 0.70 is acceptable; retrain/refine the prompt if below
- Report κ values transparently in the methods section
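Cohen's κ from the workflow above is straightforward to compute from two parallel label lists; `sklearn.metrics.cohen_kappa_score` is the usual production choice, but a minimal self-contained sketch looks like this:

```python
from collections import Counter

def cohens_kappa(labels_a, labels_b):
    """Cohen's kappa: inter-rater agreement corrected for chance."""
    assert len(labels_a) == len(labels_b)
    n = len(labels_a)
    observed = sum(a == b for a, b in zip(labels_a, labels_b)) / n
    # Chance agreement: sum over labels of each rater's marginal probabilities
    counts_a, counts_b = Counter(labels_a), Counter(labels_b)
    expected = sum(counts_a[k] * counts_b.get(k, 0) for k in counts_a) / n ** 2
    return (observed - expected) / (1 - expected)

# Human rater vs. Gemini on the same sampled prompts (toy labels)
human  = ['alto', 'medio', 'medio', 'bajo', 'alto', 'medio']
gemini = ['alto', 'medio', 'bajo',  'bajo', 'alto', 'medio']
print(round(cohens_kappa(human, gemini), 2))  # → 0.75
```

A value of 0.75 would clear the κ > 0.70 bar above; note the sketch divides by zero if both raters agree perfectly by always using one label, which a production version should guard against.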
4. Outcome linkage — The final analytic step links teacher_id from the classification CSV to student outcome scores from the RCT database, enabling multilevel models of the form: controlling for student/school baseline, do high-scaffolding prompts predict greater learning gains?
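The linkage step itself amounts to a join plus aggregation before the multilevel model (e.g. statsmodels MixedLM) takes over. A toy pandas sketch, with column names following the schemas above and wholly invented data:

```python
import pandas as pd

# Classified prompts (one row per chatbot) and student outcomes (one row per student)
prompts = pd.DataFrame({
    'teacher_id': ['t1', 't2', 't3'],
    'scaffolding_depth': ['alto', 'bajo', 'alto'],
})
outcomes = pd.DataFrame({
    'teacher_id': ['t1', 't1', 't2', 't3'],
    'learning_gain': [0.8, 0.6, 0.2, 0.9],
})

# Attach each teacher's prompt features to their students' outcome rows
linked = outcomes.merge(prompts, on='teacher_id', how='left')
print(linked.groupby('scaffolding_depth')['learning_gain'].mean())
```

The resulting student-level frame, with school_id and condition carried along from the metadata, is exactly the input shape a multilevel regression expects.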