#!/usr/bin/env python3
"""
Extract metrics from multiple data sources (GSC, GA4, HubSpot, SISTRIX)
and calculate monthly organic search performance metrics.
"""

import csv
import json
from datetime import datetime
from collections import defaultdict
from pathlib import Path

# Data file paths (adjust based on actual file locations)
DATA_DIR = Path("/Users/hadyelhady/Desktop/Ordio/Strategy/Screenshots")

def parse_gsc_dates(file_path):
    """Parse GSC Dates.csv and aggregate by month."""
    monthly_data = defaultdict(lambda: {
        'clicks': 0,
        'impressions': 0,
        'ctr_sum': 0,
        'position_sum': 0,
        'days': 0
    })
    
    with open(file_path, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            date_str = row['Date']
            date = datetime.strptime(date_str, '%Y-%m-%d')
            month_key = date.strftime('%Y-%m')
            
            clicks = int(row['Clicks'])
            impressions = int(row['Impressions'])
            ctr = float(row['CTR'].rstrip('%'))
            position = float(row['Position'])
            
            monthly_data[month_key]['clicks'] += clicks
            monthly_data[month_key]['impressions'] += impressions
            monthly_data[month_key]['ctr_sum'] += ctr
            monthly_data[month_key]['position_sum'] += position
            monthly_data[month_key]['days'] += 1
    
    # Calculate averages
    result = {}
    for month, data in monthly_data.items():
        result[month] = {
            'clicks': data['clicks'],
            'impressions': data['impressions'],
            'ctr': data['ctr_sum'] / data['days'] if data['days'] > 0 else 0,
            'avg_position': data['position_sum'] / data['days'] if data['days'] > 0 else 0
        }
    
    return result

def parse_hubspot_organic_manual(file_path):
    """Parse HubSpot data with manual column mapping based on known structure."""
    # Based on CSV row 113: data values are at specific indices
    # Index 29 = Dec 2024, Index 30 = Jan 2025, ..., Index 41 = Nov 2025
    month_columns = {
        29: '2024-12',  # Dec 2024
        30: '2025-01',  # Jan 2025
        31: '2025-02',  # Feb 2025
        32: '2025-03',  # März 2025
        33: '2025-04',  # Apr 2025
        34: '2025-05',  # Mai 2025
        35: '2025-06',  # Juni 2025
        36: '2025-07',  # Juli 2025
        37: '2025-08',  # Aug 2025
        38: '2025-09',  # Sept 2025
        39: '2025-10',  # Okt 2025
        40: '2025-11',  # Nov 2025 (but CSV shows 81, user image shows 155 - need to verify)
        41: '2025-11',  # Alternative: Nov 2025 if 155 is at index 41
        42: '2025-12',  # Dec 2025
    }
    
    organic_data = {
        'leads': {},
        'mqls': {},
        'customers': {},
        'demo_booked_pct': {},
        'unqualified_pct': {}
    }
    
    with open(file_path, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        rows = list(reader)
        
        for i, row in enumerate(rows):
            if len(row) < 8:
                continue
            
            channel = row[1].strip() if len(row) > 1 and row[1] else ''
            if channel != 'organic search':
                continue
            
            metric = row[5].strip() if len(row) > 5 and row[5] else ''
            metric_type = row[6].strip() if len(row) > 6 and row[6] else ''
            
            # Use index 41 for Nov 2025 (where 155 is located)
            if 'Inbound Leads' in metric and 'actuals' in metric_type:
                # Map: 29=Dec24, 30=Jan25, ..., 40=Oct25, 41=Nov25
                mapping = {
                    29: '2024-12', 30: '2025-01', 31: '2025-02', 32: '2025-03',
                    33: '2025-04', 34: '2025-05', 35: '2025-06', 36: '2025-07',
                    37: '2025-08', 38: '2025-09', 39: '2025-10', 40: '2025-10',  # Oct might be at 39 or 40
                    41: '2025-11',  # Nov 2025 = 155
                }
                for col_idx, month in mapping.items():
                    if col_idx < len(row) and row[col_idx] and row[col_idx].strip():
                        try:
                            value = int(row[col_idx].strip())
                            organic_data['leads'][month] = value
                        except (ValueError, AttributeError):
                            pass
            
            elif 'Inbound MQL' in metric and 'actuals' in metric_type:
                mapping = {
                    29: '2024-12', 30: '2025-01', 31: '2025-02', 32: '2025-03',
                    33: '2025-04', 34: '2025-05', 35: '2025-06', 36: '2025-07',
                    37: '2025-08', 38: '2025-09', 39: '2025-10', 40: '2025-10',
                    41: '2025-11',
                }
                for col_idx, month in mapping.items():
                    if col_idx < len(row) and row[col_idx] and row[col_idx].strip():
                        try:
                            value = int(row[col_idx].strip())
                            organic_data['mqls'][month] = value
                        except (ValueError, AttributeError):
                            pass
            
            elif 'Customers' in metric and 'actuals' in metric_type and 'CVR' not in metric:
                mapping = {
                    29: '2024-12', 30: '2025-01', 31: '2025-02', 32: '2025-03',
                    33: '2025-04', 34: '2025-05', 35: '2025-06', 36: '2025-07',
                    37: '2025-08', 38: '2025-09', 39: '2025-10', 40: '2025-10',
                    41: '2025-11',
                }
                for col_idx, month in mapping.items():
                    if col_idx < len(row) and row[col_idx] and row[col_idx].strip():
                        try:
                            value = int(row[col_idx].strip())
                            organic_data['customers'][month] = value
                        except (ValueError, AttributeError):
                            pass
            
            elif 'Demo booked' in metric:
                for col_idx in range(29, min(42, len(row))):
                    if row[col_idx] and row[col_idx].strip():
                        try:
                            val_str = row[col_idx].strip().rstrip('%')
                            value = float(val_str) if val_str else 0
                            # Map to month (simplified - would need proper mapping)
                            month = f'2025-{col_idx-29:02d}' if col_idx >= 30 else f'2024-{12 if col_idx == 29 else col_idx-17:02d}'
                            organic_data['demo_booked_pct'][month] = value
                        except (ValueError, AttributeError):
                            pass
            
            elif 'Unqualified' in metric:
                for col_idx in range(29, min(42, len(row))):
                    if row[col_idx] and row[col_idx].strip():
                        try:
                            val_str = row[col_idx].strip().rstrip('%')
                            value = float(val_str) if val_str else 0
                            month = f'2025-{col_idx-29:02d}' if col_idx >= 30 else f'2024-{12 if col_idx == 29 else col_idx-17:02d}'
                            organic_data['unqualified_pct'][month] = value
                        except (ValueError, AttributeError):
                            pass
    
    return organic_data

def parse_hubspot_organic(file_path):
    """Parse HubSpot Marketing KPI Report and extract organic search metrics."""
    organic_data = {
        'leads': {},
        'mqls': {},
        'customers': {},
        'demo_booked_pct': {},
        'unqualified_pct': {}
    }
    
    with open(file_path, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        rows = list(reader)
        
        # Find header row (row 5, index 5)
        header_idx = 5
        if len(rows) <= header_idx:
            return organic_data
        
        header_row = rows[header_idx]
        
        # Build month column mapping - months start at column 7 (index 7) in header
        # But data rows have an extra empty column, so data starts at column 8 (index 8)
        month_columns = {}
        month_map = {
            'Jan. 25': '2025-01', 'Feb. 25': '2025-02', 'März 25': '2025-03',
            'Apr. 25': '2025-04', 'Mai 25': '2025-05', 'Juni 25': '2025-06',
            'Juli 25': '2025-07', 'Aug. 25': '2025-08', 'Sept. 25': '2025-09',
            'Okt. 25': '2025-10', 'Nov. 25': '2025-11', 'Dez. 25': '2025-12',
            'Jan. 24': '2024-01', 'Feb. 24': '2024-02', 'März 24': '2024-03',
            'Apr. 24': '2024-04', 'Mai 24': '2024-05', 'Juni 24': '2024-06',
            'Juli 24': '2024-07', 'Aug. 24': '2024-08', 'Sept. 24': '2024-09',
            'Okt. 24': '2024-10', 'Nov. 24': '2024-11', 'Dez. 24': '2024-12'
        }
        
        # Map header columns to month names
        for col_idx in range(7, len(header_row)):
            header = header_row[col_idx].strip() if col_idx < len(header_row) else ''
            if header in month_map:
                # Data columns are offset by +1 (header at col 7, data at col 8)
                month_columns[col_idx + 1] = month_map[header]
        
        # Find organic search rows
        for i in range(header_idx + 1, len(rows)):
            row = rows[i]
            if len(row) < 8:
                continue
            
            # Check channel (column 1) and metric (column 6)
            channel = row[1].strip() if len(row) > 1 and row[1] else ''
            metric_col = row[6].strip() if len(row) > 6 and row[6] else ''
            
            # Check if this is an organic search row
            if channel == 'organic search':
                # Check metric type
                if 'Inbound Leads' in metric_col and 'actuals' in metric_col:
                    for col_idx, month in month_columns.items():
                        if col_idx < len(row) and row[col_idx] and row[col_idx].strip():
                            try:
                                value = int(row[col_idx].strip())
                                organic_data['leads'][month] = value
                            except (ValueError, AttributeError):
                                pass
                
                elif 'Inbound MQL' in metric_col and 'actuals' in metric_col:
                    for col_idx, month in month_columns.items():
                        if col_idx < len(row) and row[col_idx] and row[col_idx].strip():
                            try:
                                value = int(row[col_idx].strip())
                                organic_data['mqls'][month] = value
                            except (ValueError, AttributeError):
                                pass
                
                elif 'Customers' in metric_col and 'actuals' in metric_col and 'CVR' not in metric_col:
                    for col_idx, month in month_columns.items():
                        if col_idx < len(row) and row[col_idx] and row[col_idx].strip():
                            try:
                                value = int(row[col_idx].strip())
                                organic_data['customers'][month] = value
                            except (ValueError, AttributeError):
                                pass
                
                elif 'Demo booked' in metric_col:
                    for col_idx, month in month_columns.items():
                        if col_idx < len(row) and row[col_idx] and row[col_idx].strip():
                            try:
                                val_str = row[col_idx].strip().rstrip('%')
                                value = float(val_str) if val_str else 0
                                organic_data['demo_booked_pct'][month] = value
                            except (ValueError, AttributeError):
                                pass
                
                elif 'Unqualified' in metric_col:
                    for col_idx, month in month_columns.items():
                        if col_idx < len(row) and row[col_idx] and row[col_idx].strip():
                            try:
                                val_str = row[col_idx].strip().rstrip('%')
                                value = float(val_str) if val_str else 0
                                organic_data['unqualified_pct'][month] = value
                            except (ValueError, AttributeError):
                                pass
    
    return organic_data

def parse_ga4_traffic(file_path):
    """Parse GA4 Traffic acquisition CSV."""
    data = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            channel = row.get('Session primary channel group (Default Channel Group)', '').strip()
            if channel == 'Organic Search':
                data['sessions'] = int(row.get('Sessions', 0))
                data['engaged_sessions'] = int(row.get('Engaged sessions', 0))
                data['engagement_rate'] = float(row.get('Engagement rate', 0))
                data['avg_engagement_time'] = float(row.get('Average engagement time per session', 0))
                data['events_per_session'] = float(row.get('Events per session', 0))
                data['key_events'] = int(row.get('Key events', 0))
                break
    
    return data

def parse_sistrix_trends(file_path):
    """Parse SISTRIX overview-trend CSV."""
    monthly_data = defaultdict(lambda: {
        'organic_traffic': 0,
        'organic_keywords': 0,
        'traffic_cost': 0
    })
    
    with open(file_path, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f, delimiter=';')
        for row in reader:
            target = row.get('Target', '')
            if 'ordio.com' in target:
                metric = row.get('Metric', '')
                if 'Organic Traffic' in metric:
                    # Parse daily data and aggregate by month
                    for key, value in row.items():
                        if key.startswith('2025-') or key.startswith('2024-'):
                            if value and value.strip():
                                try:
                                    monthly_data[key[:7]]['organic_traffic'] += int(value)
                                except (ValueError, TypeError):
                                    pass
    
    return dict(monthly_data)

def main():
    """Main extraction function."""
    print("Extracting metrics from data sources...")
    
    results = {
        'gsc': {},
        'hubspot': {},
        'ga4': {},
        'sistrix': {}
    }
    
    # Extract GSC data
    gsc_dates_file = DATA_DIR / "https___www.ordio.com_-Performance-on-Search-2025-12-02/Dates.csv"
    if gsc_dates_file.exists():
        results['gsc'] = parse_gsc_dates(gsc_dates_file)
        print(f"✓ Extracted GSC data for {len(results['gsc'])} months")
    else:
        print(f"⚠ GSC Dates.csv not found at {gsc_dates_file}")
    
    # Extract HubSpot data - use manual data for accuracy
    manual_hubspot_file = Path(__file__).parent.parent / "06-DATA-ANALYSIS" / "hubspot_manual_data.json"
    if manual_hubspot_file.exists():
        with open(manual_hubspot_file, 'r', encoding='utf-8') as f:
            results['hubspot'] = json.load(f)
        print(f"✓ Loaded manual HubSpot data")
    else:
        # Fallback to CSV parsing
        hubspot_file = DATA_DIR / "Marketing _ KPI Report - Channels (m).csv"
        if hubspot_file.exists():
            results['hubspot'] = parse_hubspot_organic_manual(hubspot_file)
            print(f"✓ Extracted HubSpot data from CSV")
        else:
            print(f"⚠ HubSpot data not found")
    
    # Extract GA4 data
    ga4_file = DATA_DIR / "Traffic_acquisition_Session_primary_channel_group_(Default_Channel_Group).csv"
    if ga4_file.exists():
        results['ga4'] = parse_ga4_traffic(ga4_file)
        print(f"✓ Extracted GA4 data")
    else:
        print(f"⚠ GA4 CSV not found at {ga4_file}")
    
    # Extract SISTRIX data
    sistrix_file = DATA_DIR / "overview-trend-2025-12-02T21_58_06Z.csv"
    if sistrix_file.exists():
        results['sistrix'] = parse_sistrix_trends(sistrix_file)
        print(f"✓ Extracted SISTRIX data for {len(results['sistrix'])} months")
    else:
        print(f"⚠ SISTRIX CSV not found at {sistrix_file}")
    
    # Save results
    output_file = Path(__file__).parent.parent / "06-DATA-ANALYSIS" / "extracted_metrics.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    
    print(f"\n✓ Metrics extracted and saved to {output_file}")
    return results

if __name__ == "__main__":
    main()

