#!/usr/bin/env python3
"""
Validate data consistency across different sources (GSC, GA4, HubSpot, SISTRIX).
"""

import json
from pathlib import Path

def validate_data_consistency(extracted_data, conversion_data):
    """Cross-reference data from different sources.

    Runs four independent consistency checks (GSC-vs-GA4 alignment,
    overall conversion-rate sanity, month coverage, per-month anomalies)
    and collects human-readable findings.

    Args:
        extracted_data: dict with optional 'gsc' (mapping of entries with
            a 'clicks' count each) and 'ga4' (with a 'sessions' total) keys.
        conversion_data: dict with optional 'overall' (conversion-rate
            percentages) and 'monthly' (per-month metrics) keys.

    Returns:
        dict with 'warnings', 'errors', and 'notes' lists of messages.
    """
    validation_results = {
        'warnings': [],
        'errors': [],
        'notes': []
    }

    # Each helper appends its findings to validation_results in place.
    _check_traffic_alignment(
        extracted_data.get('gsc', {}),
        extracted_data.get('ga4', {}),
        validation_results,
    )
    _check_conversion_rates(conversion_data.get('overall', {}), validation_results)

    monthly = conversion_data.get('monthly', {})
    _check_month_coverage(monthly, validation_results)
    _check_monthly_anomalies(monthly, validation_results)

    return validation_results


def _check_traffic_alignment(gsc_data, ga4_data, validation_results):
    """Compare total GSC clicks against GA4 sessions; note or warn on the ratio."""
    total_gsc_clicks = sum(v.get('clicks', 0) for v in gsc_data.values())
    total_ga4_sessions = ga4_data.get('sessions', 0)

    # Only meaningful when both sources report non-zero traffic.
    if total_gsc_clicks > 0 and total_ga4_sessions > 0:
        clicks_to_sessions_ratio = total_ga4_sessions / total_gsc_clicks
        # Clicks and sessions are different metrics, so only flag large drift.
        if clicks_to_sessions_ratio < 0.8 or clicks_to_sessions_ratio > 1.5:
            validation_results['warnings'].append(
                f"GSC clicks ({total_gsc_clicks}) vs GA4 sessions ({total_ga4_sessions}) ratio is {clicks_to_sessions_ratio:.2f}. "
                "Expected ratio closer to 1.0. This is normal as clicks and sessions are different metrics."
            )
        else:
            validation_results['notes'].append(
                f"GSC clicks and GA4 sessions are reasonably aligned (ratio: {clicks_to_sessions_ratio:.2f})"
            )


def _check_conversion_rates(overall, validation_results):
    """Sanity-check overall funnel conversion rates (given as percentages)."""
    traffic_to_lead = overall.get('traffic_to_lead', 0)
    lead_to_mql = overall.get('lead_to_mql', 0)
    mql_to_customer = overall.get('mql_to_customer', 0)

    if traffic_to_lead > 10:
        validation_results['warnings'].append(
            f"Traffic to Lead conversion rate ({traffic_to_lead}%) seems high. Verify data accuracy."
        )
    elif traffic_to_lead < 0.01:
        validation_results['warnings'].append(
            f"Traffic to Lead conversion rate ({traffic_to_lead}%) seems very low. Check attribution."
        )

    # A rate above 100% is logically impossible, so this is a hard error.
    if lead_to_mql > 100:
        validation_results['errors'].append(
            f"Lead to MQL conversion rate ({lead_to_mql}%) exceeds 100%. Data error detected."
        )

    if mql_to_customer > 50:
        validation_results['warnings'].append(
            f"MQL to Customer conversion rate ({mql_to_customer}%) seems high. Verify data accuracy."
        )


def _check_month_coverage(monthly, validation_results):
    """Warn when any expected reporting month (Jan-Nov 2025) has no data."""
    expected_months = [f'2025-{i:02d}' for i in range(1, 12)]  # Jan-Nov 2025
    missing_months = [m for m in expected_months if m not in monthly]

    if missing_months:
        validation_results['warnings'].append(
            f"Missing data for months: {', '.join(missing_months)}"
        )


def _check_monthly_anomalies(monthly, validation_results):
    """Warn on months reporting leads without any sessions (attribution gap)."""
    for month, data in monthly.items():
        if data.get('sessions', 0) == 0 and data.get('leads', 0) > 0:
            validation_results['warnings'].append(
                f"{month}: Leads ({data['leads']}) but no sessions. Possible attribution issue."
            )

def main():
    """Load metric/conversion JSON files, validate them, and report results.

    Reads extracted_metrics.json and conversion_analysis.json from the
    sibling 06-DATA-ANALYSIS directory, writes validation_report.json back
    there, and prints errors, warnings, and notes to stdout.

    Returns:
        The validation results dict, or None when an input file is missing.
    """
    print("Validating data consistency...")

    # Both inputs and the output live in the sibling data-analysis directory;
    # resolve it once instead of rebuilding the path per file.
    script_dir = Path(__file__).parent.absolute()
    data_dir = script_dir.parent / "06-DATA-ANALYSIS"
    extracted_file = data_dir / "extracted_metrics.json"
    conversion_file = data_dir / "conversion_analysis.json"

    # Guard clauses: each input is produced by an upstream script.
    if not extracted_file.exists():
        print(f"Error: {extracted_file} not found. Run extract_metrics.py first.")
        return

    if not conversion_file.exists():
        print(f"Error: {conversion_file} not found. Run calculate_conversions.py first.")
        return

    with open(extracted_file, 'r', encoding='utf-8') as f:
        extracted_data = json.load(f)

    with open(conversion_file, 'r', encoding='utf-8') as f:
        conversion_data = json.load(f)

    # Validate
    validation_results = validate_data_consistency(extracted_data, conversion_data)

    # Save results (ensure_ascii=False keeps any non-ASCII messages readable).
    output_file = data_dir / "validation_report.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(validation_results, f, indent=2, ensure_ascii=False)

    print(f"\n✓ Validation complete. Results saved to {output_file}")

    # Print findings in descending severity: errors, then warnings, then notes.
    if validation_results['errors']:
        print("\n❌ ERRORS FOUND:")
        for error in validation_results['errors']:
            print(f"  - {error}")

    if validation_results['warnings']:
        print("\n⚠ WARNINGS:")
        for warning in validation_results['warnings']:
            print(f"  - {warning}")

    if validation_results['notes']:
        print("\n✓ NOTES:")
        for note in validation_results['notes']:
            print(f"  - {note}")

    return validation_results

if __name__ == "__main__":
    main()