import os
import re

# Path of the mail-responder queue CSV that this script repairs in place.
FILE_QUEUE = "/var/www/html/repondeur_mail_grok/data/queue.csv"

def repair_and_dedup(file_path=None):
    """Repair a corrupted queue CSV and deduplicate its rows by email.

    The queue file sometimes ends up with rows glued together on a single
    physical line (a row's last field followed by a space and the next
    row's numeric id).  This routine:

    1. normalizes the newline after the header,
    2. re-inserts newlines before ids that got stuck to the previous row,
    3. reassembles logical rows (a row starts with a 6+ digit id and may
       span several lines when the body field contains newlines),
    4. keeps only the newest row (highest numeric id) per email address,
    5. rewrites the file atomically (temp file + ``os.replace``) so a
       crash mid-write cannot destroy the queue.

    Args:
        file_path: Path of the CSV queue to repair.  Defaults to the
            module-level ``FILE_QUEUE`` path when omitted.
    """
    path = FILE_QUEUE if file_path is None else file_path
    if not os.path.exists(path):
        print("File not found.")
        return

    # errors='ignore': the file is known to contain occasional mojibake;
    # dropping undecodable bytes is preferable to aborting the repair.
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        content = f.read()

    header = "id,email,subject,date_received,tag_zimbra,grok_response,status,shop_id,client_id,last_order_date,last_order_ref,body,sender_name"

    # 1. Ensure exactly one newline separates the header from the first row.
    if content.startswith(header):
        content = header + "\n" + content[len(header):].lstrip()

    # 2. Re-insert newlines before ids stuck to the previous row.
    #    Heuristic: a space immediately followed by a 6+ digit id and a
    #    comma marks the start of a logical row.
    content = re.sub(r' (?=\d{6,},)', "\n", content)

    rows = _reassemble_rows(content)
    print(f"Detected {len(rows)} potential rows.")

    deduped = _dedup_by_email(rows)
    print(f"Deduplicated to {len(deduped)} unique emails.")

    if deduped:
        # Write to a sibling temp file first, then atomically swap it in,
        # so the original queue survives a crash mid-write.
        tmp_path = path + ".tmp"
        with open(tmp_path, 'w', encoding='utf-8') as f:
            f.write(header + "\n")
            for entry in deduped.values():
                f.write(entry['raw'].strip() + "\n")
        os.replace(tmp_path, path)
        print("SUCCESS: Queue cleaned.")
    else:
        print("No rows found to clean.")


# A logical row starts with a 6+ digit numeric id followed by a comma.
_ROW_START = re.compile(r'^\d{6,},')


def _reassemble_rows(content):
    """Group physical lines into logical rows.

    A line matching ``_ROW_START`` opens a new row; any other non-empty
    line is a continuation of the current row's multi-line body field.
    Lines seen before the first id line (the header, stray fragments)
    are dropped on purpose — the header is rewritten on output.
    """
    rows = []
    current = None
    for line in content.split("\n"):
        if not line.strip():
            continue
        if _ROW_START.match(line):
            if current is not None:
                rows.append(current)
            current = line
        elif current is not None:
            # Continuation of a multi-line body field.
            current += "\n" + line
        # else: header or stray prefix line — intentionally discarded.
    if current is not None:
        rows.append(current)
    return rows


def _dedup_by_email(rows):
    """Keep only the newest row (highest numeric id) per email address.

    Returns a dict mapping lowercased email -> {'id': uid, 'raw': row
    text}, in first-seen email order.  Rows without a plausible email in
    the second column are ignored.
    """
    deduped = {}
    for raw in rows:
        parts = raw.split(',')
        if len(parts) > 1:
            uid = parts[0].strip()
            email = parts[1].strip().lower()
            if email and "@" in email:
                # uid matched \d{6,} at the row start, so int() is safe.
                if email not in deduped or int(uid) > int(deduped[email]['id']):
                    deduped[email] = {'id': uid, 'raw': raw}
    return deduped

if __name__ == "__main__":
    # Run the repair as a standalone script against the default queue file.
    repair_and_dedup()
