<?php
// scripts/deduplicate_content.php

// Removes duplicate rows from data/queue.csv.
//
// A row is a duplicate when an earlier kept row has the same ID (column 1) or
// the same content fingerprint (email + subject + first 100 bytes of body).
// The first occurrence is kept and later ones are dropped, so the original
// row order is preserved. Rows with fewer than 3 columns cannot be
// fingerprinted; they are passed through unchanged rather than deleted.
// The file is only rewritten when at least one duplicate was found.
$file = __DIR__ . '/../data/queue.csv';
if (!file_exists($file)) {
    echo "File not found\n";
    exit(1); // Non-zero status so cron/CI wrappers notice the failure
}

$handle = fopen($file, 'r');
if (!$handle) {
    echo "Cannot open file\n";
    exit(1);
}

$header = fgetcsv($handle);
$seenIds = [];          // id => true for every kept row
$seenFingerprints = []; // fingerprint => true for every kept row
$keptRows = [];         // rows to write back, in original order
$totalRows = 0;
$duplicatesRemoved = 0;

while (($row = fgetcsv($handle)) !== false) {
    if (count($row) < 3) {
        // Malformed: not eligible for dedup. Keep it (unless it is just a
        // blank line, which fgetcsv reports as [null]) so that running this
        // script never silently deletes unrelated data.
        if ($row !== [null]) {
            $keptRows[] = $row;
        }
        continue;
    }
    $totalRows++;

    $id = $row[0];
    $email = $row[1];
    $subject = $row[2];
    $body = $row[9] ?? ''; // Body is column 10 (index 9)

    // Create a fingerprint of the content
    $fingerprint = md5($email . '|' . $subject . '|' . substr($body, 0, 100));

    if (isset($seenIds[$id]) || isset($seenFingerprints[$fingerprint])) {
        // Duplicate of a row we already kept: drop it. Its keys are NOT
        // registered, because the row will not exist in the output.
        $duplicatesRemoved++;
        continue;
    }

    $seenIds[$id] = true;
    $seenFingerprints[$fingerprint] = true;
    $keptRows[] = $row;
}
fclose($handle);

if ($duplicatesRemoved > 0) {
    // Write to a sibling temp file and rename it over the original, so a
    // failed or interrupted write never leaves a truncated queue behind.
    $tmpFile = $file . '.tmp';
    $out = fopen($tmpFile, 'w');
    if (!$out) {
        echo "Cannot open file\n";
        exit(1);
    }

    $ok = fputcsv($out, $header) !== false;
    foreach ($keptRows as $row) {
        $ok = $ok && fputcsv($out, $row) !== false;
    }
    $ok = fclose($out) && $ok;

    if (!$ok || !rename($tmpFile, $file)) {
        if (file_exists($tmpFile)) {
            unlink($tmpFile);
        }
        echo "Cannot write file\n";
        exit(1);
    }

    echo "Ultra-Deduplication Complete: Removed $duplicatesRemoved duplicates from $totalRows rows.\n";
} else {
    echo "No content duplicates found in $totalRows rows.\n";
}
