# netgescon-master/scripts/Script fatti per prova e per ora sospesi/import_ibrido_csv_mdb_mysql.py
# (file-listing residue from export: 471 lines, 22 KiB, Python)
#!/usr/bin/env python3
"""
IMPORTAZIONE IBRIDA CSV/MDB -> MYSQL PER ARCHIVI GESCON (v18 - 2025-06-02)
========================================================================================
## STORICO FUNZIONALE, PATCH E STRATEGIA DI LAVORO (aggiornato a v18)
----------------------------------------------------------------------------------------
- **v1-v7**: Importazione CSV/MDB, deduplica via hash, logging, patch su colonne 'id', 'anno', 'codice_condominio', 'id_amministratore'.
- **v8**: Gestione limiti row size/colonne MySQL, split tabella con suffisso (_002, ...), logging colonne_problematiche.json.
- **v9**: Forzatura campi "note", "memo" ecc. a TEXT/LONGTEXT, auto tipi colonne, patch naming tabella MySQL <=64 char.
- **v10**: Mapping tabella <-> origine in JSON, importazione incrementale, logging dettagliato.
- **v11**: Importazione dati PRIMA da CSV poi da MDB, aggiornamento dati esistenti (solo nuovi hash), processi distinti ma integrati.
- **v12-v15**:
- Nome tabella GESCON: <CF_AMMINISTRATORE>_<CF_CONDOMINIO>_<CARTELLA_COND>_<ANNO>_<IDFILE>_<HASH8> (progressivo se split).
- Per ogni file, ID numerico progressivo in struttura_File_GESCON (MySQL), mapping stabile per archivi multipli.
- ANNO dalla cartella padre del file.
- Import Stabili.csv (Rosetta) per mapping amministratore/condominio/cartella.
- Mapping tabella/colonne in struttura_File_GESCON (MySQL) e mapping_tabelle.json.
- Logging avanzato e gestione errori robusta.
- Importazione PRIMA da CSV, poi da MDB (solo nuovi record).
- Chiamate mdb-export corrette: "-H" solo per header, "--no-header -d ," per i dati.
- **v16-v18** (2025-06-02):
- **PATCH**: Uso corretto di mdb-export per header e dati (mai argomento 'csv').
- **PATCH**: Deduplica e inserimento solo dati nuovi da MDB dopo i CSV.
- **PATCH**: Logging dettagliato (file, colonne problematiche, mapping, errori).
- **PATCH**: Gestione robusta di split tabella, path, errori, tabelle vuote.
- **NOTE**: Tutte le patch e note storiche mantenute.
========================================================================================
## DIPENDENZE
- pymysql
- mdbtools (per MDB)
"""
import os
import pymysql
import subprocess
import csv
import json
import hashlib
import datetime
import re
# --- Paths, configuration and constants --------------------------------------
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
CONFIG_PATH = os.path.join(SCRIPT_DIR, "../agent_config.json")

# Configuration is shared with the other agent scripts via agent_config.json.
with open(CONFIG_PATH) as f:
    config = json.load(f)

MYSQL_HOST = config.get("MySQLHost", "localhost")
MYSQL_DB = config.get("MySQLDatabase", "netgescon")
MYSQL_USER = config.get("MySQLUser", "root")
MYSQL_PW = config.get("MySQLPassword", "password")
INPUT_ROOT = config.get("OutputDirectory", os.path.join(SCRIPT_DIR, "../estratti"))

# Log files (JSON-lines event log plus two cumulative JSON documents).
LOGDIR = os.path.join(SCRIPT_DIR, "../log")
os.makedirs(LOGDIR, exist_ok=True)
LOGFILE = os.path.join(LOGDIR, "import_ibrido_csv_mdb_mysql.jsonlog")
MAPPING_TABELLE_LOG = os.path.join(LOGDIR, "mapping_tabelle.json")
COLONNE_PROBLEMATICHE_LOG = os.path.join(LOGDIR, "colonne_problematiche.json")

# MySQL table names used by the importer itself.
STRUTTURA_FILE_GESCON = "struttura_File_GESCON"
ROSETTA_STABILI = "rossetta_stabili"
def log_event(event, **kwargs):
    """Append a structured record to the JSON-lines log and echo it to stdout."""
    record = {
        "event": event,
        "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        **kwargs,
    }
    print("[LOG]", record)
    with open(LOGFILE, "a") as fh:
        fh.write(json.dumps(record, ensure_ascii=False) + "\n")
def log_colonna_problematica(tablename, colname, reason):
    """Record a column that had to be skipped (e.g. MySQL row-size limit)
    in the cumulative colonne_problematiche.json document."""
    entry = {
        "table": tablename,
        "column": colname,
        "reason": reason,
        "timestamp": datetime.datetime.now().isoformat(),
    }
    # Best-effort read of the existing log; a missing or corrupt file
    # simply restarts the list.
    try:
        if os.path.exists(COLONNE_PROBLEMATICHE_LOG):
            with open(COLONNE_PROBLEMATICHE_LOG, "r") as fh:
                entries = json.load(fh)
        else:
            entries = []
    except Exception:
        entries = []
    entries.append(entry)
    with open(COLONNE_PROBLEMATICHE_LOG, "w") as fh:
        json.dump(entries, fh, indent=2, ensure_ascii=False)
def log_mapping_tabella(mapping):
    """Append one table<->source mapping record to mapping_tabelle.json."""
    # Best-effort read; a missing or corrupt file restarts the list.
    try:
        if os.path.exists(MAPPING_TABELLE_LOG):
            with open(MAPPING_TABELLE_LOG, "r") as fh:
                records = json.load(fh)
        else:
            records = []
    except Exception:
        records = []
    records.append(mapping)
    with open(MAPPING_TABELLE_LOG, "w") as fh:
        json.dump(records, fh, indent=2, ensure_ascii=False)
def connect_mysql():
    """Open an autocommitting utf8mb4 connection to the configured MySQL DB."""
    return pymysql.connect(
        host=MYSQL_HOST,
        user=MYSQL_USER,
        password=MYSQL_PW,
        database=MYSQL_DB,
        charset="utf8mb4",
        autocommit=True,
    )
def safe_name(s, max_len=None):
    """Lower-case *s*, replacing every non-alphanumeric character with '_';
    optionally truncate to *max_len* characters (a falsy max_len is ignored)."""
    cleaned = re.sub(r"[^a-zA-Z0-9_]", "_", str(s)).lower()
    return cleaned[:max_len] if max_len else cleaned
def get_struttura_file_id(cur, file_name):
    """Return the zero-padded (4-digit) numeric id of *file_name* in the
    struttura_File_GESCON registry, creating the registry table and/or the
    row on first use.

    cur: an open pymysql cursor (autocommit connection).
    """
    cur.execute(f"""CREATE TABLE IF NOT EXISTS `{STRUTTURA_FILE_GESCON}` (
        idfile INT AUTO_INCREMENT PRIMARY KEY,
        file_name VARCHAR(255) UNIQUE
    )""")
    cur.execute(f"SELECT idfile FROM `{STRUTTURA_FILE_GESCON}` WHERE file_name=%s", (file_name,))
    row = cur.fetchone()
    if row:
        return str(row[0]).zfill(4)
    cur.execute(f"INSERT INTO `{STRUTTURA_FILE_GESCON}` (file_name) VALUES (%s)", (file_name,))
    # lastrowid yields the AUTO_INCREMENT id of the row just inserted —
    # avoids the original's second SELECT, which was both an extra round
    # trip and a race with concurrent importers.
    return str(cur.lastrowid).zfill(4)
def infer_type(val, prefer_text=False, is_longtext=False):
    """Infer a MySQL column type from a single sample value.

    Precedence: is_longtext -> LONGTEXT, prefer_text -> TEXT, empty/None ->
    VARCHAR(255); then integer / float detection (commas accepted as decimal
    separator), TEXT for strings over 255 chars, VARCHAR(255) otherwise.
    """
    if is_longtext:
        return "LONGTEXT"
    if prefer_text:
        return "TEXT"
    if val is None or str(val).strip() == "":
        return "VARCHAR(255)"
    s = str(val)
    if re.fullmatch(r"-?\d+", s):
        n = int(s)
        # The original always returned INT, so values outside MySQL's signed
        # INT range failed on insert; promote to BIGINT (and fall back to
        # VARCHAR for values even BIGINT cannot hold).
        if -2147483648 <= n <= 2147483647:
            return "INT"
        if -9223372036854775808 <= n <= 9223372036854775807:
            return "BIGINT"
        return "VARCHAR(255)"
    try:
        float(s.replace(",", "."))
        return "FLOAT"
    except ValueError:
        pass
    if len(s) > 255:
        return "TEXT"
    return "VARCHAR(255)"
def fix_header_conflicts(header):
    """Rename source columns that would collide (case-insensitively) with the
    synthetic columns the importer adds to every table."""
    CONFLICT_COLS = {
        "id": "ID_CSV",
        "anno": "Anno_CSV",
        "codice_condominio": "Codice_Condominio_CSV",
        "id_amministratore": "ID_Amministratore_CSV",
    }
    return [CONFLICT_COLS.get(col.lower(), col) for col in header]
def fix_row_conflicts(row):
    """Apply the same case-insensitive renaming as fix_header_conflicts to a
    row dict's keys, leaving the values untouched."""
    CONFLICT_COLS = {
        "id": "ID_CSV",
        "anno": "Anno_CSV",
        "codice_condominio": "Codice_Condominio_CSV",
        "id_amministratore": "ID_Amministratore_CSV",
    }
    return {CONFLICT_COLS.get(k.lower(), k): v for k, v in row.items()}
# Columns whose names suggest free text: they are typed LONGTEXT and are the
# first candidates to drop when MySQL's row-size limit (error 1118) is hit.
# The original used two slightly different regex lists for these two purposes
# ("dettaglio" was missing from the LONGTEXT one); they are unified here.
_FREETEXT_COL_RE = re.compile(
    r"(note|memo|descr|testo|osserv|dettaglio|lettera|comment|mess|causale)",
    re.IGNORECASE,
)


def create_table(cur, table_name, header, sample_row, prefer_text=False, skip_cols=None, depth=0, max_cols=50):
    """Create MySQL table *table_name* for the given header and sample row.

    Every table also gets synthetic columns (id, id_amministratore,
    codice_condominio, cartella_condominio, anno) plus _hash_row (dedup key,
    UNIQUE) and _imported_at. At most *max_cols* source columns are used.

    On MySQL error 1118 (row size too large) one column is dropped —
    preferring free-text columns — the drop is logged via
    log_colonna_problematica, and the creation is retried recursively.

    Returns the list of columns skipped because of MySQL limits
    ([] when nothing had to be dropped).
    """
    if skip_cols is None:
        skip_cols = []
    fields_types = [
        "`id` INT AUTO_INCREMENT PRIMARY KEY",
        "`id_amministratore` VARCHAR(16)",
        "`codice_condominio` VARCHAR(16)",
        "`cartella_condominio` VARCHAR(4)",
        "`anno` VARCHAR(4)",
    ]
    usable_cols = [col for col in header if col not in skip_cols]
    if len(usable_cols) > max_cols:
        usable_cols = usable_cols[:max_cols]
    for col in usable_cols:
        sample = sample_row.get(col, "")
        is_longtext = bool(_FREETEXT_COL_RE.search(col))
        fields_types.append(f"`{col}` {infer_type(sample, prefer_text, is_longtext)}")
    try:
        sql = (
            f"CREATE TABLE IF NOT EXISTS `{table_name}` ("
            f"{', '.join(fields_types)}, "
            f"_hash_row CHAR(32) UNIQUE, "
            f"_imported_at DATETIME)"
        )
        cur.execute(sql)
        return []
    except pymysql.err.OperationalError as e:
        if e.args and e.args[0] == 1059:  # ER_TOO_LONG_IDENT: identifier > 64 chars
            print(f"[ERROR] Nome tabella troppo lungo: {table_name}")
            return []
        if e.args and e.args[0] == 1118:  # ER_TOO_BIG_ROWSIZE
            candidate_cols = [col for col in usable_cols if _FREETEXT_COL_RE.search(col)]
            if candidate_cols:
                col_to_skip = candidate_cols[0]
                reason = "Row size too large - skipped for import"
                msg = f"[ERROR] Colonna {col_to_skip} problematica nella tabella {table_name}, saltata per superare il limite di MySQL."
            elif len(usable_cols) > 1:
                # No free-text candidate: drop the last usable column.
                col_to_skip = usable_cols[-1]
                reason = "Row size too large - generic column skipped for import"
                msg = f"[ERROR] Colonna {col_to_skip} problematica nella tabella {table_name}, saltata (generico) per superare il limite di MySQL."
            else:
                print(f"[ERROR] Nessuna colonna importabile per la tabella {table_name}. Importazione saltata per limiti fisici MySQL.")
                return skip_cols
            log_colonna_problematica(table_name, col_to_skip, reason)
            print(msg)
            return create_table(
                cur,
                table_name,
                [col for col in header if col != col_to_skip],
                sample_row,
                prefer_text,
                skip_cols + [col_to_skip],
                depth + 1,
                max_cols,
            )
        raise
def ensure_table_structure_split(cur, table_name_base, header, sample_row, cartella_cond, anno, idfile):
    """Split *header* into chunks of at most 50 columns and create one MySQL
    table per chunk, suffixed _001, _002, ...

    cartella_cond / anno / idfile are kept for interface compatibility; the
    identifying data is already embedded in table_name_base.

    Returns a list of (table_name, chunk_columns, problematic_columns)
    tuples, one per created table.
    """
    max_cols = 50
    results = []
    for split_suff, start in enumerate(range(0, len(header), max_cols), start=1):
        chunk = header[start:start + max_cols]
        suffix = f"_{str(split_suff).zfill(3)}"
        # BUG FIX: the original truncated AFTER appending the suffix
        # (f"{base}_{nnn}"[:64]), so a base name >= 61 chars lost the suffix
        # and every split collided on the same table name. Truncate the base
        # instead so the suffix always survives within MySQL's 64-char limit.
        nome_tabella = table_name_base[:64 - len(suffix)] + suffix
        problematic_cols = create_table(
            cur, nome_tabella, chunk, sample_row,
            prefer_text=True, skip_cols=[], max_cols=max_cols,
        )
        results.append((nome_tabella, chunk, problematic_cols))
    return results
def calc_row_hash_md5(header, row, id_amministratore, codice_condominio, cartella_condominio, anno):
    """MD5 fingerprint of a row: context fields plus the stripped value of
    every header column, joined with '|'. Used as the dedup key (_hash_row)."""
    parts = [
        str(id_amministratore),
        str(codice_condominio),
        str(cartella_condominio),
        str(anno),
    ]
    parts.extend(str(row.get(col, "")).strip() for col in header)
    return hashlib.md5("|".join(parts).encode("utf-8")).hexdigest()
def get_existing_hashes(cur, table_name):
    """Return the set of _hash_row values already stored in *table_name*;
    empty set on any error (e.g. the table does not exist yet)."""
    try:
        cur.execute(f"SELECT _hash_row FROM `{table_name}`;")
        return {r[0] for r in cur.fetchall()}
    except Exception:
        return set()
def insert_row(cur, table_name, header, row, id_amministratore, codice_condominio, cartella_condominio, anno, existing_hashes):
    """Insert *row* into *table_name* unless its hash is already in
    *existing_hashes* (which is updated in place).

    Returns True when the row was inserted, False when it was a duplicate.
    """
    hash_row = calc_row_hash_md5(header, row, id_amministratore, codice_condominio, cartella_condominio, anno)
    if hash_row in existing_hashes:
        return False
    columns = ["`id_amministratore`", "`codice_condominio`", "`cartella_condominio`", "`anno`"]
    columns.extend(f"`{col}`" for col in header)
    columns.extend(["_hash_row", "_imported_at"])
    values = [id_amministratore, codice_condominio, cartella_condominio, anno]
    values.extend(row.get(col, None) for col in header)
    values.extend([hash_row, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")])
    placeholders = ", ".join(["%s"] * len(columns))
    cur.execute(f"INSERT INTO `{table_name}` ({', '.join(columns)}) VALUES ({placeholders})", values)
    existing_hashes.add(hash_row)
    return True
def get_mapping_from_stabili(csv_path):
    """Read Stabili.csv (the "Rosetta") and map each condominium folder name
    (nome_directory) to its fiscal codes, each normalized to exactly 16
    upper-case characters (padded with '_')."""
    def _norm16(value):
        return value.strip().upper().ljust(16, "_")[:16]

    mapping = {}
    with open(csv_path, encoding="utf-8", errors="ignore") as fh:
        for record in csv.DictReader(fh):
            cartella = record.get("nome_directory", "").strip()
            mapping[cartella] = {
                "codice_fiscale_cond": _norm16(record.get("codice_fisc", "")),
                "cf_amministratore": _norm16(record.get("cf_amministratore", "")),
                "cartella": cartella,
            }
    return mapping
def process_stabili_csv(stabili_csv):
    """Load Stabili.csv into the `rossetta_stabili` MySQL table (all TEXT
    columns, names sanitized via safe_name).

    NOTE(review): deduplication is only within this run — the hash set starts
    empty and is never persisted, so re-running the script re-inserts rows;
    kept as in the original.

    Fix vs original: the connection/cursor were leaked on the early return
    and on any exception; the file is now read before connecting and the
    connection is closed in a finally block.
    """
    with open(stabili_csv, encoding="utf-8", errors="ignore") as fh:
        rows = list(csv.DictReader(fh))
    if not rows:
        return
    conn = connect_mysql()
    try:
        cur = conn.cursor()
        columns = [f"`{safe_name(col)}` TEXT" for col in rows[0].keys()]
        cur.execute(
            f"CREATE TABLE IF NOT EXISTS `{ROSETTA_STABILI}` (id INT AUTO_INCREMENT PRIMARY KEY, {', '.join(columns)})"
        )
        seen_hashes = set()
        for row in rows:
            row_hash = hashlib.md5(
                "|".join(str(row.get(col, "")) for col in rows[0].keys()).encode("utf-8")
            ).hexdigest()
            if row_hash in seen_hashes:
                continue
            placeholders = ",".join(["%s"] * len(row))
            cur.execute(
                f"INSERT INTO `{ROSETTA_STABILI}` ({', '.join(f'`{safe_name(col)}`' for col in row.keys())}) VALUES ({placeholders})",
                tuple(row.values()),
            )
            seen_hashes.add(row_hash)
        cur.close()
    finally:
        conn.close()
def process_csv(csv_path, cur, stabili_mapping):
    """Import one extracted CSV file into (possibly split) MySQL tables.

    The import context (administrator CF, condominium CF, folder, year) is
    derived from the file's path — presumably
    .../<cf_amm>/<cf_cond>/<cartella>/<anno>/file.csv — and refined via the
    Stabili mapping. Only rows whose _hash_row is new are inserted.
    """
    parts = os.path.normpath(csv_path).split(os.sep)
    cf_amministratore = parts[-5] if len(parts) > 5 else "GESCON"
    codice_fiscale_cond = parts[-4] if len(parts) > 4 else "GESCON"
    cartella_cond = parts[-3] if len(parts) > 3 else "0000"
    anno = parts[-2] if len(parts) > 2 else "0000"
    nomefile_orig = os.path.splitext(os.path.basename(csv_path))[0]

    # The Stabili "Rosetta" overrides path-derived fiscal codes when known.
    dati_cond = stabili_mapping.get(cartella_cond, {})
    cf_amministratore = dati_cond.get("cf_amministratore") or cf_amministratore
    codice_fiscale_cond = dati_cond.get("codice_fiscale_cond") or codice_fiscale_cond

    idfile = get_struttura_file_id(cur, nomefile_orig)
    hash8 = hashlib.md5(nomefile_orig.encode("utf-8")).hexdigest()[:8]
    table_name_base = f"{cf_amministratore}_{codice_fiscale_cond}_{cartella_cond.zfill(4)}_{anno.zfill(4)}_{idfile}_{hash8}"
    log_event("import_table", csv=csv_path, table_mysql=table_name_base, cf_amministratore=cf_amministratore, codice_fiscale_condominio=codice_fiscale_cond, cartella_condominio=cartella_cond, anno=anno)
    log_mapping_tabella({
        "file_origine": csv_path,
        "table_mysql": table_name_base,
        "cf_amministratore": cf_amministratore,
        "codice_fiscale_condominio": codice_fiscale_cond,
        "cartella_condominio": cartella_cond,
        "anno": anno,
        "idfile": idfile,
        "hash": hash8,
    })

    with open(csv_path, encoding="utf-8", errors="ignore") as fh:
        rows = list(csv.DictReader(fh))
    log_event("csv_read", file=csv_path, rows=len(rows))
    if not rows:
        log_event("empty_table", table=table_name_base)
        return

    header_fixed = fix_header_conflicts(rows[0].keys())
    rows_fixed = [fix_row_conflicts(r) for r in rows]
    split_tables = ensure_table_structure_split(cur, table_name_base, header_fixed, rows_fixed[0], cartella_cond, anno, idfile)
    for nome_tabella, imported_cols, _problematic_cols in split_tables:
        subset = [{k: v for k, v in r.items() if k in imported_cols} for r in rows_fixed]
        try:
            existing_hashes = get_existing_hashes(cur, nome_tabella)
        except Exception:
            existing_hashes = set()
        imported = 0
        for r in subset:
            try:
                if insert_row(cur, nome_tabella, imported_cols, r, cf_amministratore, codice_fiscale_cond, cartella_cond, anno, existing_hashes):
                    imported += 1
            except Exception as row_e:
                log_event("row_error", table=nome_tabella, data=r, error=str(row_e))
        log_event("imported_rows", table=nome_tabella, rows=imported)
        print(f"Importate {imported} nuove righe in {nome_tabella}")
def process_mdb(mdb_path, cur, stabili_mapping):
    """Import every table of an MDB file into (possibly split) MySQL tables.

    Runs after the CSV pass: only rows whose _hash_row is not yet present
    are inserted, so the MDB acts as an incremental top-up of the CSV data.

    Fix vs original: in mdbtools, "-H" is an alias of "--no-header" — it
    SUPPRESSES the header — so the original's separate "header" call never
    returned the column names. We now export once WITH the header and take
    the first output line as the header row.
    """
    parts = os.path.normpath(mdb_path).split(os.sep)
    cf_amministratore = parts[-5] if len(parts) > 5 else "GESCON"
    codice_fiscale_cond = parts[-4] if len(parts) > 4 else "GESCON"
    cartella_cond = parts[-3] if len(parts) > 3 else "0000"
    anno = parts[-2] if len(parts) > 2 else "0000"
    nomefile_orig = os.path.splitext(os.path.basename(mdb_path))[0]
    dati_cond = stabili_mapping.get(cartella_cond, {})
    if dati_cond.get("cf_amministratore"):
        cf_amministratore = dati_cond["cf_amministratore"]
    if dati_cond.get("codice_fiscale_cond"):
        codice_fiscale_cond = dati_cond["codice_fiscale_cond"]
    idfile = get_struttura_file_id(cur, nomefile_orig)
    hash8 = hashlib.md5(nomefile_orig.encode("utf-8")).hexdigest()[:8]
    table_name_base = f"{cf_amministratore}_{codice_fiscale_cond}_{cartella_cond.zfill(4)}_{anno.zfill(4)}_{idfile}_{hash8}"
    log_event("import_table", mdb=mdb_path, table_mysql=table_name_base, cf_amministratore=cf_amministratore, codice_fiscale_condominio=codice_fiscale_cond, cartella_condominio=cartella_cond, anno=anno)
    log_mapping_tabella({
        "file_origine": mdb_path,
        "table_mysql": table_name_base,
        "cf_amministratore": cf_amministratore,
        "codice_fiscale_condominio": codice_fiscale_cond,
        "cartella_condominio": cartella_cond,
        "anno": anno,
        "idfile": idfile,
        "hash": hash8,
    })
    try:
        tables = subprocess.check_output(["mdb-tables", "-1", mdb_path]).decode().split()
    except Exception as e:
        log_event("error_mdb", file=mdb_path, error=str(e))
        print(f"[ERROR] Errore lettura tabelle: {e}")
        return
    for tab in tables:
        try:
            # One export WITH the header line included (no -H/--no-header).
            proc = subprocess.Popen(
                ["mdb-export", "-d", ",", mdb_path, tab],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
            )
            out, err = proc.communicate()
            if proc.returncode != 0:
                log_event("mdb_export_error", table=tab, error=err.decode())
                print(f"[DEBUG] mdb-export error on {tab}: {err.decode()}")
                continue
            lines = out.decode(errors="ignore").splitlines()
            if not lines or not lines[0].strip():
                log_event("empty_table", table=tab)
                print(f"[DEBUG] Tabella {tab} - Numero righe lette: 0")
                continue
            # Parse the header with csv (not a naive split) so quoted column
            # names containing commas survive.
            header = [h.strip() for h in next(csv.reader([lines[0]]))]
            header_fixed = fix_header_conflicts(header)
            # NOTE(review): fields with embedded newlines will still break
            # this line-based parse — limitation present in the original too.
            reader = csv.DictReader(lines[1:], fieldnames=header_fixed)
            rows = [fix_row_conflicts(row) for row in reader]
            log_event("mdb_read", table=tab, rows=len(rows))
            if not rows:
                log_event("empty_table", table=tab)
                continue
            split_tables = ensure_table_structure_split(cur, table_name_base, header_fixed, rows[0], cartella_cond, anno, idfile)
            for nome_tabella, imported_cols, problematic_cols in split_tables:
                rows_final = [
                    {k: v for k, v in row.items() if k in imported_cols}
                    for row in rows
                ]
                # Import ONLY genuinely new rows (hash-based dedup).
                try:
                    existing_hashes = get_existing_hashes(cur, nome_tabella)
                except Exception:
                    existing_hashes = set()
                imported = 0
                for row in rows_final:
                    try:
                        if insert_row(cur, nome_tabella, imported_cols, row, cf_amministratore, codice_fiscale_cond, cartella_cond, anno, existing_hashes):
                            imported += 1
                    except Exception as row_e:
                        log_event("row_error", table=nome_tabella, data=row, error=str(row_e))
                log_event("imported_rows", table=nome_tabella, rows=imported)
                print(f"Importate {imported} nuove righe in {nome_tabella}")
        except Exception as e:
            # Log the failing MDB table (the original logged table_name_base
            # here while the message printed `tab` — inconsistent).
            log_event("table_error", table=tab, error=str(e))
            print(f"[ERROR] Errore su tabella {tab}: {e}")
def main():
    """Run the full import pipeline: Stabili rosetta first, then every CSV
    under INPUT_ROOT, then every MDB (incremental, hash-deduplicated).

    Fix vs original: the MySQL connection was leaked if any import step
    raised; it is now closed in a finally block.
    """
    stabili_csv = os.path.join(INPUT_ROOT, "dbc", "Stabili", "Stabili.csv")
    process_stabili_csv(stabili_csv)
    stabili_mapping = get_mapping_from_stabili(stabili_csv)
    conn = connect_mysql()
    try:
        cur = conn.cursor()
        # Pass 1: all CSV extracts (everything except the Stabili rosetta).
        for root, _, files in os.walk(INPUT_ROOT):
            for fname in files:
                if fname.lower().endswith(".csv") and not fname.lower().startswith("stabili"):
                    process_csv(os.path.join(root, fname), cur, stabili_mapping)
        # Pass 2: MDB archives — only rows not already imported from CSV.
        for root, _, files in os.walk(INPUT_ROOT):
            for fname in files:
                if fname.lower().endswith(".mdb"):
                    process_mdb(os.path.join(root, fname), cur, stabili_mapping)
        cur.close()
    finally:
        conn.close()
    log_event("import_complete")


if __name__ == "__main__":
    main()
# COMANDO PULIZIA TABELLE:
# mysql -u <user> -p<password> <db> -e "SET GROUP_CONCAT_MAX_LEN=1000000;SELECT GROUP_CONCAT(CONCAT('DROP TABLE IF EXISTS \`', table_name, '\`;') SEPARATOR ' ') FROM information_schema.tables WHERE table_schema = '<db>' AND (table_name LIKE 'rossetta_stabili%' OR table_name LIKE 'struttura_File_GESCON%' OR table_name REGEXP '^[A-Z0-9_]{16}_[A-Z0-9_]{16}_[0-9]{4}_[0-9_]{4}_[0-9]{4}_[a-f0-9]{8}(_[0-9]{3})?$');" | tail -n1 | mysql -u <user> -p<password> <db>