# netgescon-master/scripts/Script fatti per prova e per ora sospesi/import_ibrido_csv_mdb_mysql.py
# (file-listing residue from export: 471 lines, 22 KiB, Python)
#!/usr/bin/env python3
"""
IMPORTAZIONE IBRIDA CSV/MDB -> MYSQL PER ARCHIVI GESCON (v18 - 2025-06-02)
========================================================================================
## STORICO FUNZIONALE, PATCH E STRATEGIA DI LAVORO (aggiornato a v18)
----------------------------------------------------------------------------------------
- **v1-v7**: Importazione CSV/MDB, deduplica via hash, logging, patch su colonne 'id', 'anno', 'codice_condominio', 'id_amministratore'.
- **v8**: Gestione limiti row size/colonne MySQL, split tabella con suffisso (_002, ...), logging colonne_problematiche.json.
- **v9**: Forzatura campi "note", "memo" ecc. a TEXT/LONGTEXT, auto tipi colonne, patch naming tabella MySQL <=64 char.
- **v10**: Mapping tabella <-> origine in JSON, importazione incrementale, logging dettagliato.
- **v11**: Importazione dati PRIMA da CSV poi da MDB, aggiornamento dati esistenti (solo nuovi hash), processi distinti ma integrati.
- **v12-v15**:
- Nome tabella GESCON: <CF_AMMINISTRATORE>_<CF_CONDOMINIO>_<CARTELLA_COND>_<ANNO>_<IDFILE>_<HASH8> (progressivo se split).
- Per ogni file, ID numerico progressivo in struttura_File_GESCON (MySQL), mapping stabile per archivi multipli.
- ANNO dalla cartella padre del file.
- Import Stabili.csv (Rosetta) per mapping amministratore/condominio/cartella.
- Mapping tabella/colonne in struttura_File_GESCON (MySQL) e mapping_tabelle.json.
- Logging avanzato e gestione errori robusta.
- Importazione PRIMA da CSV, poi da MDB (solo nuovi record).
- Chiamate mdb-export corrette: "-H" solo per header, "--no-header -d ," per i dati.
- **v16-v18** (2025-06-02):
- **PATCH**: Uso corretto di mdb-export per header e dati (mai argomento 'csv').
- **PATCH**: Deduplica e inserimento solo dati nuovi da MDB dopo i CSV.
- **PATCH**: Logging dettagliato (file, colonne problematiche, mapping, errori).
- **PATCH**: Gestione robusta di split tabella, path, errori, tabelle vuote.
- **NOTE**: Tutte le patch e note storiche mantenute.
========================================================================================
## DIPENDENZE
- pymysql
- mdbtools (per MDB)
"""
import os
import pymysql
import subprocess
import csv
import json
import hashlib
import datetime
import re
# --- Paths, configuration and constants --------------------------------------
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
CONFIG_PATH = os.path.join(SCRIPT_DIR, "../agent_config.json")

# Configuration is shared with the other agent scripts via agent_config.json.
with open(CONFIG_PATH) as f:
    config = json.load(f)

MYSQL_HOST = config.get("MySQLHost", "localhost")
MYSQL_DB = config.get("MySQLDatabase", "netgescon")
MYSQL_USER = config.get("MySQLUser", "root")
MYSQL_PW = config.get("MySQLPassword", "password")
INPUT_ROOT = config.get("OutputDirectory", os.path.join(SCRIPT_DIR, "../estratti"))

# Log files (JSON-lines event log plus two cumulative JSON documents).
LOGDIR = os.path.join(SCRIPT_DIR, "../log")
os.makedirs(LOGDIR, exist_ok=True)
LOGFILE = os.path.join(LOGDIR, "import_ibrido_csv_mdb_mysql.jsonlog")
MAPPING_TABELLE_LOG = os.path.join(LOGDIR, "mapping_tabelle.json")
COLONNE_PROBLEMATICHE_LOG = os.path.join(LOGDIR, "colonne_problematiche.json")

# MySQL table names used by the importer itself.
STRUTTURA_FILE_GESCON = "struttura_File_GESCON"
ROSETTA_STABILI = "rossetta_stabili"
def log_event(event, **kwargs):
    """Append a structured record to the JSON-lines log and echo it to stdout."""
    record = {
        "event": event,
        "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        **kwargs,
    }
    print("[LOG]", record)
    with open(LOGFILE, "a") as fh:
        fh.write(json.dumps(record, ensure_ascii=False) + "\n")
def log_colonna_problematica(tablename, colname, reason):
    """Record a column that had to be skipped (e.g. MySQL row-size limit)
    in the cumulative colonne_problematiche.json document."""
    entry = {
        "table": tablename,
        "column": colname,
        "reason": reason,
        "timestamp": datetime.datetime.now().isoformat(),
    }
    # Best-effort read of the existing log; a missing or corrupt file
    # simply restarts the list.
    try:
        if os.path.exists(COLONNE_PROBLEMATICHE_LOG):
            with open(COLONNE_PROBLEMATICHE_LOG, "r") as fh:
                entries = json.load(fh)
        else:
            entries = []
    except Exception:
        entries = []
    entries.append(entry)
    with open(COLONNE_PROBLEMATICHE_LOG, "w") as fh:
        json.dump(entries, fh, indent=2, ensure_ascii=False)
def log_mapping_tabella(mapping):
    """Append one table<->source mapping record to mapping_tabelle.json."""
    # Best-effort read; a missing or corrupt file restarts the list.
    try:
        if os.path.exists(MAPPING_TABELLE_LOG):
            with open(MAPPING_TABELLE_LOG, "r") as fh:
                records = json.load(fh)
        else:
            records = []
    except Exception:
        records = []
    records.append(mapping)
    with open(MAPPING_TABELLE_LOG, "w") as fh:
        json.dump(records, fh, indent=2, ensure_ascii=False)
def connect_mysql():
    """Open an autocommitting utf8mb4 connection to the configured MySQL DB."""
    return pymysql.connect(
        host=MYSQL_HOST,
        user=MYSQL_USER,
        password=MYSQL_PW,
        database=MYSQL_DB,
        charset="utf8mb4",
        autocommit=True,
    )
def safe_name(s, max_len=None):
    """Lower-case *s*, replacing every non-alphanumeric character with '_';
    optionally truncate to *max_len* characters (a falsy max_len is ignored)."""
    cleaned = re.sub(r"[^a-zA-Z0-9_]", "_", str(s)).lower()
    return cleaned[:max_len] if max_len else cleaned
def get_struttura_file_id(cur, file_name):
    """Return the zero-padded (4-digit) numeric id of *file_name* in the
    struttura_File_GESCON registry, creating the registry table and/or the
    row on first use.

    cur: an open pymysql cursor (autocommit connection).
    """
    cur.execute(f"""CREATE TABLE IF NOT EXISTS `{STRUTTURA_FILE_GESCON}` (
        idfile INT AUTO_INCREMENT PRIMARY KEY,
        file_name VARCHAR(255) UNIQUE
    )""")
    cur.execute(f"SELECT idfile FROM `{STRUTTURA_FILE_GESCON}` WHERE file_name=%s", (file_name,))
    row = cur.fetchone()
    if row:
        return str(row[0]).zfill(4)
    cur.execute(f"INSERT INTO `{STRUTTURA_FILE_GESCON}` (file_name) VALUES (%s)", (file_name,))
    # lastrowid yields the AUTO_INCREMENT id of the row just inserted —
    # avoids the original's second SELECT, which was both an extra round
    # trip and a race with concurrent importers.
    return str(cur.lastrowid).zfill(4)
def infer_type(val, prefer_text=False, is_longtext=False):
    """Infer a MySQL column type from a single sample value.

    Precedence: is_longtext -> LONGTEXT, prefer_text -> TEXT, empty/None ->
    VARCHAR(255); then integer / float detection (commas accepted as decimal
    separator), TEXT for strings over 255 chars, VARCHAR(255) otherwise.
    """
    if is_longtext:
        return "LONGTEXT"
    if prefer_text:
        return "TEXT"
    if val is None or str(val).strip() == "":
        return "VARCHAR(255)"
    s = str(val)
    if re.fullmatch(r"-?\d+", s):
        n = int(s)
        # The original always returned INT, so values outside MySQL's signed
        # INT range failed on insert; promote to BIGINT (and fall back to
        # VARCHAR for values even BIGINT cannot hold).
        if -2147483648 <= n <= 2147483647:
            return "INT"
        if -9223372036854775808 <= n <= 9223372036854775807:
            return "BIGINT"
        return "VARCHAR(255)"
    try:
        float(s.replace(",", "."))
        return "FLOAT"
    except ValueError:
        pass
    if len(s) > 255:
        return "TEXT"
    return "VARCHAR(255)"
def fix_header_conflicts(header):
    """Rename source columns that would collide (case-insensitively) with the
    synthetic columns the importer adds to every table."""
    CONFLICT_COLS = {
        "id": "ID_CSV",
        "anno": "Anno_CSV",
        "codice_condominio": "Codice_Condominio_CSV",
        "id_amministratore": "ID_Amministratore_CSV",
    }
    return [CONFLICT_COLS.get(col.lower(), col) for col in header]
def fix_row_conflicts(row):
    """Apply the same case-insensitive renaming as fix_header_conflicts to a
    row dict's keys, leaving the values untouched."""
    CONFLICT_COLS = {
        "id": "ID_CSV",
        "anno": "Anno_CSV",
        "codice_condominio": "Codice_Condominio_CSV",
        "id_amministratore": "ID_Amministratore_CSV",
    }
    return {CONFLICT_COLS.get(k.lower(), k): v for k, v in row.items()}
# Columns whose names suggest free text: they are typed LONGTEXT and are the
# first candidates to drop when MySQL's row-size limit (error 1118) is hit.
# The original used two slightly different regex lists for these two purposes
# ("dettaglio" was missing from the LONGTEXT one); they are unified here.
_FREETEXT_COL_RE = re.compile(
    r"(note|memo|descr|testo|osserv|dettaglio|lettera|comment|mess|causale)",
    re.IGNORECASE,
)


def create_table(cur, table_name, header, sample_row, prefer_text=False, skip_cols=None, depth=0, max_cols=50):
    """Create MySQL table *table_name* for the given header and sample row.

    Every table also gets synthetic columns (id, id_amministratore,
    codice_condominio, cartella_condominio, anno) plus _hash_row (dedup key,
    UNIQUE) and _imported_at. At most *max_cols* source columns are used.

    On MySQL error 1118 (row size too large) one column is dropped —
    preferring free-text columns — the drop is logged via
    log_colonna_problematica, and the creation is retried recursively.

    Returns the list of columns skipped because of MySQL limits
    ([] when nothing had to be dropped).
    """
    if skip_cols is None:
        skip_cols = []
    fields_types = [
        "`id` INT AUTO_INCREMENT PRIMARY KEY",
        "`id_amministratore` VARCHAR(16)",
        "`codice_condominio` VARCHAR(16)",
        "`cartella_condominio` VARCHAR(4)",
        "`anno` VARCHAR(4)",
    ]
    usable_cols = [col for col in header if col not in skip_cols]
    if len(usable_cols) > max_cols:
        usable_cols = usable_cols[:max_cols]
    for col in usable_cols:
        sample = sample_row.get(col, "")
        is_longtext = bool(_FREETEXT_COL_RE.search(col))
        fields_types.append(f"`{col}` {infer_type(sample, prefer_text, is_longtext)}")
    try:
        sql = (
            f"CREATE TABLE IF NOT EXISTS `{table_name}` ("
            f"{', '.join(fields_types)}, "
            f"_hash_row CHAR(32) UNIQUE, "
            f"_imported_at DATETIME)"
        )
        cur.execute(sql)
        return []
    except pymysql.err.OperationalError as e:
        if e.args and e.args[0] == 1059:  # ER_TOO_LONG_IDENT: identifier > 64 chars
            print(f"[ERROR] Nome tabella troppo lungo: {table_name}")
            return []
        if e.args and e.args[0] == 1118:  # ER_TOO_BIG_ROWSIZE
            candidate_cols = [col for col in usable_cols if _FREETEXT_COL_RE.search(col)]
            if candidate_cols:
                col_to_skip = candidate_cols[0]
                reason = "Row size too large - skipped for import"
                msg = f"[ERROR] Colonna {col_to_skip} problematica nella tabella {table_name}, saltata per superare il limite di MySQL."
            elif len(usable_cols) > 1:
                # No free-text candidate: drop the last usable column.
                col_to_skip = usable_cols[-1]
                reason = "Row size too large - generic column skipped for import"
                msg = f"[ERROR] Colonna {col_to_skip} problematica nella tabella {table_name}, saltata (generico) per superare il limite di MySQL."
            else:
                print(f"[ERROR] Nessuna colonna importabile per la tabella {table_name}. Importazione saltata per limiti fisici MySQL.")
                return skip_cols
            log_colonna_problematica(table_name, col_to_skip, reason)
            print(msg)
            return create_table(
                cur,
                table_name,
                [col for col in header if col != col_to_skip],
                sample_row,
                prefer_text,
                skip_cols + [col_to_skip],
                depth + 1,
                max_cols,
            )
        raise
def ensure_table_structure_split(cur, table_name_base, header, sample_row, cartella_cond, anno, idfile):
    """Split *header* into chunks of at most 50 columns and create one MySQL
    table per chunk, suffixed _001, _002, ...

    cartella_cond / anno / idfile are kept for interface compatibility; the
    identifying data is already embedded in table_name_base.

    Returns a list of (table_name, chunk_columns, problematic_columns)
    tuples, one per created table.
    """
    max_cols = 50
    results = []
    for split_suff, start in enumerate(range(0, len(header), max_cols), start=1):
        chunk = header[start:start + max_cols]
        suffix = f"_{str(split_suff).zfill(3)}"
        # BUG FIX: the original truncated AFTER appending the suffix
        # (f"{base}_{nnn}"[:64]), so a base name >= 61 chars lost the suffix
        # and every split collided on the same table name. Truncate the base
        # instead so the suffix always survives within MySQL's 64-char limit.
        nome_tabella = table_name_base[:64 - len(suffix)] + suffix
        problematic_cols = create_table(
            cur, nome_tabella, chunk, sample_row,
            prefer_text=True, skip_cols=[], max_cols=max_cols,
        )
        results.append((nome_tabella, chunk, problematic_cols))
    return results
def calc_row_hash_md5(header, row, id_amministratore, codice_condominio, cartella_condominio, anno):
    """MD5 fingerprint of a row: context fields plus the stripped value of
    every header column, joined with '|'. Used as the dedup key (_hash_row)."""
    parts = [
        str(id_amministratore),
        str(codice_condominio),
        str(cartella_condominio),
        str(anno),
    ]
    parts.extend(str(row.get(col, "")).strip() for col in header)
    return hashlib.md5("|".join(parts).encode("utf-8")).hexdigest()
def get_existing_hashes(cur, table_name):
    """Return the set of _hash_row values already stored in *table_name*;
    empty set on any error (e.g. the table does not exist yet)."""
    try:
        cur.execute(f"SELECT _hash_row FROM `{table_name}`;")
        return {r[0] for r in cur.fetchall()}
    except Exception:
        return set()
def insert_row(cur, table_name, header, row, id_amministratore, codice_condominio, cartella_condominio, anno, existing_hashes):
    """Insert *row* into *table_name* unless its hash is already in
    *existing_hashes* (which is updated in place).

    Returns True when the row was inserted, False when it was a duplicate.
    """
    hash_row = calc_row_hash_md5(header, row, id_amministratore, codice_condominio, cartella_condominio, anno)
    if hash_row in existing_hashes:
        return False
    columns = ["`id_amministratore`", "`codice_condominio`", "`cartella_condominio`", "`anno`"]
    columns.extend(f"`{col}`" for col in header)
    columns.extend(["_hash_row", "_imported_at"])
    values = [id_amministratore, codice_condominio, cartella_condominio, anno]
    values.extend(row.get(col, None) for col in header)
    values.extend([hash_row, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")])
    placeholders = ", ".join(["%s"] * len(columns))
    cur.execute(f"INSERT INTO `{table_name}` ({', '.join(columns)}) VALUES ({placeholders})", values)
    existing_hashes.add(hash_row)
    return True
def get_mapping_from_stabili(csv_path):
    """Read Stabili.csv (the "Rosetta") and map each condominium folder name
    (nome_directory) to its fiscal codes, each normalized to exactly 16
    upper-case characters (padded with '_')."""
    def _norm16(value):
        return value.strip().upper().ljust(16, "_")[:16]

    mapping = {}
    with open(csv_path, encoding="utf-8", errors="ignore") as fh:
        for record in csv.DictReader(fh):
            cartella = record.get("nome_directory", "").strip()
            mapping[cartella] = {
                "codice_fiscale_cond": _norm16(record.get("codice_fisc", "")),
                "cf_amministratore": _norm16(record.get("cf_amministratore", "")),
                "cartella": cartella,
            }
    return mapping
def process_stabili_csv(stabili_csv):
    """Load Stabili.csv into the `rossetta_stabili` MySQL table (all TEXT
    columns, names sanitized via safe_name).

    NOTE(review): deduplication is only within this run — the hash set starts
    empty and is never persisted, so re-running the script re-inserts rows;
    kept as in the original.

    Fix vs original: the connection/cursor were leaked on the early return
    and on any exception; the file is now read before connecting and the
    connection is closed in a finally block.
    """
    with open(stabili_csv, encoding="utf-8", errors="ignore") as fh:
        rows = list(csv.DictReader(fh))
    if not rows:
        return
    conn = connect_mysql()
    try:
        cur = conn.cursor()
        columns = [f"`{safe_name(col)}` TEXT" for col in rows[0].keys()]
        cur.execute(
            f"CREATE TABLE IF NOT EXISTS `{ROSETTA_STABILI}` (id INT AUTO_INCREMENT PRIMARY KEY, {', '.join(columns)})"
        )
        seen_hashes = set()
        for row in rows:
            row_hash = hashlib.md5(
                "|".join(str(row.get(col, "")) for col in rows[0].keys()).encode("utf-8")
            ).hexdigest()
            if row_hash in seen_hashes:
                continue
            placeholders = ",".join(["%s"] * len(row))
            cur.execute(
                f"INSERT INTO `{ROSETTA_STABILI}` ({', '.join(f'`{safe_name(col)}`' for col in row.keys())}) VALUES ({placeholders})",
                tuple(row.values()),
            )
            seen_hashes.add(row_hash)
        cur.close()
    finally:
        conn.close()
def process_csv(csv_path, cur, stabili_mapping):
    """Import one extracted CSV file into (possibly split) MySQL tables.

    The import context (administrator CF, condominium CF, folder, year) is
    derived from the file's path — presumably
    .../<cf_amm>/<cf_cond>/<cartella>/<anno>/file.csv — and refined via the
    Stabili mapping. Only rows whose _hash_row is new are inserted.
    """
    parts = os.path.normpath(csv_path).split(os.sep)
    cf_amministratore = parts[-5] if len(parts) > 5 else "GESCON"
    codice_fiscale_cond = parts[-4] if len(parts) > 4 else "GESCON"
    cartella_cond = parts[-3] if len(parts) > 3 else "0000"
    anno = parts[-2] if len(parts) > 2 else "0000"
    nomefile_orig = os.path.splitext(os.path.basename(csv_path))[0]

    # The Stabili "Rosetta" overrides path-derived fiscal codes when known.
    dati_cond = stabili_mapping.get(cartella_cond, {})
    cf_amministratore = dati_cond.get("cf_amministratore") or cf_amministratore
    codice_fiscale_cond = dati_cond.get("codice_fiscale_cond") or codice_fiscale_cond

    idfile = get_struttura_file_id(cur, nomefile_orig)
    hash8 = hashlib.md5(nomefile_orig.encode("utf-8")).hexdigest()[:8]
    table_name_base = f"{cf_amministratore}_{codice_fiscale_cond}_{cartella_cond.zfill(4)}_{anno.zfill(4)}_{idfile}_{hash8}"
    log_event("import_table", csv=csv_path, table_mysql=table_name_base, cf_amministratore=cf_amministratore, codice_fiscale_condominio=codice_fiscale_cond, cartella_condominio=cartella_cond, anno=anno)
    log_mapping_tabella({
        "file_origine": csv_path,
        "table_mysql": table_name_base,
        "cf_amministratore": cf_amministratore,
        "codice_fiscale_condominio": codice_fiscale_cond,
        "cartella_condominio": cartella_cond,
        "anno": anno,
        "idfile": idfile,
        "hash": hash8,
    })

    with open(csv_path, encoding="utf-8", errors="ignore") as fh:
        rows = list(csv.DictReader(fh))
    log_event("csv_read", file=csv_path, rows=len(rows))
    if not rows:
        log_event("empty_table", table=table_name_base)
        return

    header_fixed = fix_header_conflicts(rows[0].keys())
    rows_fixed = [fix_row_conflicts(r) for r in rows]
    split_tables = ensure_table_structure_split(cur, table_name_base, header_fixed, rows_fixed[0], cartella_cond, anno, idfile)
    for nome_tabella, imported_cols, _problematic_cols in split_tables:
        subset = [{k: v for k, v in r.items() if k in imported_cols} for r in rows_fixed]
        try:
            existing_hashes = get_existing_hashes(cur, nome_tabella)
        except Exception:
            existing_hashes = set()
        imported = 0
        for r in subset:
            try:
                if insert_row(cur, nome_tabella, imported_cols, r, cf_amministratore, codice_fiscale_cond, cartella_cond, anno, existing_hashes):
                    imported += 1
            except Exception as row_e:
                log_event("row_error", table=nome_tabella, data=r, error=str(row_e))
        log_event("imported_rows", table=nome_tabella, rows=imported)
        print(f"Importate {imported} nuove righe in {nome_tabella}")
def process_mdb(mdb_path, cur, stabili_mapping):
    """Import every table of an MDB file into (possibly split) MySQL tables.

    Runs after the CSV pass: only rows whose _hash_row is not yet present
    are inserted, so the MDB acts as an incremental top-up of the CSV data.

    Fix vs original: in mdbtools, "-H" is an alias of "--no-header" — it
    SUPPRESSES the header — so the original's separate "header" call never
    returned the column names. We now export once WITH the header and take
    the first output line as the header row.
    """
    parts = os.path.normpath(mdb_path).split(os.sep)
    cf_amministratore = parts[-5] if len(parts) > 5 else "GESCON"
    codice_fiscale_cond = parts[-4] if len(parts) > 4 else "GESCON"
    cartella_cond = parts[-3] if len(parts) > 3 else "0000"
    anno = parts[-2] if len(parts) > 2 else "0000"
    nomefile_orig = os.path.splitext(os.path.basename(mdb_path))[0]
    dati_cond = stabili_mapping.get(cartella_cond, {})
    if dati_cond.get("cf_amministratore"):
        cf_amministratore = dati_cond["cf_amministratore"]
    if dati_cond.get("codice_fiscale_cond"):
        codice_fiscale_cond = dati_cond["codice_fiscale_cond"]
    idfile = get_struttura_file_id(cur, nomefile_orig)
    hash8 = hashlib.md5(nomefile_orig.encode("utf-8")).hexdigest()[:8]
    table_name_base = f"{cf_amministratore}_{codice_fiscale_cond}_{cartella_cond.zfill(4)}_{anno.zfill(4)}_{idfile}_{hash8}"
    log_event("import_table", mdb=mdb_path, table_mysql=table_name_base, cf_amministratore=cf_amministratore, codice_fiscale_condominio=codice_fiscale_cond, cartella_condominio=cartella_cond, anno=anno)
    log_mapping_tabella({
        "file_origine": mdb_path,
        "table_mysql": table_name_base,
        "cf_amministratore": cf_amministratore,
        "codice_fiscale_condominio": codice_fiscale_cond,
        "cartella_condominio": cartella_cond,
        "anno": anno,
        "idfile": idfile,
        "hash": hash8,
    })
    try:
        tables = subprocess.check_output(["mdb-tables", "-1", mdb_path]).decode().split()
    except Exception as e:
        log_event("error_mdb", file=mdb_path, error=str(e))
        print(f"[ERROR] Errore lettura tabelle: {e}")
        return
    for tab in tables:
        try:
            # One export WITH the header line included (no -H/--no-header).
            proc = subprocess.Popen(
                ["mdb-export", "-d", ",", mdb_path, tab],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
            )
            out, err = proc.communicate()
            if proc.returncode != 0:
                log_event("mdb_export_error", table=tab, error=err.decode())
                print(f"[DEBUG] mdb-export error on {tab}: {err.decode()}")
                continue
            lines = out.decode(errors="ignore").splitlines()
            if not lines or not lines[0].strip():
                log_event("empty_table", table=tab)
                print(f"[DEBUG] Tabella {tab} - Numero righe lette: 0")
                continue
            # Parse the header with csv (not a naive split) so quoted column
            # names containing commas survive.
            header = [h.strip() for h in next(csv.reader([lines[0]]))]
            header_fixed = fix_header_conflicts(header)
            # NOTE(review): fields with embedded newlines will still break
            # this line-based parse — limitation present in the original too.
            reader = csv.DictReader(lines[1:], fieldnames=header_fixed)
            rows = [fix_row_conflicts(row) for row in reader]
            log_event("mdb_read", table=tab, rows=len(rows))
            if not rows:
                log_event("empty_table", table=tab)
                continue
            split_tables = ensure_table_structure_split(cur, table_name_base, header_fixed, rows[0], cartella_cond, anno, idfile)
            for nome_tabella, imported_cols, problematic_cols in split_tables:
                rows_final = [
                    {k: v for k, v in row.items() if k in imported_cols}
                    for row in rows
                ]
                # Import ONLY genuinely new rows (hash-based dedup).
                try:
                    existing_hashes = get_existing_hashes(cur, nome_tabella)
                except Exception:
                    existing_hashes = set()
                imported = 0
                for row in rows_final:
                    try:
                        if insert_row(cur, nome_tabella, imported_cols, row, cf_amministratore, codice_fiscale_cond, cartella_cond, anno, existing_hashes):
                            imported += 1
                    except Exception as row_e:
                        log_event("row_error", table=nome_tabella, data=row, error=str(row_e))
                log_event("imported_rows", table=nome_tabella, rows=imported)
                print(f"Importate {imported} nuove righe in {nome_tabella}")
        except Exception as e:
            # Log the failing MDB table (the original logged table_name_base
            # here while the message printed `tab` — inconsistent).
            log_event("table_error", table=tab, error=str(e))
            print(f"[ERROR] Errore su tabella {tab}: {e}")
def main():
    """Run the full import pipeline: Stabili rosetta first, then every CSV
    under INPUT_ROOT, then every MDB (incremental, hash-deduplicated).

    Fix vs original: the MySQL connection was leaked if any import step
    raised; it is now closed in a finally block.
    """
    stabili_csv = os.path.join(INPUT_ROOT, "dbc", "Stabili", "Stabili.csv")
    process_stabili_csv(stabili_csv)
    stabili_mapping = get_mapping_from_stabili(stabili_csv)
    conn = connect_mysql()
    try:
        cur = conn.cursor()
        # Pass 1: all CSV extracts (everything except the Stabili rosetta).
        for root, _, files in os.walk(INPUT_ROOT):
            for fname in files:
                if fname.lower().endswith(".csv") and not fname.lower().startswith("stabili"):
                    process_csv(os.path.join(root, fname), cur, stabili_mapping)
        # Pass 2: MDB archives — only rows not already imported from CSV.
        for root, _, files in os.walk(INPUT_ROOT):
            for fname in files:
                if fname.lower().endswith(".mdb"):
                    process_mdb(os.path.join(root, fname), cur, stabili_mapping)
        cur.close()
    finally:
        conn.close()
    log_event("import_complete")


if __name__ == "__main__":
    main()
# COMANDO PULIZIA TABELLE:
# mysql -u <user> -p<password> <db> -e "SET GROUP_CONCAT_MAX_LEN=1000000;SELECT GROUP_CONCAT(CONCAT('DROP TABLE IF EXISTS \`', table_name, '\`;') SEPARATOR ' ') FROM information_schema.tables WHERE table_schema = '<db>' AND (table_name LIKE 'rossetta_stabili%' OR table_name LIKE 'struttura_File_GESCON%' OR table_name REGEXP '^[A-Z0-9_]{16}_[A-Z0-9_]{16}_[0-9]{4}_[0-9_]{4}_[0-9]{4}_[a-f0-9]{8}(_[0-9]{3})?$');" | tail -n1 | mysql -u <user> -p<password> <db>