# Source: netgescon-master/scripts/Script fatti per prova e per ora sospesi/step1_estrai_csv.py
#
# Original listing metadata: 35 lines, 1.2 KiB, Python

import os
import pandas as pd
import pickle
# Root folder that is walked recursively looking for CSV files. Being a
# relative path, it is resolved against the current working directory.
BASE_PATH: str = "../estratti"

# Folder that receives the pickled tables (created on demand).
OUT_PATH: str = "estratti_serializzati"
def safe_columns(df):
    """Rename any column called "id" (case-insensitive) to "id_csv".

    The rename is applied in place on ``df.columns`` and the same
    DataFrame is returned so the call can be chained.

    Args:
        df: DataFrame whose column labels are inspected.

    Returns:
        The same DataFrame object, with the matching labels replaced.

    Note:
        If several columns match (for example both "id" and "ID"), all of
        them receive the label "id_csv".
    """
    # Labels are not guaranteed to be strings (e.g. integer labels when a
    # frame has no header row), so only call .lower() on real strings.
    df.columns = [
        "id_csv" if isinstance(col, str) and col.lower() == "id" else col
        for col in df.columns
    ]
    return df
def scan_and_save():
    """Pickle every CSV found under BASE_PATH into OUT_PATH.

    Each CSV is read with every value kept as a string (no NaN conversion),
    passed through ``safe_columns`` and written to ``<OUT_PATH>/<key>.pkl``,
    where ``key`` is the file path relative to BASE_PATH with the path
    separators replaced by "__". All tables are additionally pickled
    together, as a dict keyed by ``key``, into ``<OUT_PATH>/all_tables.pkl``.

    A file that raises while being read or saved is reported on stdout and
    the run continues with the next file.

    Returns:
        dict mapping each key to its DataFrame; empty when BASE_PATH does
        not exist (in which case nothing is written).
    """
    # BASE_PATH is relative to the working directory: when it cannot be
    # found, stop before touching OUT_PATH so that a previously written
    # all_tables.pkl is not overwritten with an empty dict.
    if not os.path.isdir(BASE_PATH):
        print(f"[ERRORE] Cartella non trovata: {BASE_PATH}")
        return {}

    all_tables = {}
    os.makedirs(OUT_PATH, exist_ok=True)
    for root, dirs, files in os.walk(BASE_PATH):
        # Sorting in place fixes the traversal order, which otherwise
        # follows whatever order the filesystem lists the entries in.
        dirs.sort()
        for filename in sorted(files):
            if not filename.lower().endswith(".csv"):
                continue
            full_path = os.path.join(root, filename)
            key = os.path.relpath(full_path, BASE_PATH).replace(os.sep, "__")
            # Deliberately broad: this is a best-effort batch, so any failure
            # on one file is reported and the loop moves on.
            try:
                df = pd.read_csv(full_path, dtype=str, keep_default_na=False)
                df = safe_columns(df)
                all_tables[key] = df
                df.to_pickle(os.path.join(OUT_PATH, f"{key}.pkl"))
                print(f"[OK] Letto e salvato: {key}")
            except Exception as e:
                print(f"[ERRORE] {key}: {e}")

    # Optional: also store everything in a single file.
    with open(os.path.join(OUT_PATH, "all_tables.pkl"), "wb") as out_file:
        pickle.dump(all_tables, out_file)
    print("Tutto serializzato!")
    return all_tables
# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    scan_and_save()