# Recovered from a git patch: commit 3af796f9c42487b4d8e11724603f1a3479cb84c0
# by MrRaph_ (2025-09-04), "feat: Add main analysis script and Excel metadata
# reader".
#
# NOTE(review): this is only the tail of the script (presumably
# read_excel_metadata.py).  The import section and the definitions of
# read_excel_file() and analyze_excel_structure() come earlier in the file and
# are not part of this section.  The code below expects `json`, `pandas as pd`,
# `datetime.datetime` and `pathlib.Path` to be in scope -- confirm against the
# complete file.


def _clean_cell(value):
    """Normalise one spreadsheet cell.

    Strings are stripped of surrounding whitespace, missing scalar values
    (NaN, NaT, None, pd.NA) become ``None``, anything else is returned as is.
    """
    if isinstance(value, str):
        return value.strip()
    # pd.isna() returns an array for list-like input, which cannot be used as
    # a condition; only scalars can be "missing" here.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    return value


def extract_letter_info(df):
    """Extract one cleaned dictionary per letter (one per DataFrame row).

    Each dictionary maps every column name to the cleaned cell value of that
    row (missing values become ``None``, strings are stripped) and stores the
    row's index label under the key ``'index'``.

    Args:
        df (pandas.DataFrame | None): DataFrame holding the letter metadata.

    Returns:
        list: One dictionary per row; an empty list when ``df`` is ``None``.
    """
    if df is None:
        return []

    print("\n📮 EXTRACTION DES INFORMATIONS DES LETTRES")
    print("=" * 50)

    columns = list(df.columns)
    letters_info = []

    # itertuples() keeps each column's own dtype.  iterrows() would first cast
    # the whole row to a single dtype, turning integers into floats whenever
    # the frame is purely numeric.
    rows = df.itertuples(index=False, name=None)
    for index, values in zip(df.index, rows):
        letter_info = {
            col: _clean_cell(value) for col, value in zip(columns, values)
        }
        letter_info['index'] = index
        letters_info.append(letter_info)

    print(f"✓ {len(letters_info)} lettres extraites")
    return letters_info


def save_to_json(data, output_path):
    """Save the extracted data as a UTF-8, human-readable JSON file.

    Values that are not JSON-native are written through ``str()``.  Failures
    are reported on stdout instead of being raised, so a failed save never
    aborts the calling script.

    Args:
        data (list): Data to save.
        output_path (str | os.PathLike): Destination of the JSON file.
    """
    try:
        # Serialise before opening the file: opening in 'w' mode truncates
        # it, so a serialisation error must not happen after that point or a
        # previously valid file would be lost.
        payload = json.dumps(data, ensure_ascii=False, indent=2, default=str)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(payload)
    except (OSError, TypeError, ValueError, RecursionError) as e:
        print(f"❌ Erreur lors de la sauvegarde: {e}")
    else:
        print(f"✓ Données sauvegardées dans: {output_path}")


def _find_letter_date(letter):
    """Return the first date-like value of *letter* as text, or ``None``.

    A ``datetime`` value is formatted as ``YYYY-MM-DD``.  A string qualifies
    when it has three dash-separated parts whose first part is a four-digit
    year; its first ten characters are returned.
    """
    for value in letter.values():
        if isinstance(value, datetime):
            return value.strftime("%Y-%m-%d")
        if isinstance(value, str) and len(value) >= 8:
            parts = value.split('-')
            # Requiring a numeric year keeps hyphenated names and places
            # (e.g. "Metz-en-Couture") from being mistaken for a date.
            if len(parts) == 3 and len(parts[0]) == 4 and parts[0].isdigit():
                return value[:10]
    return None


def check_files_existence(letters_info, base_path):
    """Check which letters have a transcription file and scanned images.

    For every letter the first date-like value is looked up.  When one is
    found, ``<base_path>/transcriptions/<date>.md`` and the files matching
    ``<base_path>/lettres_scannees/<date> *.jpg`` are checked.  Each letter
    dictionary is updated in place with the keys ``date_parsed``,
    ``has_transcription``, ``has_images`` and ``image_count``, and one status
    line per letter is printed.

    Args:
        letters_info (list): Letter dictionaries, modified in place.
        base_path (str | os.PathLike): Base directory of the project.
    """
    print("\n🔍 VÉRIFICATION DE L'EXISTENCE DES FICHIERS")
    print("=" * 50)

    transcriptions_path = Path(base_path) / "transcriptions"
    images_path = Path(base_path) / "lettres_scannees"

    for letter in letters_info:
        date_found = _find_letter_date(letter)

        if date_found is None:
            letter['date_parsed'] = None
            letter['has_transcription'] = False
            letter['has_images'] = False
            letter['image_count'] = 0
            print(f"Ligne {letter.get('index')}: ❌ Date non identifiée")
            continue

        has_transcription = (transcriptions_path / f"{date_found}.md").exists()
        image_files = list(images_path.glob(f"{date_found} *.jpg"))
        has_images = len(image_files) > 0

        letter['date_parsed'] = date_found
        letter['has_transcription'] = has_transcription
        letter['has_images'] = has_images
        letter['image_count'] = len(image_files)

        status = "📝" if has_transcription else "❌"
        status += " 🖼️" if has_images else " ❌"
        print(f"{date_found}: {status} (images: {len(image_files)})")


def main():
    """Run the full metadata pipeline for the Jules Berton letters.

    Reads the Excel workbook located next to this script, prints its
    structure, extracts one record per letter, checks the matching
    transcription and image files, writes ``lettres_metadata.json`` next to
    the script and prints summary counts.  Returns early when the workbook is
    missing or cannot be read.
    """
    excel_file = "Jules Berton - lettres, dates et lieux.xlsx"
    base_path = Path(__file__).parent
    excel_path = base_path / excel_file

    print("🔍 LECTURE DES MÉTADONNÉES DES LETTRES DE JULES BERTON")
    print("=" * 60)
    print(f"📁 Répertoire de travail: {base_path}")
    print(f"📊 Fichier Excel: {excel_file}")

    # Nothing to do without the workbook.
    if not excel_path.exists():
        print(f"❌ Le fichier {excel_file} n'existe pas dans le répertoire courant.")
        print(f" Chemin recherché: {excel_path}")
        return

    df = read_excel_file(excel_path)
    if df is None:
        return

    analyze_excel_structure(df)
    letters_info = extract_letter_info(df)
    check_files_existence(letters_info, base_path)
    save_to_json(letters_info, base_path / "lettres_metadata.json")

    # Final counts, each computed once.
    total_letters = len(letters_info)
    with_transcription = sum(
        1 for letter in letters_info if letter.get('has_transcription', False)
    )
    with_images = sum(
        1 for letter in letters_info if letter.get('has_images', False)
    )
    complete = sum(
        1
        for letter in letters_info
        if letter.get('has_transcription', False)
        and letter.get('has_images', False)
    )

    print("\n📊 STATISTIQUES FINALES")
    print("=" * 30)
    print(f"Total des lettres dans Excel: {total_letters}")
    print(f"Lettres avec transcription: {with_transcription}")
    print(f"Lettres avec images: {with_images}")
    print(f"Lettres complètes (texte + images): {complete}")


if __name__ == "__main__":
    main()