import argparse
import glob
import json
import os
import sys
from collections import defaultdict
from datetime import datetime, timezone


def parse_node_manifest(filepath):
    """Parse a single node manifest file.

    Expected format: optional '#' header lines (notably '# host=<hostname>')
    followed by data lines of the form '<md5> <absolute_path>'.

    Args:
        filepath: Path to the manifest file.

    Returns:
        tuple: (host, file_data) where host is the hostname from the header
        (falling back to the file's basename if no header is present) and
        file_data maps { filepath: md5_hash }. Returns (None, None) if the
        file could not be read.
    """
    host = None
    file_data = {}
    try:
        with open(filepath, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                # Parse headers
                if line.startswith('#'):
                    if line.startswith('# host='):
                        # maxsplit=1 so a hostname containing '=' is kept intact
                        host = line.split('=', 1)[1].strip()
                    continue
                # Parse data (md5 absolute_path).
                # maxsplit=1 preserves spaces in filenames if they exist.
                parts = line.split(maxsplit=1)
                if len(parts) == 2:
                    md5, path = parts
                    file_data[path] = md5
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error reading file {filepath}: {e}", file=sys.stderr)
        return None, None
    if not host:
        print(f"Warning: No '# host=' header found in {filepath}. Using filename as ID.",
              file=sys.stderr)
        host = os.path.basename(filepath)
    return host, file_data


def generate_golden_manifest(input_dir, output_file):
    """Aggregate per-node manifests into a cluster-wide 'golden' JSON manifest.

    Reads every regular file in input_dir as a node manifest, classifies each
    file path as either consistent (present on ALL hosts with one identical
    hash) or varying (hash mismatch and/or missing on some hosts), and writes
    the result as JSON to output_file.

    Args:
        input_dir: Directory containing node manifest files.
        output_file: Path of the JSON manifest to write.
    """
    manifest_files = glob.glob(os.path.join(input_dir, '*'))
    if not manifest_files:
        print(f"No files found in directory: {input_dir}")
        return

    # 1. Aggregation Phase
    # Structure: global_registry[filepath] = { hostname: hash }
    global_registry = defaultdict(dict)
    all_hosts = set()

    print(f"Scanning {len(manifest_files)} manifests...")
    for fpath in manifest_files:
        # Skip if it's a directory
        if os.path.isdir(fpath):
            continue
        host, data = parse_node_manifest(fpath)
        # A host with an EMPTY manifest still exists in the cluster and must
        # count toward presence analysis; only skip on parse failure (None).
        if host is not None and data is not None:
            all_hosts.add(host)
            for path, md5 in data.items():
                global_registry[path][host] = md5

    # 2. Analysis Phase
    cluster_manifest = {
        "meta": {
            "total_hosts": len(all_hosts),
            "hosts": sorted(all_hosts),
            # Real UTC timestamp (the previous value was the output filename,
            # which is not a generation time).
            "generated_at": datetime.now(timezone.utc).isoformat()
        },
        "consistent_files": {},  # Files identical on ALL hosts
        "varying_files": {}      # Files that differ or are missing on some hosts
    }

    print("Analyzing file consistency...")
    for path, host_map in global_registry.items():
        unique_hashes = set(host_map.values())
        present_on_hosts = set(host_map.keys())

        # Condition 1: Consistent
        # Present on ALL hosts AND has exactly 1 unique hash
        if present_on_hosts == all_hosts and len(unique_hashes) == 1:
            # Store just the hash, as it is the "Ground Truth"
            cluster_manifest["consistent_files"][path] = next(iter(unique_hashes))
        # Condition 2: Varying
        else:
            issue_type = []
            if len(unique_hashes) > 1:
                issue_type.append("hash_mismatch")
            if present_on_hosts != all_hosts:
                issue_type.append("presence_mismatch")

            # For varying files, record the specific state per host so the
            # comparison tool knows what to expect where; hosts without the
            # file are marked "MISSING".
            full_map = {h: host_map.get(h, "MISSING") for h in all_hosts}

            cluster_manifest["varying_files"][path] = {
                "issues": issue_type,
                "states": full_map
            }

    # 3. Output Phase
    try:
        with open(output_file, 'w') as f:
            json.dump(cluster_manifest, f, indent=2, sort_keys=True)
        print(f"Success! Cluster manifest written to: {output_file}")
        print(f"Stats: {len(cluster_manifest['consistent_files'])} consistent files, "
              f"{len(cluster_manifest['varying_files'])} varying files.")
    except OSError as e:
        print(f"Error writing output file: {e}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Generate a cluster-wide ground truth manifest.")
    parser.add_argument("input_dir", help="Directory containing node manifest files")
    parser.add_argument("output_file", help="Path to write the resulting JSON manifest")
    args = parser.parse_args()
    generate_golden_manifest(args.input_dir, args.output_file)