Files
nmd-md5sum-manifest/stages/stage2-analyze-clusterwide-manifests.py
2026-03-24 09:34:37 +00:00

134 lines
4.6 KiB
Python
Executable File

import argparse
import glob
import json
import os
import sys
from collections import defaultdict
from datetime import datetime, timezone
def parse_node_manifest(filepath):
    """Parse a single node manifest file.

    Expected format:
        # host=<hostname>            (header line)
        <md5_hash> <absolute_path>   (one data line per file)

    Args:
        filepath: Path to the node manifest file.

    Returns:
        tuple: (host, file_data) where host is the hostname from the
        '# host=' header (falling back to the file's basename if the
        header is absent) and file_data is a dict {filepath: md5_hash}.
        Returns (None, None) if the file cannot be read.
    """
    host = None
    file_data = {}
    try:
        with open(filepath, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                # Parse headers (all '#' lines are skipped as data)
                if line.startswith('#'):
                    if line.startswith('# host='):
                        # Split once only so hostnames containing '=' are
                        # preserved intact (split('=')[1] truncated them
                        # at the first '=').
                        host = line.split('=', 1)[1].strip()
                    continue
                # Parse data (md5 absolute_path).
                # maxsplit=1 preserves spaces in filenames if they exist.
                parts = line.split(maxsplit=1)
                if len(parts) == 2:
                    md5, path = parts
                    file_data[path] = md5
    except Exception as e:
        print(f"Error reading file {filepath}: {e}", file=sys.stderr)
        return None, None
    if not host:
        print(f"Warning: No '# host=' header found in {filepath}. Using filename as ID.", file=sys.stderr)
        host = os.path.basename(filepath)
    return host, file_data
def generate_golden_manifest(input_dir, output_file):
    """Aggregate per-node manifests into a cluster-wide "golden" manifest.

    Reads every regular file in input_dir with parse_node_manifest,
    classifies each path as consistent (identical hash on every host)
    or varying (hash mismatch and/or missing on some hosts), and writes
    the classification as JSON to output_file.

    Args:
        input_dir: Directory containing node manifest files.
        output_file: Path to write the resulting JSON manifest.
    """
    manifest_files = glob.glob(os.path.join(input_dir, '*'))
    if not manifest_files:
        print(f"No files found in directory: {input_dir}")
        return

    # 1. Aggregation Phase
    # Structure: global_registry[filepath] = { hostname: hash }
    global_registry = defaultdict(dict)
    all_hosts = set()
    print(f"Scanning {len(manifest_files)} manifests...")
    for fpath in manifest_files:
        # Skip if it's a directory
        if os.path.isdir(fpath):
            continue
        host, data = parse_node_manifest(fpath)
        # data is None only on a read error. An empty dict is a valid
        # (empty) manifest: its host must still count towards presence
        # checks, otherwise files missing on that host would wrongly be
        # reported as consistent cluster-wide.
        if host is not None and data is not None:
            all_hosts.add(host)
            for path, md5 in data.items():
                global_registry[path][host] = md5

    # 2. Analysis Phase
    cluster_manifest = {
        "meta": {
            "total_hosts": len(all_hosts),
            "hosts": sorted(all_hosts),
            # Actual generation timestamp in UTC. (Previously this field
            # mistakenly stored the output file's basename.)
            "generated_at": datetime.now(timezone.utc).isoformat()
        },
        "consistent_files": {},  # Files identical on ALL hosts
        "varying_files": {}      # Files that differ or are missing on some hosts
    }
    print("Analyzing file consistency...")
    for path, host_map in global_registry.items():
        unique_hashes = set(host_map.values())
        present_on_hosts = set(host_map.keys())
        # Condition 1: Consistent
        # Present on ALL hosts AND has exactly 1 unique hash
        if present_on_hosts == all_hosts and len(unique_hashes) == 1:
            # Store just the hash, as it is the "Ground Truth"
            cluster_manifest["consistent_files"][path] = next(iter(unique_hashes))
        # Condition 2: Varying
        else:
            issue_type = []
            if len(unique_hashes) > 1:
                issue_type.append("hash_mismatch")
            if present_on_hosts != all_hosts:
                issue_type.append("presence_mismatch")
            # For varying files, record the specific hash per host so the
            # comparison tool knows what to expect where; "MISSING" marks
            # hosts that don't have the file at all.
            full_map = {h: host_map.get(h, "MISSING") for h in all_hosts}
            cluster_manifest["varying_files"][path] = {
                "issues": issue_type,
                "states": full_map
            }

    # 3. Output Phase
    try:
        with open(output_file, 'w') as f:
            json.dump(cluster_manifest, f, indent=2, sort_keys=True)
        print(f"Success! Cluster manifest written to: {output_file}")
        print(f"Stats: {len(cluster_manifest['consistent_files'])} consistent files, {len(cluster_manifest['varying_files'])} varying files.")
    except Exception as e:
        # Errors go to stderr, consistent with parse_node_manifest.
        print(f"Error writing output file: {e}", file=sys.stderr)
if __name__ == "__main__":
    # CLI entry point: stage-2 aggregation of per-node manifests.
    arg_parser = argparse.ArgumentParser(
        description="Generate a cluster-wide ground truth manifest."
    )
    arg_parser.add_argument(
        "input_dir", help="Directory containing node manifest files"
    )
    arg_parser.add_argument(
        "output_file", help="Path to write the resulting JSON manifest"
    )
    cli_args = arg_parser.parse_args()
    generate_golden_manifest(cli_args.input_dir, cli_args.output_file)