# 134 lines, 4.6 KiB, Python, executable file
import os
|
|
import glob
|
|
import json
|
|
import argparse
|
|
import sys
|
|
from collections import defaultdict
|
|
|
|
def parse_node_manifest(filepath):
    """
    Parse a single node manifest file.

    Blank lines are skipped. Lines starting with '#' are headers; a
    '# host=<name>' header sets the hostname (if several are present, the
    last one wins). Every other line is split on its first run of whitespace
    into '<md5> <path>', so spaces inside the path are preserved. Lines that
    do not yield two fields are ignored.

    Args:
        filepath: Path of the manifest file to read.

    Returns:
        A (host, file_data) tuple:
            host (str): The hostname found in the header, or the manifest's
                base filename when no non-empty host header is present.
            file_data (dict): Dictionary { filepath: md5_hash }.
        Returns (None, None) if the file cannot be read or decoded.
    """
    host = None
    file_data = {}

    try:
        with open(filepath, 'r') as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    continue

                # Parse headers
                if line.startswith('#'):
                    if line.startswith('# host='):
                        # Keep everything after the FIRST '=' so a value that
                        # itself contains '=' is not truncated.
                        host = line.partition('=')[2].strip()
                    continue

                # Parse data (md5 absolute_path); maxsplit=1 keeps any
                # whitespace inside the path intact.
                parts = line.split(maxsplit=1)
                if len(parts) == 2:
                    md5, path = parts
                    file_data[path] = md5

    except (OSError, ValueError) as e:
        # OSError covers open/read failures; ValueError covers decoding
        # errors (UnicodeDecodeError is a ValueError subclass).
        print(f"Error reading file {filepath}: {e}", file=sys.stderr)
        return None, None

    if not host:
        print(f"Warning: No '# host=' header found in {filepath}. Using filename as ID.", file=sys.stderr)
        host = os.path.basename(filepath)

    return host, file_data
|
|
|
|
def generate_golden_manifest(input_dir, output_file):
    """
    Build a cluster-wide manifest from every node manifest in a directory.

    Each non-directory entry matched by '<input_dir>/*' is parsed with
    parse_node_manifest. Files are then classified as:

      * consistent_files: present on ALL hosts with a single identical hash;
        stored as { path: hash }.
      * varying_files: everything else; stored as
        { path: {"issues": [...], "states": {host: hash or "MISSING"}} }
        where issues contains "hash_mismatch" and/or "presence_mismatch".

    The result is written to output_file as JSON (indent=2, sorted keys).

    Args:
        input_dir: Directory containing node manifest files.
        output_file: Path to write the resulting JSON manifest.

    Returns:
        None. Progress is printed to stdout; a failure to write the output
        file is reported on stderr and not raised.
    """
    # glob's ordering depends on the filesystem; sort so that runs are
    # reproducible (when two manifests declare the same host, the later one
    # overwrites per-path entries of the earlier one).
    manifest_files = sorted(glob.glob(os.path.join(input_dir, '*')))

    if not manifest_files:
        print(f"No files found in directory: {input_dir}")
        return

    # 1. Aggregation Phase
    # Structure: global_registry[filepath] = { hostname: hash }
    global_registry = defaultdict(dict)
    all_hosts = set()

    print(f"Scanning {len(manifest_files)} manifests...")

    for fpath in manifest_files:
        # Skip if it's a directory
        if os.path.isdir(fpath):
            continue

        host, data = parse_node_manifest(fpath)
        # (None, None) signals an unreadable manifest. An EMPTY dict is a
        # valid result: the host exists but lists no files, and it must still
        # count towards all_hosts so files absent from it are not reported as
        # consistent.
        if host is None or data is None:
            continue

        all_hosts.add(host)
        for path, md5 in data.items():
            global_registry[path][host] = md5

    # 2. Analysis Phase
    cluster_manifest = {
        "meta": {
            "total_hosts": len(all_hosts),
            "hosts": sorted(all_hosts),
            # NOTE(review): despite the key name, this holds the output
            # file's base name, not a timestamp. Kept as-is so the emitted
            # schema does not change for existing consumers.
            "generated_at": str(os.path.basename(output_file))
        },
        "consistent_files": {},  # Files identical on ALL hosts
        "varying_files": {}      # Files that differ or are missing on some hosts
    }

    print("Analyzing file consistency...")

    for path, host_map in global_registry.items():
        unique_hashes = set(host_map.values())
        on_every_host = set(host_map) == all_hosts

        # Condition 1: Consistent
        # Present on ALL hosts AND has exactly 1 unique hash
        if on_every_host and len(unique_hashes) == 1:
            # Store just the hash, as it is the "Ground Truth"
            cluster_manifest["consistent_files"][path] = next(iter(unique_hashes))
            continue

        # Condition 2: Varying
        issue_type = []
        if len(unique_hashes) > 1:
            issue_type.append("hash_mismatch")
        if not on_every_host:
            issue_type.append("presence_mismatch")

        # Record the per-host state so a comparison tool knows what to expect
        # where; hosts that do not have the file are marked "MISSING".
        full_map = {h: host_map.get(h, "MISSING") for h in all_hosts}

        cluster_manifest["varying_files"][path] = {
            "issues": issue_type,
            "states": full_map
        }

    # 3. Output Phase
    try:
        with open(output_file, 'w') as f:
            json.dump(cluster_manifest, f, indent=2, sort_keys=True)
    except OSError as e:
        # Errors go to stderr, matching parse_node_manifest.
        print(f"Error writing output file: {e}", file=sys.stderr)
        return

    print(f"Success! Cluster manifest written to: {output_file}")
    print(f"Stats: {len(cluster_manifest['consistent_files'])} consistent files, {len(cluster_manifest['varying_files'])} varying files.")
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: two positional arguments, the directory of
    # node manifests and the destination path for the JSON result.
    cli = argparse.ArgumentParser(
        description="Generate a cluster-wide ground truth manifest."
    )
    positional_args = (
        ("input_dir", "Directory containing node manifest files"),
        ("output_file", "Path to write the resulting JSON manifest"),
    )
    for arg_name, arg_help in positional_args:
        cli.add_argument(arg_name, help=arg_help)

    options = cli.parse_args()
    generate_golden_manifest(options.input_dir, options.output_file)