#!/usr/bin/env python3
"""
Fix non-ASCII ID tokens in an EPANET .inp file by mapping each unique
non-ASCII-containing token to an ASCII-safe name. Outputs a new INP and a
mapping file for review.

Usage: python fix_inp_nonascii.py input.inp [output.inp]
"""
import re
import sys
from pathlib import Path

# A token is a maximal run of non-whitespace containing >= 1 non-ASCII char.
_NONASCII_TOKEN_RE = re.compile(r"\S*[^\x00-\x7F]\S*")


def find_nonascii_tokens(text):
    """Return the set of whitespace-delimited tokens in *text* that contain
    at least one non-ASCII character."""
    return set(_NONASCII_TOKEN_RE.findall(text))


def build_mapping(tokens, reserved=frozenset()):
    """Map each non-ASCII token to a unique ASCII-safe replacement name.

    tokens:   iterable of tokens, each containing >= 1 non-ASCII char.
    reserved: names that must NOT be produced (e.g. ASCII IDs already
              present in the file, to avoid merging distinct elements).
    Returns a dict {original_token: ascii_token}. Deterministic: tokens
    are processed in sorted order.
    """
    used = set(reserved)
    mapping = {}
    counter = 1
    for tok in sorted(tokens):
        # Keep only "safe" ASCII chars (alnum, underscore, hyphen) as the base.
        prefix = ''.join(
            ch for ch in tok
            if ord(ch) < 128 and (ch.isalnum() or ch in '_-')
        )
        if not prefix:
            prefix = 'ID'
        # Suffix until unique. (No need to compare against *tok* itself:
        # tok contains a non-ASCII char and candidate is pure ASCII, so
        # they can never be equal — the original check was dead code.)
        candidate = prefix
        while candidate in used:
            candidate = f"{prefix}_x{counter}"
            counter += 1
        mapping[tok] = candidate
        used.add(candidate)
    return mapping


def replace_tokens(text, mapping):
    """Replace every mapped token wherever it appears as a WHOLE
    whitespace-delimited token.

    Uses lookarounds ((?<!\\S) … (?!\\S)) so a mapped token is never
    replaced when it is merely a substring of a longer non-whitespace run
    (the original code did a plain re.sub per token, which could corrupt
    longer IDs containing a shorter mapped token). A single combined
    alternation makes the substitution one deterministic pass; longest
    tokens first so nested tokens resolve to the most specific match.
    """
    if not mapping:
        return text
    alternation = "|".join(
        re.escape(tok) for tok in sorted(mapping, key=len, reverse=True)
    )
    pattern = re.compile(r"(?<!\S)(?:" + alternation + r")(?!\S)")
    # Callable replacement avoids any backslash interpretation in re.sub.
    return pattern.sub(lambda m: mapping[m.group(0)], text)


def main(argv=None):
    """CLI entry point. Reads argv[1], writes the ASCII-fied INP plus a
    '<output>.mapping.txt' review file. Exits 2 on usage error, 0 when the
    file needed no changes."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        print("Usage: python fix_inp_nonascii.py input.inp [output.inp]")
        sys.exit(2)

    src = Path(argv[1])
    if len(argv) > 2:
        dst = Path(argv[2])
    else:
        dst = src.with_name(src.stem + '-ascii' + src.suffix)

    text = src.read_text(encoding='utf-8')
    tokens = find_nonascii_tokens(text)
    if not tokens:
        print("No non-ASCII tokens found. Copying source to destination unchanged.")
        dst.write_text(text, encoding='utf-8')
        sys.exit(0)

    # Reserve every ASCII token already in the file so a generated name can
    # never collide with an existing ID (the original only checked against
    # previously *generated* names).
    existing_ascii = set(re.findall(r"\S+", text)) - tokens
    mapping = build_mapping(tokens, reserved=existing_ascii)
    new_text = replace_tokens(text, mapping)

    dst.write_text(new_text, encoding='utf-8')
    mapfile = dst.with_suffix(dst.suffix + '.mapping.txt')
    with mapfile.open('w', encoding='utf-8') as f:
        for k, v in mapping.items():
            f.write(f"{k} -> {v}\n")
    print(f"Wrote: {dst}\nMapping: {mapfile}\nReplaced {len(mapping)} non-ASCII tokens.")


if __name__ == "__main__":
    main()