#!/usr/bin/env python3
"""
Fix non-ASCII ID tokens in an EPANET .inp file by mapping each unique
non-ASCII-containing token to an ASCII-safe name.  Outputs a new INP and a
mapping file for review.

Usage: python fix_inp_nonascii.py input.inp [output.inp]
"""
import re
import sys
from pathlib import Path

# Require at least one positional argument: the input .inp path.
if len(sys.argv) < 2:
    print("Usage: python fix_inp_nonascii.py input.inp [output.inp]")
    sys.exit(2)

src = Path(sys.argv[1])
# Output path: explicit second argument if given, otherwise derive
# "<stem>-ascii<ext>" alongside the input file.
dst = Path(sys.argv[2]) if len(sys.argv) > 2 else src.with_name(src.stem + '-ascii' + src.suffix)
text = src.read_text(encoding='utf-8')

# A "token" here is a maximal run of non-whitespace characters; collect every
# one that contains at least one character outside the ASCII range.
nonascii_tokens = {m.group(0) for m in re.finditer(r"\S*[^\x00-\x7F]\S*", text)}

if not nonascii_tokens:
    # Nothing to rename: emit an unchanged copy and stop successfully.
    print("No non-ASCII tokens found. Copying source to destination unchanged.")
    dst.write_text(text, encoding='utf-8')
    sys.exit(0)
used = set()
mapping = {}
counter = 1

# Sort tokens so the generated names are deterministic across runs.
for t in sorted(nonascii_tokens):
    # Build an ASCII prefix from the token's "safe" characters
    # (ASCII alphanumerics, underscore, hyphen).
    prefix = ''.join(ch for ch in t if ord(ch) < 128 and (ch.isalnum() or ch in '_-'))
    if not prefix:
        prefix = 'ID'
    candidate = prefix
    # Keep appending a numeric suffix until the candidate is unique AND differs
    # from the original token.  (Previously the candidate == t case forced a
    # suffixed name *after* the uniqueness loop without re-checking `used`,
    # which could hand out a duplicate replacement name.  In practice
    # candidate == t cannot occur -- the candidate is pure ASCII while t
    # contains a non-ASCII character -- but the guard is kept for safety.)
    while candidate in used or candidate == t:
        candidate = f"{prefix}_x{counter}"
        counter += 1
    mapping[t] = candidate
    used.add(candidate)

# NOTE(review): candidates are not checked against ASCII IDs already present
# elsewhere in the file, so a generated name could collide with an existing
# ID -- confirm whether that matters for the target INP files.
# Replace each occurrence of a mapped token with its ASCII-safe name.
new_text = text
for src_token, dst_token in mapping.items():
    # The tokens were extracted as maximal whitespace-delimited runs, so a
    # shorter token (e.g. "café") can be a substring of a longer one
    # ("café123"); both are keys of `mapping`, and the shorter one sorts (and
    # therefore substitutes) first.  A plain substring re.sub would corrupt
    # the longer token before its own replacement ran.  Anchor the escaped
    # token with lookarounds so it only matches when NOT embedded in a larger
    # non-whitespace run -- this implements the original comment's stated
    # intent ("not part of larger non-whitespace"), which the previous code
    # did not actually do.
    pattern = r"(?<!\S)" + re.escape(src_token) + r"(?!\S)"
    new_text = re.sub(pattern, dst_token, new_text)
# Persist the rewritten INP and a human-reviewable token mapping alongside it.
dst.write_text(new_text, encoding='utf-8')

mapfile = dst.with_suffix(dst.suffix + '.mapping.txt')
with mapfile.open('w', encoding='utf-8') as f:
    f.writelines(f"{k} -> {v}\n" for k, v in mapping.items())

print(f"Wrote: {dst}\nMapping: {mapfile}\nReplaced {len(mapping)} non-ASCII tokens.")