#!/usr/bin/env python3
"""
Fix non-ASCII ID tokens in an EPANET .inp file by mapping each unique
non-ASCII-containing token to an ASCII-safe name.  Outputs a new INP and a
mapping file for review.

Usage: python fix_inp_nonascii.py input.inp [output.inp]
"""
import re
import sys
from pathlib import Path

# Require at least one positional argument: the input .inp path.
if len(sys.argv) < 2:
    print("Usage: python fix_inp_nonascii.py input.inp [output.inp]")
    sys.exit(2)

src = Path(sys.argv[1])
# Output path: explicit second argument if given, otherwise derive
# "<stem>-ascii<ext>" alongside the input file.
dst = Path(sys.argv[2]) if len(sys.argv) > 2 else src.with_name(src.stem + '-ascii' + src.suffix)
text = src.read_text(encoding='utf-8')

# A "token" here is a maximal run of non-whitespace characters; collect every
# one that contains at least one character outside the ASCII range.
nonascii_tokens = {m.group(0) for m in re.finditer(r"\S*[^\x00-\x7F]\S*", text)}

if not nonascii_tokens:
    # Nothing to rename: emit an unchanged copy and stop successfully.
    print("No non-ASCII tokens found. Copying source to destination unchanged.")
    dst.write_text(text, encoding='utf-8')
    sys.exit(0)
used = set()
mapping = {}
counter = 1

# Sort tokens so the generated names are deterministic across runs.
for t in sorted(nonascii_tokens):
    # Build an ASCII prefix from the token's "safe" characters
    # (ASCII alphanumerics, underscore, hyphen).
    prefix = ''.join(ch for ch in t if ord(ch) < 128 and (ch.isalnum() or ch in '_-'))
    if not prefix:
        prefix = 'ID'
    candidate = prefix
    # Keep appending a numeric suffix until the candidate is unique AND differs
    # from the original token.  (Previously the candidate == t case forced a
    # suffixed name *after* the uniqueness loop without re-checking `used`,
    # which could hand out a duplicate replacement name.  In practice
    # candidate == t cannot occur -- the candidate is pure ASCII while t
    # contains a non-ASCII character -- but the guard is kept for safety.)
    while candidate in used or candidate == t:
        candidate = f"{prefix}_x{counter}"
        counter += 1
    mapping[t] = candidate
    used.add(candidate)

# NOTE(review): candidates are not checked against ASCII IDs already present
# elsewhere in the file, so a generated name could collide with an existing
# ID -- confirm whether that matters for the target INP files.
# Replace each occurrence of a mapped token with its ASCII-safe name.
new_text = text
for src_token, dst_token in mapping.items():
    # The tokens were extracted as maximal whitespace-delimited runs, so a
    # shorter token (e.g. "café") can be a substring of a longer one
    # ("café123"); both are keys of `mapping`, and the shorter one sorts (and
    # therefore substitutes) first.  A plain substring re.sub would corrupt
    # the longer token before its own replacement ran.  Anchor the escaped
    # token with lookarounds so it only matches when NOT embedded in a larger
    # non-whitespace run -- this implements the original comment's stated
    # intent ("not part of larger non-whitespace"), which the previous code
    # did not actually do.
    pattern = r"(?<!\S)" + re.escape(src_token) + r"(?!\S)"
    new_text = re.sub(pattern, dst_token, new_text)
# Persist the rewritten INP and a human-reviewable token mapping alongside it.
dst.write_text(new_text, encoding='utf-8')

mapfile = dst.with_suffix(dst.suffix + '.mapping.txt')
with mapfile.open('w', encoding='utf-8') as f:
    f.writelines(f"{k} -> {v}\n" for k, v in mapping.items())

print(f"Wrote: {dst}\nMapping: {mapfile}\nReplaced {len(mapping)} non-ASCII tokens.")