fix bug and refine ,end of 2025
This commit is contained in:
64
epanet/fix_inp_nonascii.py
Normal file
64
epanet/fix_inp_nonascii.py
Normal file
@@ -0,0 +1,64 @@
|
||||
#!/usr/bin/env python3
"""
Fix non-ASCII ID tokens in an EPANET .inp file by mapping each unique token
that contains a non-ASCII character to an ASCII-safe name. Writes a new INP
file plus a `<output>.mapping.txt` file listing every rename for review.

Usage: python fix_inp_nonascii.py input.inp [output.inp]

If output.inp is omitted, the output is written next to the input as
`<stem>-ascii<suffix>`.
"""
import re
import sys
from pathlib import Path

# A token is a maximal whitespace-delimited run; we want those containing at
# least one character outside the ASCII range.
_NONASCII_TOKEN = re.compile(r"\S*[^\x00-\x7F]\S*")


def _build_mapping(tokens, reserved):
    """Map each token to a unique ASCII-safe name.

    tokens   -- iterable of tokens containing non-ASCII characters
    reserved -- names that generated candidates must avoid; pre-seeding this
                with every token already in the file prevents a generated
                name from colliding with an existing EPANET ID.
    Returns a dict {original_token: ascii_name}.
    """
    used = set(reserved)
    mapping = {}
    counter = 1
    # Sort tokens so the output is deterministic across runs.
    for t in sorted(tokens):
        # Keep only ASCII characters that are safe in an EPANET ID
        # (alphanumerics, underscore, hyphen).
        prefix = ''.join(
            ch for ch in t if ord(ch) < 128 and (ch.isalnum() or ch in '_-')
        )
        if not prefix:
            prefix = 'ID'
        candidate = prefix
        # Ensure the candidate is globally unique.  (A candidate can never
        # equal the original token: candidates are pure ASCII while every
        # token here contains a non-ASCII character.)
        while candidate in used:
            candidate = f"{prefix}_x{counter}"
            counter += 1
        mapping[t] = candidate
        used.add(candidate)
    return mapping


def main(argv=None):
    """CLI entry point.  argv is the argument list after the program name."""
    argv = sys.argv[1:] if argv is None else list(argv)
    if not argv:
        print("Usage: python fix_inp_nonascii.py input.inp [output.inp]")
        sys.exit(2)

    src = Path(argv[0])
    if len(argv) > 1:
        dst = Path(argv[1])
    else:
        dst = src.with_name(src.stem + '-ascii' + src.suffix)

    text = src.read_text(encoding='utf-8')
    nonascii_tokens = set(_NONASCII_TOKEN.findall(text))
    if not nonascii_tokens:
        print("No non-ASCII tokens found. Copying source to destination unchanged.")
        dst.write_text(text, encoding='utf-8')
        sys.exit(0)

    # Reserve every token already present so generated names never collide
    # with existing IDs in the file.
    mapping = _build_mapping(nonascii_tokens, reserved=text.split())

    new_text = text
    for src_token, dst_token in mapping.items():
        # Replace whole tokens only: the lookarounds forbid the match from
        # being part of a larger non-whitespace run.  Without them, a short
        # token replaced first would corrupt any longer token containing it.
        pattern = r"(?<!\S)" + re.escape(src_token) + r"(?!\S)"
        new_text = re.sub(pattern, dst_token, new_text)

    # Write the converted INP and the review mapping side by side.
    dst.write_text(new_text, encoding='utf-8')
    mapfile = dst.with_suffix(dst.suffix + '.mapping.txt')
    with mapfile.open('w', encoding='utf-8') as f:
        for k, v in mapping.items():
            f.write(f"{k} -> {v}\n")

    print(f"Wrote: {dst}\nMapping: {mapfile}\nReplaced {len(mapping)} non-ASCII tokens.")


if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user