145 lines
5.2 KiB
Python
145 lines
5.2 KiB
Python
#!/usr/bin/env python3
"""Fix duplicate valve IDs in an EPANET .inp file.

Locates the [VALVES] section, finds valve IDs that occur more than once,
and renames every occurrence beyond the first to a new unique token
(``VAL_<id>`` for IDs starting with a digit, ``<id>_dup`` otherwise,
with a numeric suffix appended if needed for uniqueness). Writes the
fixed .inp file plus a mapping file describing each rename, and reports
any collisions found in the companion original->mapped mapping file.
"""

import re
from collections import defaultdict
from pathlib import Path

# Input .inp file, its companion mapping file, and the fixed outputs.
INP = Path(r"d:\TJWaterServer\epanet\szhskeleton-patternfixed-ascii.inp")
MAPF = Path(r"d:\TJWaterServer\epanet\szhskeleton-patternfixed-ascii.inp.mapping.txt")
OUT = Path(r"d:\TJWaterServer\epanet\szhskeleton-patternfixed-ascii-fixed.inp")
OUTMAP = OUT.with_suffix(OUT.suffix + '.mapping.txt')


def load_mapping(path):
    """Parse an ``original -> mapped`` mapping file into a dict.

    Lines without ``->`` are ignored; returns {} if the file is absent.
    """
    mapping = {}
    if path.exists():
        for line in path.read_text(encoding='utf-8').splitlines():
            if '->' in line:
                original, mapped = line.split('->', 1)
                mapping[original.strip()] = mapped.strip()
    return mapping


def extract_valve_ids(lines):
    """Return the first whitespace token of each non-blank, non-comment line."""
    return [
        line.split()[0]
        for line in lines
        if line.strip() and not line.strip().startswith(';')
    ]


def find_duplicates(ids):
    """Map each ID occurring more than once to its list of occurrence indices."""
    positions = defaultdict(list)
    for idx, token in enumerate(ids):
        positions[token].append(idx)
    return {token: occ for token, occ in positions.items() if len(occ) > 1}


def plan_renames(dups, used):
    """Choose a new unique name for each duplicate occurrence beyond the first.

    Returns {(token, occurrence_index): new_name}. ``used`` is the set of
    tokens already present; chosen names are added to a copy of it so every
    new name is unique among both existing and newly generated tokens.
    """
    used = set(used)  # local copy: do not mutate the caller's set
    occ_to_new = {}
    suffix_counter = 1
    for token, occurrences in dups.items():
        # Leave the first occurrence as-is; rename the rest.
        for occ_index in range(1, len(occurrences)):
            # EPANET IDs starting with a digit get a VAL_ prefix; others a _dup suffix.
            if re.match(r"^\d", token):
                candidate = f'VAL_{token}'
            else:
                candidate = f'{token}_dup'
            while candidate in used:
                candidate = f'{candidate}_{suffix_counter}'
                suffix_counter += 1
            used.add(candidate)
            occ_to_new[(token, occ_index)] = candidate
    return occ_to_new


def apply_renames(lines, occ_to_new):
    """Rewrite ``lines``, replacing only the leading ID token of the
    occurrences listed in ``occ_to_new``; all other text (including leading
    whitespace and the rest of the line) is preserved byte-for-byte."""
    new_lines = []
    occ_seen = defaultdict(int)
    for line in lines:
        if not line.strip() or line.strip().startswith(';'):
            new_lines.append(line)
            continue
        token = line.split()[0]
        occ_idx = occ_seen[token]
        occ_seen[token] += 1
        key = (token, occ_idx)
        if key in occ_to_new:
            # Replace only the first token, keeping leading whitespace intact.
            lead = re.match(r"(\s*)" + re.escape(token), line)
            new_lines.append(lead.group(1) + occ_to_new[key] + line[lead.end():])
        else:
            new_lines.append(line)
    return new_lines


def main():
    """Run the end-to-end fix: read, analyze, rename, and write outputs."""
    text = INP.read_text(encoding='utf-8')
    map_original_to_mapped = load_mapping(MAPF)

    # Find the [VALVES] block (group 1 is the body up to the next section).
    section = re.search(
        r"(?mi)^\[VALVES\]\s*(?:;.*\n)?(.*?)(?=^\[|\Z)", text, flags=re.S | re.M
    )
    if not section:
        print('No [VALVES] section found')
        raise SystemExit(1)

    lines = section.group(1).splitlines()
    ids = extract_valve_ids(lines)
    dups = find_duplicates(ids)

    print(f'Found {len(ids)} valve IDs; {len(dups)} duplicates')
    for k, v in list(dups.items())[:40]:
        print(k, 'occurs', len(v), 'times')

    # Report mapped collisions: multiple originals mapped to the same token.
    # NOTE: loop variables deliberately avoid reusing the regex match name.
    mapped_rev = defaultdict(list)
    for original, mapped in map_original_to_mapped.items():
        mapped_rev[mapped].append(original)
    collisions = {tok: origs for tok, origs in mapped_rev.items() if len(origs) > 1}
    print('\nMapped collisions (same mapped token from multiple originals):', len(collisions))
    for tok, origs in list(collisions.items())[:40]:
        print(tok, ' <- ', origs[:5])

    occ_to_new = plan_renames(dups, set(ids))
    new_lines = apply_renames(lines, occ_to_new)

    # Splice the fixed block back into the original text.
    new_block = '\n'.join(new_lines) + '\n'
    new_text = text[:section.start(1)] + new_block + text[section.end(1):]
    OUT.write_text(new_text, encoding='utf-8')

    # Record which tokens were changed and why.
    with OUTMAP.open('w', encoding='utf-8') as f:
        f.write('Changes applied to fix duplicate valve IDs:\n')
        for (token, occ), new_name in occ_to_new.items():
            f.write(f'{token} occurrence {occ} -> {new_name}\n')
        f.write('\nNote: These replacements are only for valve ID occurrences beyond the first.\n')

    print('Wrote', OUT, 'and mapping', OUTMAP)
    print('Replacements:', len(occ_to_new))
    print('If you want different naming (e.g. prefix with V_), rerun with that preference.')


if __name__ == '__main__':
    main()