#!/usr/bin/env python3
"""Fix duplicate valve IDs in an EPANET .inp file.

Scans the [VALVES] section for IDs that occur more than once, renames every
occurrence after the first to a fresh unique token, writes the patched file,
and records the renames in a companion ``.mapping.txt`` file.  Also reports
collisions found in an optional ``original -> mapped`` mapping file.
"""
import re
from collections import defaultdict
from pathlib import Path

# Defaults preserve the original one-off invocation; main() accepts
# overrides so the logic is reusable and testable.
DEFAULT_INP = Path(r"d:\TJWaterServer\epanet\szhskeleton-patternfixed-ascii.inp")
DEFAULT_MAP = Path(r"d:\TJWaterServer\epanet\szhskeleton-patternfixed-ascii.inp.mapping.txt")
DEFAULT_OUT = Path(r"d:\TJWaterServer\epanet\szhskeleton-patternfixed-ascii-fixed.inp")


def _read_mapping(mapf):
    """Parse an 'original -> mapped' mapping file into a dict (empty if absent)."""
    mapping = {}
    if mapf.exists():
        for line in mapf.read_text(encoding='utf-8').splitlines():
            if '->' in line:
                a, b = line.split('->', 1)
                mapping[a.strip()] = b.strip()
    return mapping


def _unique_name(token, used):
    """Return a new name for duplicate valve ID *token* not present in *used*.

    IDs starting with a digit get a ``VAL_`` prefix (EPANET-style files often
    use purely numeric IDs); others get a ``_dup`` suffix.  A numeric suffix
    is appended to the *base* candidate — not the growing string — until the
    name is unique.  The chosen name is added to *used*.
    """
    if re.match(r"^\d", token):
        candidate = f'VAL_{token}'
    else:
        candidate = f'{token}_dup'
    base = candidate
    n = 1
    while candidate in used:
        candidate = f'{base}_{n}'
        n += 1
    used.add(candidate)
    return candidate


def main(inp=DEFAULT_INP, mapf=DEFAULT_MAP, out=DEFAULT_OUT):
    """Rename duplicate valve IDs in *inp*, writing *out* and a mapping file.

    Raises SystemExit(1) if *inp* has no [VALVES] section.
    """
    inp, mapf, out = Path(inp), Path(mapf), Path(out)
    outmap = out.with_suffix(out.suffix + '.mapping.txt')
    text = inp.read_text(encoding='utf-8')

    map_original_to_mapped = _read_mapping(mapf)

    # Locate the [VALVES] section body: everything after the header (and an
    # optional comment line) up to the next [SECTION] header or end of file.
    sect = re.search(r"(?mi)^\[VALVES\]\s*(?:;.*\n)?(.*?)(?=^\[|\Z)",
                     text, flags=re.S | re.M)
    if not sect:
        print('No [VALVES] section found')
        raise SystemExit(1)
    lines = sect.group(1).splitlines()

    # The first whitespace-separated token of each data line is the valve ID;
    # blank lines and ';' comment lines are skipped.
    ids = []
    for l in lines:
        stripped = l.strip()
        if stripped and not stripped.startswith(';'):
            ids.append(l.split()[0])

    # Group occurrence indices per ID; an ID with >1 occurrence is a duplicate.
    positions = defaultdict(list)
    for idx, token in enumerate(ids):
        positions[token].append(idx)
    dups = {k: v for k, v in positions.items() if len(v) > 1}
    print(f'Found {len(ids)} valve IDs; {len(dups)} duplicates')
    for k, v in list(dups.items())[:40]:
        print(k, 'occurs', len(v), 'times')

    # Report mapped collisions: several originals mapped onto one token.
    # NOTE: loop variables deliberately do NOT reuse the name of the regex
    # match (`sect`); the original code shadowed it and crashed later.
    mapped_rev = defaultdict(list)
    for orig, mapped in map_original_to_mapped.items():
        mapped_rev[mapped].append(orig)
    collisions = {t: origins for t, origins in mapped_rev.items()
                  if len(origins) > 1}
    print('\nMapped collisions (same mapped token from multiple originals):',
          len(collisions))
    for t, origins in list(collisions.items())[:40]:
        print(t, ' <- ', origins[:5])

    # Assign a new unique name to every occurrence after the first.  A single
    # pass suffices; seeding `used` with all existing IDs guarantees no clash
    # with untouched valves.
    used = set(ids)
    occ_to_new = {}
    for token, occs in dups.items():
        for occ_index in range(1, len(occs)):
            occ_to_new[(token, occ_index)] = _unique_name(token, used)

    # Rewrite the section, replacing only the leading ID token of the Nth
    # occurrence of each duplicate and preserving all original whitespace.
    new_lines = []
    occ_seen = defaultdict(int)
    for l in lines:
        stripped = l.strip()
        if not stripped or stripped.startswith(';'):
            new_lines.append(l)
            continue
        token = l.split()[0]
        occ_idx = occ_seen[token]
        occ_seen[token] += 1
        new_token = occ_to_new.get((token, occ_idx))
        if new_token is None:
            new_lines.append(l)
        else:
            # Match always succeeds: `token` is the first non-space run of l.
            m2 = re.match(r"(\s*)" + re.escape(token), l)
            new_lines.append(m2.group(1) + new_token + l[m2.end():])

    # Splice the rewritten block back into the file text.
    new_block = '\n'.join(new_lines) + '\n'
    new_text = text[:sect.start(1)] + new_block + text[sect.end(1):]
    out.write_text(new_text, encoding='utf-8')

    # Companion mapping file: which tokens were changed and to what.
    with outmap.open('w', encoding='utf-8') as f:
        f.write('Changes applied to fix duplicate valve IDs:\n')
        for (token, occ), new_name in occ_to_new.items():
            f.write(f'{token} occurrence {occ} -> {new_name}\n')
        f.write('\nNote: These replacements are only for valve ID occurrences beyond the first.\n')

    print('Wrote', out, 'and mapping', outmap)
    print('Replacements:', len(occ_to_new))
    print('If you want different naming (e.g. prefix with V_), rerun with that preference.')


if __name__ == '__main__':
    main()