145 lines
5.2 KiB
Python
145 lines
5.2 KiB
Python
#!/usr/bin/env python3
"""Fix duplicate valve IDs in an EPANET .inp file.

Locates the [VALVES] section, finds valve IDs that occur more than once,
and renames every occurrence beyond the first to a new unique token
(``VAL_<id>`` for IDs starting with a digit, ``<id>_dup`` otherwise,
with a numeric suffix appended if needed for uniqueness). Writes the
fixed .inp file plus a mapping file describing each rename, and reports
any collisions found in the companion original->mapped mapping file.
"""

import re
from collections import defaultdict
from pathlib import Path

# Input .inp file, its companion mapping file, and the fixed outputs.
INP = Path(r"d:\TJWaterServer\epanet\szhskeleton-patternfixed-ascii.inp")
MAPF = Path(r"d:\TJWaterServer\epanet\szhskeleton-patternfixed-ascii.inp.mapping.txt")
OUT = Path(r"d:\TJWaterServer\epanet\szhskeleton-patternfixed-ascii-fixed.inp")
OUTMAP = OUT.with_suffix(OUT.suffix + '.mapping.txt')


def load_mapping(path):
    """Parse an ``original -> mapped`` mapping file into a dict.

    Lines without ``->`` are ignored; returns {} if the file is absent.
    """
    mapping = {}
    if path.exists():
        for line in path.read_text(encoding='utf-8').splitlines():
            if '->' in line:
                original, mapped = line.split('->', 1)
                mapping[original.strip()] = mapped.strip()
    return mapping


def extract_valve_ids(lines):
    """Return the first whitespace token of each non-blank, non-comment line."""
    return [
        line.split()[0]
        for line in lines
        if line.strip() and not line.strip().startswith(';')
    ]


def find_duplicates(ids):
    """Map each ID occurring more than once to its list of occurrence indices."""
    positions = defaultdict(list)
    for idx, token in enumerate(ids):
        positions[token].append(idx)
    return {token: occ for token, occ in positions.items() if len(occ) > 1}


def plan_renames(dups, used):
    """Choose a new unique name for each duplicate occurrence beyond the first.

    Returns {(token, occurrence_index): new_name}. ``used`` is the set of
    tokens already present; chosen names are added to a copy of it so every
    new name is unique among both existing and newly generated tokens.
    """
    used = set(used)  # local copy: do not mutate the caller's set
    occ_to_new = {}
    suffix_counter = 1
    for token, occurrences in dups.items():
        # Leave the first occurrence as-is; rename the rest.
        for occ_index in range(1, len(occurrences)):
            # EPANET IDs starting with a digit get a VAL_ prefix; others a _dup suffix.
            if re.match(r"^\d", token):
                candidate = f'VAL_{token}'
            else:
                candidate = f'{token}_dup'
            while candidate in used:
                candidate = f'{candidate}_{suffix_counter}'
                suffix_counter += 1
            used.add(candidate)
            occ_to_new[(token, occ_index)] = candidate
    return occ_to_new


def apply_renames(lines, occ_to_new):
    """Rewrite ``lines``, replacing only the leading ID token of the
    occurrences listed in ``occ_to_new``; all other text (including leading
    whitespace and the rest of the line) is preserved byte-for-byte."""
    new_lines = []
    occ_seen = defaultdict(int)
    for line in lines:
        if not line.strip() or line.strip().startswith(';'):
            new_lines.append(line)
            continue
        token = line.split()[0]
        occ_idx = occ_seen[token]
        occ_seen[token] += 1
        key = (token, occ_idx)
        if key in occ_to_new:
            # Replace only the first token, keeping leading whitespace intact.
            lead = re.match(r"(\s*)" + re.escape(token), line)
            new_lines.append(lead.group(1) + occ_to_new[key] + line[lead.end():])
        else:
            new_lines.append(line)
    return new_lines


def main():
    """Run the end-to-end fix: read, analyze, rename, and write outputs."""
    text = INP.read_text(encoding='utf-8')
    map_original_to_mapped = load_mapping(MAPF)

    # Find the [VALVES] block (group 1 is the body up to the next section).
    section = re.search(
        r"(?mi)^\[VALVES\]\s*(?:;.*\n)?(.*?)(?=^\[|\Z)", text, flags=re.S | re.M
    )
    if not section:
        print('No [VALVES] section found')
        raise SystemExit(1)

    lines = section.group(1).splitlines()
    ids = extract_valve_ids(lines)
    dups = find_duplicates(ids)

    print(f'Found {len(ids)} valve IDs; {len(dups)} duplicates')
    for k, v in list(dups.items())[:40]:
        print(k, 'occurs', len(v), 'times')

    # Report mapped collisions: multiple originals mapped to the same token.
    # NOTE: loop variables deliberately avoid reusing the regex match name.
    mapped_rev = defaultdict(list)
    for original, mapped in map_original_to_mapped.items():
        mapped_rev[mapped].append(original)
    collisions = {tok: origs for tok, origs in mapped_rev.items() if len(origs) > 1}
    print('\nMapped collisions (same mapped token from multiple originals):', len(collisions))
    for tok, origs in list(collisions.items())[:40]:
        print(tok, ' <- ', origs[:5])

    occ_to_new = plan_renames(dups, set(ids))
    new_lines = apply_renames(lines, occ_to_new)

    # Splice the fixed block back into the original text.
    new_block = '\n'.join(new_lines) + '\n'
    new_text = text[:section.start(1)] + new_block + text[section.end(1):]
    OUT.write_text(new_text, encoding='utf-8')

    # Record which tokens were changed and why.
    with OUTMAP.open('w', encoding='utf-8') as f:
        f.write('Changes applied to fix duplicate valve IDs:\n')
        for (token, occ), new_name in occ_to_new.items():
            f.write(f'{token} occurrence {occ} -> {new_name}\n')
        f.write('\nNote: These replacements are only for valve ID occurrences beyond the first.\n')

    print('Wrote', OUT, 'and mapping', OUTMAP)
    print('Replacements:', len(occ_to_new))
    print('If you want different naming (e.g. prefix with V_), rerun with that preference.')


if __name__ == '__main__':
    main()