"""TMS v.3 phonological complexity scorer for the voiceless TH /θ/ word list.

Reads candidate words/phrases from a .docx word list, transcribes them to IPA
via the CMU Pronouncing Dictionary, scores each item with the TMS v.3 metric
(CAS + CPS + VKS + syllable-transition score), and writes a report sorted by
ascending TMS to a new .docx file.
"""

import re
from operator import itemgetter

import docx
import nltk
from docx.shared import Pt
from nltk.corpus import cmudict

# ==========================================
# 1. SETUP & TMS v.3 DATA MAPS (SOUND: TH /θ/)
# ==========================================
TARGET_SOUND = 'θ'
INPUT_FILE = 'TH-voiceless.docx'
OUTPUT_FILE = 'TH-voiceless-TMS-v3-FINAL.docx'

try:
    cmu_dict = cmudict.dict()
except LookupError:
    # NLTK raises LookupError when the corpus is not yet downloaded;
    # fetch it once and retry (a bare except here would also swallow
    # KeyboardInterrupt/SystemExit).
    nltk.download('cmudict')
    cmu_dict = cmudict.dict()

# Consonant Articulatory Score per IPA consonant (higher = later-developing place/manner).
CAS_MAP = {'b': 1, 'p': 1, 'm': 1, 'n': 1, 'w': 1, 'h': 1,
           't': 2, 'd': 2, 'k': 2, 'g': 2, 'f': 2, 'ŋ': 2, 'j': 2,
           'v': 3, 's': 3, 'z': 3, 'ʧ': 3, 'ʤ': 3,
           'l': 4, 'ɹ': 4, 'ʃ': 4, 'ʒ': 4, 'θ': 4, 'ð': 4}
# Consonant Phonetic Score (a second, independent weighting of the same inventory).
CPS_MAP = {'b': 1, 'p': 1, 'm': 1, 'w': 1,
           't': 2, 'd': 2, 'n': 2, 's': 2, 'z': 2, 'l': 2, 'ɹ': 2,
           'f': 2, 'v': 2, 'θ': 2, 'ð': 2,
           'k': 3, 'g': 3, 'ŋ': 3, 'ʃ': 3, 'ʒ': 3, 'j': 3, 'h': 3, 'ʧ': 3, 'ʤ': 3}
# Base Vowel Complexity Score; note 'j'/'w' appear here so glides are
# recognized by VOWELS_ONLY, but they are filtered out where only true
# vowels are wanted.
VOWEL_BASE = {'ɑ': 1, 'ə': 1, 'i': 2, 'u': 2,
              'o': 3, 'e': 3, 'æ': 3, 'ɔ': 3,
              'ɪ': 4, 'ɛ': 4, 'ʊ': 4, 'ʌ': 4, 'eɪ': 4, 'oʊ': 4,
              'aɪ': 5, 'aʊ': 5, 'ɔɪ': 5, 'ɝ': 5, 'j': 5, 'w': 5}

# ANNEX A 2.A Categories: vowel frontness classes used by the interdental rules.
FRONT_V = {'i', 'e', 'eɪ', 'æ', 'ɪ', 'ɛ'}
BACK_V = {'u', 'o', 'oʊ', 'ɑ', 'ɔ', 'ʊ'}

# Sonority scale (1 = stops ... 7 = tense vowels/diphthongs) used to place
# syllable boundaries at the sonority minimum between vowel nuclei.
SONORITY = {'p': 1, 'b': 1, 't': 1, 'd': 1, 'k': 1, 'g': 1, 'ʧ': 1, 'ʤ': 1,
            'f': 2, 'v': 2, 'θ': 2, 'ð': 2, 's': 2, 'z': 2, 'ʃ': 2, 'ʒ': 2, 'h': 2,
            'm': 3, 'n': 3, 'ŋ': 3,
            'l': 4, 'ɹ': 4,
            'j': 5, 'w': 5,
            'ɪ': 6, 'ɛ': 6, 'æ': 6, 'ʊ': 6, 'ə': 6, 'ʌ': 6,
            'i': 7, 'eɪ': 7, 'ɑ': 7, 'ɔ': 7, 'oʊ': 7, 'u': 7,
            'aɪ': 7, 'aʊ': 7, 'ɔɪ': 7, 'ɝ': 7}

# ARPAbet -> IPA. 'AH' is keyed with its stress digit so that unstressed
# AH0 maps to schwa while stressed AH1/AH2 map to /ʌ/; all other phonemes
# are keyed stress-free and matched after stripping digits.
IPA_MAP = {'AA': 'ɑ', 'AE': 'æ', 'AH0': 'ə', 'AH1': 'ʌ', 'AH2': 'ʌ', 'AO': 'ɔ',
           'AW': 'aʊ', 'AY': 'aɪ', 'EH': 'ɛ', 'ER': 'ɝ', 'EY': 'eɪ', 'IH': 'ɪ',
           'IY': 'i', 'OW': 'oʊ', 'OY': 'ɔɪ', 'UH': 'ʊ', 'UW': 'u',
           'P': 'p', 'B': 'b', 'T': 't', 'D': 'd', 'K': 'k', 'G': 'g',
           'M': 'm', 'N': 'n', 'NG': 'ŋ', 'F': 'f', 'V': 'v', 'TH': 'θ',
           'DH': 'ð', 'S': 's', 'Z': 'z', 'SH': 'ʃ', 'ZH': 'ʒ', 'HH': 'h',
           'CH': 'ʧ', 'JH': 'ʤ', 'L': 'l', 'R': 'ɹ', 'Y': 'j', 'W': 'w'}

VOWELS_ONLY = set(VOWEL_BASE.keys())


def phonological_divide(ipa_units):
    """Insert '-' syllable breaks into a word's IPA unit list.

    A break is placed before the sonority minimum between each pair of
    consecutive vowel nuclei (glides 'j'/'w' do not count as nuclei).
    Returns the joined IPA string; words with fewer than two nuclei are
    returned undivided.
    """
    if not ipa_units:
        return ""
    GLIDES = {'j', 'w'}
    v_indices = [i for i, u in enumerate(ipa_units)
                 if u in VOWELS_ONLY and u not in GLIDES]
    if len(v_indices) < 2:
        return "".join(ipa_units)
    breaks = []
    for v_pair in range(len(v_indices) - 1):
        start, end = v_indices[v_pair] + 1, v_indices[v_pair + 1]
        best_break, min_score = start, 99
        # <= keeps the RIGHTMOST minimum, maximizing the onset of the
        # following syllable; the range includes `end` so adjacent vowels
        # break directly between the two nuclei.
        for i in range(start, end + 1):
            score = SONORITY.get(ipa_units[i], 1)
            if score <= min_score:
                min_score, best_break = score, i
        breaks.append(best_break)
    result = []
    for idx, unit in enumerate(ipa_units):
        if idx in breaks:
            result.append("-")
        result.append(unit)
    return "".join(result)


def calculate_vks_v3_full(ipa_seq):
    """Return the Vowel Komplexity Score for an IPA sequence.

    Each vowel contributes its VOWEL_BASE value scaled by contextual
    multipliers (ANNEX A 2.A interdental rules plus liquid/rhotic
    constraints). Non-vowel units contribute nothing.
    """
    total_vks = 0.0
    for i, p in enumerate(ipa_seq):
        if p not in VOWEL_BASE:
            continue
        base, mult = VOWEL_BASE[p], 1.0
        prev = ipa_seq[i - 1] if i > 0 else None
        nxt = ipa_seq[i + 1] if i < len(ipa_seq) - 1 else None
        # ANNEX A 2.A: INTERDENTAL RULES
        if prev in ['θ', 'ð']:
            if p in BACK_V:
                mult *= 2.0   # Inhibitory: Tongue retraction
            elif p in FRONT_V:
                mult *= 0.5   # Facilitative: Forward posture
        # Standard Liquid/Rhotic constraints
        if nxt in ['l', 'ɹ', 'ɝ']:
            mult *= 2.0
        if prev in ['ɹ', 'l'] and p not in {'ə', 'ʌ', 'ɝ'}:
            mult *= 2.0
        total_vks += base * mult
    return total_vks


def analyze_single_word_v3(word_text):
    """Analyze one orthographic word.

    Returns a dict with the syllable-divided IPA string, the numeric
    CAS/CPS/VKS subtotals, and per-syllable shape/consonant/vowel data,
    or None when the word is absent from the CMU dictionary.
    """
    clean_w = re.sub(r'[^a-z]', '', word_text.replace('-', '').lower())
    if clean_w not in cmu_dict:
        return None
    # First pronunciation variant only; try the stress-marked ARPAbet
    # symbol first (AH0/AH1/AH2), then the digit-stripped symbol.
    phonemes = cmu_dict[clean_w][0]
    ipa_seq = [IPA_MAP.get(ph, IPA_MAP.get(re.sub(r'\d', '', ph), ph.lower()))
               for ph in phonemes]
    divided = phonological_divide(ipa_seq)
    w_numeric = {'cas': sum(CAS_MAP.get(p, 0) for p in ipa_seq if p in CAS_MAP),
                 'cps': sum(CPS_MAP.get(p, 0) for p in ipa_seq if p in CPS_MAP),
                 'vks': calculate_vks_v3_full(ipa_seq)}
    sylls = []
    for syll in divided.split('-'):
        # Re-tokenize the syllable string greedily, trying 2-char IPA
        # units (diphthongs/affricate symbols) before single chars.
        s_phonemes = []
        idx = 0
        while idx < len(syll):
            found = False
            for length in [2, 1]:
                chunk = syll[idx:idx + length]
                if chunk in SONORITY:
                    s_phonemes.append(chunk)
                    idx += length
                    found = True
                    break
            if not found:
                idx += 1
        # SHAPE FIX: Treat glides j and w as C for the Shape report string.
        shape = "".join("C" if (p in CAS_MAP or p in {'j', 'w'}) else "V"
                        for p in s_phonemes if p in SONORITY)
        sylls.append({
            'shape': shape,
            'c': [p for p in s_phonemes if (p in CAS_MAP or p in {'j', 'w'})],
            'v': [p for p in s_phonemes
                  if p in VOWELS_ONLY and p not in {'j', 'w'}],
        })
    return {'ipa': divided, 'numeric': w_numeric, 'sylls': sylls}


def analyze_phrase_unit_v3(full_phrase_text):
    """Score a whole phrase (one or more words) with the TMS v.3 metric.

    Sums the per-word CAS/CPS/VKS subtotals, then adds a syllable
    transition score (ST) over consecutive syllable pairs. Returns a
    report dict, or None when no word could be transcribed.
    """
    words = full_phrase_text.split()
    phrase_ipa, all_sylls = [], []
    total_numeric = {'cas': 0, 'cps': 0, 'vks': 0, 'st': 0}
    for w in words:
        data = analyze_single_word_v3(w)
        if data:
            phrase_ipa.append(data['ipa'])
            total_numeric['cas'] += data['numeric']['cas']
            total_numeric['cps'] += data['numeric']['cps']
            total_numeric['vks'] += data['numeric']['vks']
            all_sylls.extend(data['sylls'])
    if not phrase_ipa:
        return None
    if len(all_sylls) > 1:
        for i in range(1, len(all_sylls)):
            prev, curr = all_sylls[i - 1], all_sylls[i]
            # 0.5 when only consonants OR only vowels change, 1.0 when both change.
            if prev['c'] != curr['c'] or prev['v'] != curr['v']:
                total_numeric['st'] += (
                    0.5 if (prev['c'] == curr['c'] or prev['v'] == curr['v'])
                    else 1.0)
            if prev['shape'] != curr['shape']:
                total_numeric['st'] += 0.5
            # Diphthong followed by a liquid/rhotic syllable adds a penalty.
            if (prev['v'] and prev['v'][-1] in ['aɪ', 'aʊ', 'ɔɪ', 'eɪ', 'oʊ']
                    and any(p in ['ɹ', 'ɝ', 'l'] for p in curr['c'] + curr['v'])):
                total_numeric['st'] += 0.75
    shapes = [s['shape'] for s in all_sylls]
    return {'ipa': " ".join(phrase_ipa),
            'tms': round(sum(total_numeric.values()), 2),
            'shape': "-".join(shapes),
            # Gr.2 = contains a consonant cluster anywhere in the phrase.
            'group': "Gr.2" if "CC" in "".join(shapes) else "Gr.1"}


def process_document():
    """Read INPUT_FILE, score every comma-separated item, write OUTPUT_FILE.

    Header paragraphs (containing "Articulation" or "Word List") and items
    shorter than 2 characters are skipped; results are sorted by ascending
    TMS before being written out at 9 pt.
    """
    doc_in = docx.Document(INPUT_FILE)
    results = []
    for para in doc_in.paragraphs:
        line = para.text.strip()
        if not line or any(h in line for h in ["Articulation", "Word List"]):
            continue
        for item in [i.strip() for i in line.split(',')]:
            if len(item) < 2:
                continue
            data = analyze_phrase_unit_v3(item)
            if data:
                data['original'] = item
                results.append(data)
    results.sort(key=itemgetter('tms'))
    rep = docx.Document()
    for i, d in enumerate(results):
        p = rep.add_paragraph(
            f"{i+1} | {d['original']} | /{d['ipa']}/ | TMS: {d['tms']} | "
            f"{d['group']} | Shape: {d['shape']}")
        p.runs[0].font.size = Pt(9)
    rep.save(OUTPUT_FILE)


if __name__ == "__main__":
    process_document()