import docx
from docx.shared import Pt
import nltk
from nltk.corpus import cmudict
import re
from operator import itemgetter

# ==========================================
# 1. SETUP & TMS v.3 DATA MAPS (SOUND: K)
# ==========================================
TARGET_SOUND = 'k'
INPUT_FILE = 'K.docx'
OUTPUT_FILE = 'K-TMS-v3-FINAL.docx'

# Load the CMU Pronouncing Dictionary, downloading it on first use.
try:
    cmu_dict = cmudict.dict()
except LookupError:
    nltk.download('cmudict')
    cmu_dict = cmudict.dict()

# Consonant scoring maps: CAS and CPS values per IPA consonant.
CAS_MAP = {
    'b': 1, 'p': 1, 'm': 1, 'n': 1, 'w': 1, 'h': 1,
    't': 2, 'd': 2, 'k': 2, 'g': 2, 'f': 2, 'ŋ': 2, 'j': 2,
    'v': 3, 's': 3, 'z': 3, 'ʧ': 3, 'ʤ': 3,
    'l': 4, 'ɹ': 4, 'ʃ': 4, 'ʒ': 4, 'θ': 4, 'ð': 4
}
CPS_MAP = {
    'b': 1, 'p': 1, 'm': 1, 'w': 1,
    't': 2, 'd': 2, 'n': 2, 's': 2, 'z': 2, 'l': 2, 'ɹ': 2, 'f': 2, 'v': 2, 'θ': 2, 'ð': 2,
    'k': 3, 'g': 3, 'ŋ': 3, 'ʃ': 3, 'ʒ': 3, 'j': 3, 'h': 3, 'ʧ': 3, 'ʤ': 3
}

# Base vowel scores and vowel feature classes used by the VKS rules.
VOWEL_BASE = {
    'ɑ': 1, 'ə': 1,
    'i': 2, 'u': 2,
    'o': 3, 'e': 3, 'æ': 3, 'ɔ': 3,
    'ɪ': 4, 'ɛ': 4, 'ʊ': 4, 'ʌ': 4, 'eɪ': 4, 'oʊ': 4,
    'aɪ': 5, 'aʊ': 5, 'ɔɪ': 5, 'ɝ': 5, 'j': 5, 'w': 5
}
CENTRAL_V = {'ə', 'ʌ', 'ɝ'}
FRONT_V = {'i', 'e', 'eɪ', 'æ', 'ɪ', 'ɛ'}
BACK_V = {'u', 'o', 'oʊ', 'ɑ', 'ɔ', 'ʊ'}
ROUND_V = {'u', 'o', 'oʊ', 'ɔ'}
LOW_V = {'ɑ', 'æ'}
SPREAD_V = {'i', 'e', 'eɪ', 'æ'}
LAX_FRONT = {'ɪ', 'ɛ'}

# ARPAbet (cmudict) to IPA mapping.
IPA_MAP = {
    'AA': 'ɑ', 'AE': 'æ', 'AH0': 'ə', 'AH1': 'ʌ', 'AH2': 'ʌ', 'AO': 'ɔ', 'AW': 'aʊ', 'AY': 'aɪ',
    'EH': 'ɛ', 'ER': 'ɝ', 'EY': 'eɪ', 'IH': 'ɪ', 'IY': 'i', 'OW': 'oʊ', 'OY': 'ɔɪ',
    'UH': 'ʊ', 'UW': 'u', 'P': 'p', 'B': 'b', 'T': 't', 'D': 'd', 'K': 'k', 'G': 'g',
    'M': 'm', 'N': 'n', 'NG': 'ŋ', 'F': 'f', 'V': 'v', 'TH': 'θ', 'DH': 'ð', 'S': 's',
    'Z': 'z', 'SH': 'ʃ', 'ZH': 'ʒ', 'HH': 'h', 'CH': 'ʧ', 'JH': 'ʤ', 'L': 'l', 'R': 'ɹ',
    'Y': 'j', 'W': 'w'
}

# Sonority scale used to place syllable breaks (higher = more sonorous).
SONORITY = {
    'p': 1, 'b': 1, 't': 1, 'd': 1, 'k': 1, 'g': 1, 'ʧ': 1, 'ʤ': 1,
    'f': 2, 'v': 2, 'θ': 2, 'ð': 2, 's': 2, 'z': 2, 'ʃ': 2, 'ʒ': 2, 'h': 2,
    'm': 3, 'n': 3, 'ŋ': 3,
    'l': 4, 'ɹ': 4,
    'j': 5, 'w': 5,
    'ɪ': 6, 'ɛ': 6, 'æ': 6, 'ʊ': 6, 'ə': 6, 'ʌ': 6,
    'i': 7, 'eɪ': 7, 'ɑ': 7, 'ɔ': 7, 'oʊ': 7, 'u': 7, 'aɪ': 7, 'aʊ': 7, 'ɔɪ': 7, 'ɝ': 7
}
VOWELS_ONLY = set(VOWEL_BASE.keys())


def phonological_divide(ipa_units):
    """Insert '-' syllable breaks at the sonority minimum between each vowel pair."""
    if not ipa_units:
        return ""
    GLIDES = {'j', 'w'}
    v_indices = [i for i, u in enumerate(ipa_units) if u in VOWELS_ONLY and u not in GLIDES]
    if len(v_indices) < 2:
        return "".join(ipa_units)
    breaks = []
    for v_pair in range(len(v_indices) - 1):
        start, end = v_indices[v_pair] + 1, v_indices[v_pair + 1]
        best_break, min_score = start, 99
        for i in range(start, end + 1):
            score = SONORITY.get(ipa_units[i], 1)
            if score <= min_score:
                min_score, best_break = score, i
        breaks.append(best_break)
    result = []
    for idx, unit in enumerate(ipa_units):
        if idx in breaks:
            result.append("-")
        result.append(unit)
    return "".join(result)


def calculate_vks_v3_full(ipa_seq):
    """Sum the vowel scores of ipa_seq: each vowel's base value scaled by contextual rules."""
    total_vks = 0.0
    for i, p in enumerate(ipa_seq):
        if p not in VOWEL_BASE:
            continue
        base = VOWEL_BASE[p]
        mult = 1.0
        prev = ipa_seq[i - 1] if i > 0 else None
        nxt = ipa_seq[i + 1] if i < len(ipa_seq) - 1 else None
        # 1. Lip Group
        if prev in ['f', 'v']:
            if p in ROUND_V:
                mult *= 2.0
            elif p in SPREAD_V:
                mult *= 0.5
        if prev in ['p', 'b', 'm'] and p in CENTRAL_V:
            if nxt not in ['l', 'ɹ', 'ɝ']:
                mult *= 0.5
        if prev == 'w':
            if p in ROUND_V:
                mult *= 0.5
            if p in SPREAD_V:
                mult *= 2.0
        # 2. Tongue Tip Group
        if prev in ['θ', 'ð', 't', 'd', 'n', 's', 'z']:
            if p in BACK_V:
                mult *= 2.0
            elif p in ['i', 'e']:
                mult *= 0.5
        if nxt in ['θ', 'ð', 't', 'd', 'n', 's', 'z'] and p in LAX_FRONT:
            mult *= 0.5
        # 3. Tongue Body Group (Velar Focus)
        if prev in ['ʃ', 'ʒ', 'ʧ', 'ʤ', 'j']:
            if p in LOW_V:
                mult *= 2.0
            elif p in ['i', 'u'] or p in ROUND_V:
                mult *= 0.5
        if prev in ['k', 'g', 'ŋ']:
            if p in FRONT_V:
                mult *= 2.0   # Tongue Fronting Conflict (Annex A 3.B)
            elif p in BACK_V:
                mult *= 0.5   # Facilitative
        # 4. Liquid Group
        if nxt in ['l', 'ɹ', 'ɝ']:
            mult *= 2.0
        if prev in ['ɹ', 'l'] and p in ['ɪ', 'ɛ', 'ʊ', 'ʌ', 'æ', 'ɔ']:
            mult *= 2.0
        if prev == 'ɹ' and p in CENTRAL_V:
            mult *= 0.5
        # 5. Resonance Rule
        if (prev in ['m', 'n', 'ŋ'] and nxt in ['p', 'b', 't', 'd', 'k', 'g', 'ʧ', 'ʤ']) or \
           (nxt in ['m', 'n', 'ŋ'] and prev in ['p', 'b', 't', 'd', 'k', 'g', 'ʧ', 'ʤ']):
            mult *= 2.0
        total_vks += base * mult
    return total_vks
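

# Quick illustration (a minimal sketch, not part of the original pipeline):
# the velar rule above doubles a front vowel's score after /k/, /g/ or /ŋ/.
# For "key" -> ['k', 'i'], /i/ has a base score of 2, so:
#
#     >>> calculate_vks_v3_full(['k', 'i'])
#     4.0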
def analyze_single_word_v3(word_text):
    """Analyze one word via cmudict; return IPA division, scores and syllable data, or None."""
    clean_w = re.sub(r'[^a-z]', '', word_text.replace('-', '').lower())
    if clean_w in cmu_dict:
        phonemes = cmu_dict[clean_w][0]
        ipa_seq = [IPA_MAP.get(ph, IPA_MAP.get(re.sub(r'\d', '', ph), ph.lower())) for ph in phonemes]
        divided = phonological_divide(ipa_seq)
        w_numeric = {'cas': 0, 'cps': 0, 'vks': calculate_vks_v3_full(ipa_seq)}
        syll_data = []
        for p in ipa_seq:
            if p in CAS_MAP:
                w_numeric['cas'] += CAS_MAP[p]
                w_numeric['cps'] += CPS_MAP[p]
        for syll in divided.split('-'):
            # Re-tokenize the syllable string, preferring two-character IPA units (diphthongs).
            s_phonemes = []
            i = 0
            while i < len(syll):
                found = False
                for length in [2, 1]:
                    chunk = syll[i:i + length]
                    if chunk in SONORITY:
                        s_phonemes.append(chunk)
                        i += length
                        found = True
                        break
                if not found:
                    i += 1
            # --- SHAPE FIX: Treat j and w as C for the Shape report string ---
            shape = "".join(["C" if (p in CAS_MAP or p in {'j', 'w'}) else "V"
                             for p in s_phonemes if p in SONORITY])
            sylls_info = {
                'shape': shape,
                'c': [p for p in s_phonemes if (p in CAS_MAP or p in {'j', 'w'})],
                'v': [p for p in s_phonemes if p in VOWELS_ONLY and p not in {'j', 'w'}]
            }
            syll_data.append(sylls_info)
        return {'ipa': divided, 'numeric': w_numeric, 'sylls': syll_data}
    return None


def analyze_phrase_unit_v3(full_phrase_text):
    """Analyze a word or phrase: sum per-word scores, add syllable-transition (ST) points, return TMS data."""
    words = full_phrase_text.split()
    phrase_ipa = []
    total_numeric = {'cas': 0, 'cps': 0, 'vks': 0, 'st': 0}
    all_sylls_data = []
    for w in words:
        data = analyze_single_word_v3(w)
        if data:
            phrase_ipa.append(data['ipa'])
            total_numeric['cas'] += data['numeric']['cas']
            total_numeric['cps'] += data['numeric']['cps']
            total_numeric['vks'] += data['numeric']['vks']
            all_sylls_data.extend(data['sylls'])
    if not phrase_ipa:
        return None
    if len(all_sylls_data) > 1:
        # Syllable-transition (ST) points between adjacent syllables.
        for i in range(1, len(all_sylls_data)):
            prev, curr = all_sylls_data[i - 1], all_sylls_data[i]
            if prev['c'] == curr['c'] and prev['v'] == curr['v']:
                pass  # identical (reduplicated) syllable: no transition cost
            elif prev['c'] == curr['c'] or prev['v'] == curr['v']:
                total_numeric['st'] += 0.5
            else:
                total_numeric['st'] += 1.0
            if prev['shape'] != curr['shape']:
                total_numeric['st'] += 0.5
            if prev['v'] and prev['v'][-1] in ['aɪ', 'aʊ', 'ɔɪ', 'eɪ', 'oʊ']:
                if any(p in ['ɹ', 'ɝ', 'l'] for p in curr['c'] + curr['v']):
                    total_numeric['st'] += 0.75
    tms = round(total_numeric['cas'] + total_numeric['cps'] + total_numeric['vks'] + total_numeric['st'], 2)
    full_shape_str = "-".join([s['shape'] for s in all_sylls_data])
    return {
        'ipa': " ".join(phrase_ipa),
        'tms': tms,
        'shape': full_shape_str,
        'group': "Gr.2" if "CC" in full_shape_str.replace('-', '') else "Gr.1"
    }
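

# Optional spot check (illustrative helper, not part of the original pipeline;
# the name `_spot_check` and the sample items below are assumptions). It prints
# the phrase-level result for a few items; the exact IPA, TMS and shape values
# depend on the cmudict entries available at runtime.
def _spot_check(samples=("cake", "candy cane")):
    for item in samples:
        data = analyze_phrase_unit_v3(item)
        if data:
            print(f"{item}: /{data['ipa']}/ | TMS {data['tms']} | {data['group']} | {data['shape']}")
        else:
            print(f"{item}: no cmudict entry found")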
def process_document():
    """Read items from INPUT_FILE, score each phrase, and write a TMS-sorted report to OUTPUT_FILE."""
    doc_in = docx.Document(INPUT_FILE)
    results = []
    headers = ["Articulation", "Word List", "Syllable Count", "Position", "Words with"]
    for para in doc_in.paragraphs:
        line = para.text.strip()
        # Skip blank lines, table headers, and label-only lines (':' present but no comma list).
        if not line or any(h in line for h in headers) or (':' in line and ',' not in line):
            continue
        for item in [i.strip() for i in line.split(',')]:
            if len(item) < 2:
                continue
            data = analyze_phrase_unit_v3(item)
            if data:
                data['original'] = item
                results.append(data)
    results.sort(key=itemgetter('tms'))
    rep = docx.Document()
    for i, d in enumerate(results):
        line = f"{i + 1} | {d['original']} | /{d['ipa']}/ | TMS: {d['tms']} | {d['group']} | Shape: {d['shape']}"
        p = rep.add_paragraph(line)
        p.runs[0].font.size = Pt(9)
    rep.save(OUTPUT_FILE)


if __name__ == "__main__":
    process_document()
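
# Usage note (a sketch; package names inferred from the imports above):
#   pip install python-docx nltk
# Running this script reads INPUT_FILE (K.docx) from the working directory and
# writes the TMS-sorted report to OUTPUT_FILE (K-TMS-v3-FINAL.docx). The cmudict
# corpus is downloaded automatically on the first run if it is missing.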