"""TMS v.3 phonological complexity scorer for the voiceless TH /θ/ word list.

Reads candidate words/phrases from a .docx word list, transcribes them to IPA
via the CMU Pronouncing Dictionary, scores each item with the TMS v.3 metric
(CAS + CPS + VKS + syllable-transition score), and writes a report sorted by
ascending TMS to a new .docx file.
"""

import re
from operator import itemgetter

import docx
import nltk
from docx.shared import Pt
from nltk.corpus import cmudict

# ==========================================
# 1. SETUP & TMS v.3 DATA MAPS (SOUND: TH /θ/)
# ==========================================
TARGET_SOUND = 'θ'
INPUT_FILE = 'TH-voiceless.docx'
OUTPUT_FILE = 'TH-voiceless-TMS-v3-FINAL.docx'

try:
    cmu_dict = cmudict.dict()
except LookupError:
    # NLTK raises LookupError when the corpus is not yet downloaded;
    # fetch it once and retry (a bare except here would also swallow
    # KeyboardInterrupt/SystemExit).
    nltk.download('cmudict')
    cmu_dict = cmudict.dict()

# Consonant Articulatory Score per IPA consonant (higher = later-developing place/manner).
CAS_MAP = {'b': 1, 'p': 1, 'm': 1, 'n': 1, 'w': 1, 'h': 1,
           't': 2, 'd': 2, 'k': 2, 'g': 2, 'f': 2, 'ŋ': 2, 'j': 2,
           'v': 3, 's': 3, 'z': 3, 'ʧ': 3, 'ʤ': 3,
           'l': 4, 'ɹ': 4, 'ʃ': 4, 'ʒ': 4, 'θ': 4, 'ð': 4}
# Consonant Phonetic Score (a second, independent weighting of the same inventory).
CPS_MAP = {'b': 1, 'p': 1, 'm': 1, 'w': 1,
           't': 2, 'd': 2, 'n': 2, 's': 2, 'z': 2, 'l': 2, 'ɹ': 2,
           'f': 2, 'v': 2, 'θ': 2, 'ð': 2,
           'k': 3, 'g': 3, 'ŋ': 3, 'ʃ': 3, 'ʒ': 3, 'j': 3, 'h': 3, 'ʧ': 3, 'ʤ': 3}
# Base Vowel Complexity Score; note 'j'/'w' appear here so glides are
# recognized by VOWELS_ONLY, but they are filtered out where only true
# vowels are wanted.
VOWEL_BASE = {'ɑ': 1, 'ə': 1, 'i': 2, 'u': 2,
              'o': 3, 'e': 3, 'æ': 3, 'ɔ': 3,
              'ɪ': 4, 'ɛ': 4, 'ʊ': 4, 'ʌ': 4, 'eɪ': 4, 'oʊ': 4,
              'aɪ': 5, 'aʊ': 5, 'ɔɪ': 5, 'ɝ': 5, 'j': 5, 'w': 5}

# ANNEX A 2.A Categories: vowel frontness classes used by the interdental rules.
FRONT_V = {'i', 'e', 'eɪ', 'æ', 'ɪ', 'ɛ'}
BACK_V = {'u', 'o', 'oʊ', 'ɑ', 'ɔ', 'ʊ'}

# Sonority scale (1 = stops ... 7 = tense vowels/diphthongs) used to place
# syllable boundaries at the sonority minimum between vowel nuclei.
SONORITY = {'p': 1, 'b': 1, 't': 1, 'd': 1, 'k': 1, 'g': 1, 'ʧ': 1, 'ʤ': 1,
            'f': 2, 'v': 2, 'θ': 2, 'ð': 2, 's': 2, 'z': 2, 'ʃ': 2, 'ʒ': 2, 'h': 2,
            'm': 3, 'n': 3, 'ŋ': 3,
            'l': 4, 'ɹ': 4,
            'j': 5, 'w': 5,
            'ɪ': 6, 'ɛ': 6, 'æ': 6, 'ʊ': 6, 'ə': 6, 'ʌ': 6,
            'i': 7, 'eɪ': 7, 'ɑ': 7, 'ɔ': 7, 'oʊ': 7, 'u': 7,
            'aɪ': 7, 'aʊ': 7, 'ɔɪ': 7, 'ɝ': 7}

# ARPAbet -> IPA. 'AH' is keyed with its stress digit so that unstressed
# AH0 maps to schwa while stressed AH1/AH2 map to /ʌ/; all other phonemes
# are keyed stress-free and matched after stripping digits.
IPA_MAP = {'AA': 'ɑ', 'AE': 'æ', 'AH0': 'ə', 'AH1': 'ʌ', 'AH2': 'ʌ', 'AO': 'ɔ',
           'AW': 'aʊ', 'AY': 'aɪ', 'EH': 'ɛ', 'ER': 'ɝ', 'EY': 'eɪ', 'IH': 'ɪ',
           'IY': 'i', 'OW': 'oʊ', 'OY': 'ɔɪ', 'UH': 'ʊ', 'UW': 'u',
           'P': 'p', 'B': 'b', 'T': 't', 'D': 'd', 'K': 'k', 'G': 'g',
           'M': 'm', 'N': 'n', 'NG': 'ŋ', 'F': 'f', 'V': 'v', 'TH': 'θ',
           'DH': 'ð', 'S': 's', 'Z': 'z', 'SH': 'ʃ', 'ZH': 'ʒ', 'HH': 'h',
           'CH': 'ʧ', 'JH': 'ʤ', 'L': 'l', 'R': 'ɹ', 'Y': 'j', 'W': 'w'}

VOWELS_ONLY = set(VOWEL_BASE.keys())


def phonological_divide(ipa_units):
    """Insert '-' syllable breaks into a word's IPA unit list.

    A break is placed before the sonority minimum between each pair of
    consecutive vowel nuclei (glides 'j'/'w' do not count as nuclei).
    Returns the joined IPA string; words with fewer than two nuclei are
    returned undivided.
    """
    if not ipa_units:
        return ""
    GLIDES = {'j', 'w'}
    v_indices = [i for i, u in enumerate(ipa_units)
                 if u in VOWELS_ONLY and u not in GLIDES]
    if len(v_indices) < 2:
        return "".join(ipa_units)
    breaks = []
    for v_pair in range(len(v_indices) - 1):
        start, end = v_indices[v_pair] + 1, v_indices[v_pair + 1]
        best_break, min_score = start, 99
        # <= keeps the RIGHTMOST minimum, maximizing the onset of the
        # following syllable; the range includes `end` so adjacent vowels
        # break directly between the two nuclei.
        for i in range(start, end + 1):
            score = SONORITY.get(ipa_units[i], 1)
            if score <= min_score:
                min_score, best_break = score, i
        breaks.append(best_break)
    result = []
    for idx, unit in enumerate(ipa_units):
        if idx in breaks:
            result.append("-")
        result.append(unit)
    return "".join(result)


def calculate_vks_v3_full(ipa_seq):
    """Return the Vowel Komplexity Score for an IPA sequence.

    Each vowel contributes its VOWEL_BASE value scaled by contextual
    multipliers (ANNEX A 2.A interdental rules plus liquid/rhotic
    constraints). Non-vowel units contribute nothing.
    """
    total_vks = 0.0
    for i, p in enumerate(ipa_seq):
        if p not in VOWEL_BASE:
            continue
        base, mult = VOWEL_BASE[p], 1.0
        prev = ipa_seq[i - 1] if i > 0 else None
        nxt = ipa_seq[i + 1] if i < len(ipa_seq) - 1 else None
        # ANNEX A 2.A: INTERDENTAL RULES
        if prev in ['θ', 'ð']:
            if p in BACK_V:
                mult *= 2.0   # Inhibitory: Tongue retraction
            elif p in FRONT_V:
                mult *= 0.5   # Facilitative: Forward posture
        # Standard Liquid/Rhotic constraints
        if nxt in ['l', 'ɹ', 'ɝ']:
            mult *= 2.0
        if prev in ['ɹ', 'l'] and p not in {'ə', 'ʌ', 'ɝ'}:
            mult *= 2.0
        total_vks += base * mult
    return total_vks


def analyze_single_word_v3(word_text):
    """Analyze one orthographic word.

    Returns a dict with the syllable-divided IPA string, the numeric
    CAS/CPS/VKS subtotals, and per-syllable shape/consonant/vowel data,
    or None when the word is absent from the CMU dictionary.
    """
    clean_w = re.sub(r'[^a-z]', '', word_text.replace('-', '').lower())
    if clean_w not in cmu_dict:
        return None
    # First pronunciation variant only; try the stress-marked ARPAbet
    # symbol first (AH0/AH1/AH2), then the digit-stripped symbol.
    phonemes = cmu_dict[clean_w][0]
    ipa_seq = [IPA_MAP.get(ph, IPA_MAP.get(re.sub(r'\d', '', ph), ph.lower()))
               for ph in phonemes]
    divided = phonological_divide(ipa_seq)
    w_numeric = {'cas': sum(CAS_MAP.get(p, 0) for p in ipa_seq if p in CAS_MAP),
                 'cps': sum(CPS_MAP.get(p, 0) for p in ipa_seq if p in CPS_MAP),
                 'vks': calculate_vks_v3_full(ipa_seq)}
    sylls = []
    for syll in divided.split('-'):
        # Re-tokenize the syllable string greedily, trying 2-char IPA
        # units (diphthongs/affricate symbols) before single chars.
        s_phonemes = []
        idx = 0
        while idx < len(syll):
            found = False
            for length in [2, 1]:
                chunk = syll[idx:idx + length]
                if chunk in SONORITY:
                    s_phonemes.append(chunk)
                    idx += length
                    found = True
                    break
            if not found:
                idx += 1
        # SHAPE FIX: Treat glides j and w as C for the Shape report string.
        shape = "".join("C" if (p in CAS_MAP or p in {'j', 'w'}) else "V"
                        for p in s_phonemes if p in SONORITY)
        sylls.append({
            'shape': shape,
            'c': [p for p in s_phonemes if (p in CAS_MAP or p in {'j', 'w'})],
            'v': [p for p in s_phonemes
                  if p in VOWELS_ONLY and p not in {'j', 'w'}],
        })
    return {'ipa': divided, 'numeric': w_numeric, 'sylls': sylls}


def analyze_phrase_unit_v3(full_phrase_text):
    """Score a whole phrase (one or more words) with the TMS v.3 metric.

    Sums the per-word CAS/CPS/VKS subtotals, then adds a syllable
    transition score (ST) over consecutive syllable pairs. Returns a
    report dict, or None when no word could be transcribed.
    """
    words = full_phrase_text.split()
    phrase_ipa, all_sylls = [], []
    total_numeric = {'cas': 0, 'cps': 0, 'vks': 0, 'st': 0}
    for w in words:
        data = analyze_single_word_v3(w)
        if data:
            phrase_ipa.append(data['ipa'])
            total_numeric['cas'] += data['numeric']['cas']
            total_numeric['cps'] += data['numeric']['cps']
            total_numeric['vks'] += data['numeric']['vks']
            all_sylls.extend(data['sylls'])
    if not phrase_ipa:
        return None
    if len(all_sylls) > 1:
        for i in range(1, len(all_sylls)):
            prev, curr = all_sylls[i - 1], all_sylls[i]
            # 0.5 when only consonants OR only vowels change, 1.0 when both change.
            if prev['c'] != curr['c'] or prev['v'] != curr['v']:
                total_numeric['st'] += (
                    0.5 if (prev['c'] == curr['c'] or prev['v'] == curr['v'])
                    else 1.0)
            if prev['shape'] != curr['shape']:
                total_numeric['st'] += 0.5
            # Diphthong followed by a liquid/rhotic syllable adds a penalty.
            if (prev['v'] and prev['v'][-1] in ['aɪ', 'aʊ', 'ɔɪ', 'eɪ', 'oʊ']
                    and any(p in ['ɹ', 'ɝ', 'l'] for p in curr['c'] + curr['v'])):
                total_numeric['st'] += 0.75
    shapes = [s['shape'] for s in all_sylls]
    return {'ipa': " ".join(phrase_ipa),
            'tms': round(sum(total_numeric.values()), 2),
            'shape': "-".join(shapes),
            # Gr.2 = contains a consonant cluster anywhere in the phrase.
            'group': "Gr.2" if "CC" in "".join(shapes) else "Gr.1"}


def process_document():
    """Read INPUT_FILE, score every comma-separated item, write OUTPUT_FILE.

    Header paragraphs (containing "Articulation" or "Word List") and items
    shorter than 2 characters are skipped; results are sorted by ascending
    TMS before being written out at 9 pt.
    """
    doc_in = docx.Document(INPUT_FILE)
    results = []
    for para in doc_in.paragraphs:
        line = para.text.strip()
        if not line or any(h in line for h in ["Articulation", "Word List"]):
            continue
        for item in [i.strip() for i in line.split(',')]:
            if len(item) < 2:
                continue
            data = analyze_phrase_unit_v3(item)
            if data:
                data['original'] = item
                results.append(data)
    results.sort(key=itemgetter('tms'))
    rep = docx.Document()
    for i, d in enumerate(results):
        p = rep.add_paragraph(
            f"{i+1} | {d['original']} | /{d['ipa']}/ | TMS: {d['tms']} | "
            f"{d['group']} | Shape: {d['shape']}")
        p.runs[0].font.size = Pt(9)
    rep.save(OUTPUT_FILE)


if __name__ == "__main__":
    process_document()