import docx
from docx.shared import Pt
import nltk
from nltk.corpus import cmudict
import re
from operator import itemgetter

# ==========================================
# 1. SETUP & TMS v.3 DATA MAPS (SOUND: K)
# ==========================================
TARGET_SOUND = 'k'
INPUT_FILE = 'K.docx'
OUTPUT_FILE = 'K-TMS-v3-FINAL.docx'

# Load the CMU Pronouncing Dictionary, downloading it on first use.
try:
    cmu_dict = cmudict.dict()
except LookupError:
    nltk.download('cmudict')
    cmu_dict = cmudict.dict()

# Consonant scoring maps: CAS and CPS values per IPA consonant.
CAS_MAP = {
    'b': 1, 'p': 1, 'm': 1, 'n': 1, 'w': 1, 'h': 1,
    't': 2, 'd': 2, 'k': 2, 'g': 2, 'f': 2, 'ŋ': 2, 'j': 2,
    'v': 3, 's': 3, 'z': 3, 'ʧ': 3, 'ʤ': 3,
    'l': 4, 'ɹ': 4, 'ʃ': 4, 'ʒ': 4, 'θ': 4, 'ð': 4
}
CPS_MAP = {
    'b': 1, 'p': 1, 'm': 1, 'w': 1,
    't': 2, 'd': 2, 'n': 2, 's': 2, 'z': 2, 'l': 2, 'ɹ': 2, 'f': 2, 'v': 2, 'θ': 2, 'ð': 2,
    'k': 3, 'g': 3, 'ŋ': 3, 'ʃ': 3, 'ʒ': 3, 'j': 3, 'h': 3, 'ʧ': 3, 'ʤ': 3
}

# Base vowel scores and vowel feature classes used by the VKS rules.
VOWEL_BASE = {
    'ɑ': 1, 'ə': 1,
    'i': 2, 'u': 2,
    'o': 3, 'e': 3, 'æ': 3, 'ɔ': 3,
    'ɪ': 4, 'ɛ': 4, 'ʊ': 4, 'ʌ': 4, 'eɪ': 4, 'oʊ': 4,
    'aɪ': 5, 'aʊ': 5, 'ɔɪ': 5, 'ɝ': 5, 'j': 5, 'w': 5
}
CENTRAL_V = {'ə', 'ʌ', 'ɝ'}
FRONT_V = {'i', 'e', 'eɪ', 'æ', 'ɪ', 'ɛ'}
BACK_V = {'u', 'o', 'oʊ', 'ɑ', 'ɔ', 'ʊ'}
ROUND_V = {'u', 'o', 'oʊ', 'ɔ'}
LOW_V = {'ɑ', 'æ'}
SPREAD_V = {'i', 'e', 'eɪ', 'æ'}
LAX_FRONT = {'ɪ', 'ɛ'}

# ARPAbet (cmudict) to IPA mapping.
IPA_MAP = {
    'AA': 'ɑ', 'AE': 'æ', 'AH0': 'ə', 'AH1': 'ʌ', 'AH2': 'ʌ', 'AO': 'ɔ', 'AW': 'aʊ', 'AY': 'aɪ',
    'EH': 'ɛ', 'ER': 'ɝ', 'EY': 'eɪ', 'IH': 'ɪ', 'IY': 'i', 'OW': 'oʊ', 'OY': 'ɔɪ',
    'UH': 'ʊ', 'UW': 'u', 'P': 'p', 'B': 'b', 'T': 't', 'D': 'd', 'K': 'k', 'G': 'g',
    'M': 'm', 'N': 'n', 'NG': 'ŋ', 'F': 'f', 'V': 'v', 'TH': 'θ', 'DH': 'ð', 'S': 's',
    'Z': 'z', 'SH': 'ʃ', 'ZH': 'ʒ', 'HH': 'h', 'CH': 'ʧ', 'JH': 'ʤ', 'L': 'l', 'R': 'ɹ',
    'Y': 'j', 'W': 'w'
}

# Sonority scale used to place syllable breaks (higher = more sonorous).
SONORITY = {
    'p': 1, 'b': 1, 't': 1, 'd': 1, 'k': 1, 'g': 1, 'ʧ': 1, 'ʤ': 1,
    'f': 2, 'v': 2, 'θ': 2, 'ð': 2, 's': 2, 'z': 2, 'ʃ': 2, 'ʒ': 2, 'h': 2,
    'm': 3, 'n': 3, 'ŋ': 3,
    'l': 4, 'ɹ': 4,
    'j': 5, 'w': 5,
    'ɪ': 6, 'ɛ': 6, 'æ': 6, 'ʊ': 6, 'ə': 6, 'ʌ': 6,
    'i': 7, 'eɪ': 7, 'ɑ': 7, 'ɔ': 7, 'oʊ': 7, 'u': 7, 'aɪ': 7, 'aʊ': 7, 'ɔɪ': 7, 'ɝ': 7
}
VOWELS_ONLY = set(VOWEL_BASE.keys())


def phonological_divide(ipa_units):
    """Insert '-' syllable breaks at the sonority minimum between each vowel pair."""
    if not ipa_units:
        return ""
    GLIDES = {'j', 'w'}
    v_indices = [i for i, u in enumerate(ipa_units) if u in VOWELS_ONLY and u not in GLIDES]
    if len(v_indices) < 2:
        return "".join(ipa_units)
    breaks = []
    for v_pair in range(len(v_indices) - 1):
        start, end = v_indices[v_pair] + 1, v_indices[v_pair + 1]
        best_break, min_score = start, 99
        for i in range(start, end + 1):
            score = SONORITY.get(ipa_units[i], 1)
            if score <= min_score:
                min_score, best_break = score, i
        breaks.append(best_break)
    result = []
    for idx, unit in enumerate(ipa_units):
        if idx in breaks:
            result.append("-")
        result.append(unit)
    return "".join(result)


def calculate_vks_v3_full(ipa_seq):
    """Sum the vowel scores of ipa_seq: each vowel's base value scaled by contextual rules."""
    total_vks = 0.0
    for i, p in enumerate(ipa_seq):
        if p not in VOWEL_BASE:
            continue
        base = VOWEL_BASE[p]
        mult = 1.0
        prev = ipa_seq[i - 1] if i > 0 else None
        nxt = ipa_seq[i + 1] if i < len(ipa_seq) - 1 else None
        # 1. Lip Group
        if prev in ['f', 'v']:
            if p in ROUND_V:
                mult *= 2.0
            elif p in SPREAD_V:
                mult *= 0.5
        if prev in ['p', 'b', 'm'] and p in CENTRAL_V:
            if nxt not in ['l', 'ɹ', 'ɝ']:
                mult *= 0.5
        if prev == 'w':
            if p in ROUND_V:
                mult *= 0.5
            if p in SPREAD_V:
                mult *= 2.0
        # 2. Tongue Tip Group
        if prev in ['θ', 'ð', 't', 'd', 'n', 's', 'z']:
            if p in BACK_V:
                mult *= 2.0
            elif p in ['i', 'e']:
                mult *= 0.5
        if nxt in ['θ', 'ð', 't', 'd', 'n', 's', 'z'] and p in LAX_FRONT:
            mult *= 0.5
        # 3. Tongue Body Group (Velar Focus)
        if prev in ['ʃ', 'ʒ', 'ʧ', 'ʤ', 'j']:
            if p in LOW_V:
                mult *= 2.0
            elif p in ['i', 'u'] or p in ROUND_V:
                mult *= 0.5
        if prev in ['k', 'g', 'ŋ']:
            if p in FRONT_V:
                mult *= 2.0   # Tongue Fronting Conflict (Annex A 3.B)
            elif p in BACK_V:
                mult *= 0.5   # Facilitative
        # 4. Liquid Group
        if nxt in ['l', 'ɹ', 'ɝ']:
            mult *= 2.0
        if prev in ['ɹ', 'l'] and p in ['ɪ', 'ɛ', 'ʊ', 'ʌ', 'æ', 'ɔ']:
            mult *= 2.0
        if prev == 'ɹ' and p in CENTRAL_V:
            mult *= 0.5
        # 5. Resonance Rule
        if (prev in ['m', 'n', 'ŋ'] and nxt in ['p', 'b', 't', 'd', 'k', 'g', 'ʧ', 'ʤ']) or \
           (nxt in ['m', 'n', 'ŋ'] and prev in ['p', 'b', 't', 'd', 'k', 'g', 'ʧ', 'ʤ']):
            mult *= 2.0
        total_vks += base * mult
    return total_vks
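

# Quick illustration (a minimal sketch, not part of the original pipeline):
# the velar rule above doubles a front vowel's score after /k/, /g/ or /ŋ/.
# For "key" -> ['k', 'i'], /i/ has a base score of 2, so:
#
#     >>> calculate_vks_v3_full(['k', 'i'])
#     4.0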
def analyze_single_word_v3(word_text):
    """Analyze one word via cmudict; return IPA division, scores and syllable data, or None."""
    clean_w = re.sub(r'[^a-z]', '', word_text.replace('-', '').lower())
    if clean_w in cmu_dict:
        phonemes = cmu_dict[clean_w][0]
        ipa_seq = [IPA_MAP.get(ph, IPA_MAP.get(re.sub(r'\d', '', ph), ph.lower())) for ph in phonemes]
        divided = phonological_divide(ipa_seq)
        w_numeric = {'cas': 0, 'cps': 0, 'vks': calculate_vks_v3_full(ipa_seq)}
        syll_data = []
        for p in ipa_seq:
            if p in CAS_MAP:
                w_numeric['cas'] += CAS_MAP[p]
                w_numeric['cps'] += CPS_MAP[p]
        for syll in divided.split('-'):
            # Re-tokenize the syllable string, preferring two-character IPA units (diphthongs).
            s_phonemes = []
            i = 0
            while i < len(syll):
                found = False
                for length in [2, 1]:
                    chunk = syll[i:i + length]
                    if chunk in SONORITY:
                        s_phonemes.append(chunk)
                        i += length
                        found = True
                        break
                if not found:
                    i += 1
            # --- SHAPE FIX: Treat j and w as C for the Shape report string ---
            shape = "".join(["C" if (p in CAS_MAP or p in {'j', 'w'}) else "V"
                             for p in s_phonemes if p in SONORITY])
            sylls_info = {
                'shape': shape,
                'c': [p for p in s_phonemes if (p in CAS_MAP or p in {'j', 'w'})],
                'v': [p for p in s_phonemes if p in VOWELS_ONLY and p not in {'j', 'w'}]
            }
            syll_data.append(sylls_info)
        return {'ipa': divided, 'numeric': w_numeric, 'sylls': syll_data}
    return None


def analyze_phrase_unit_v3(full_phrase_text):
    """Analyze a word or phrase: sum per-word scores, add syllable-transition (ST) points, return TMS data."""
    words = full_phrase_text.split()
    phrase_ipa = []
    total_numeric = {'cas': 0, 'cps': 0, 'vks': 0, 'st': 0}
    all_sylls_data = []
    for w in words:
        data = analyze_single_word_v3(w)
        if data:
            phrase_ipa.append(data['ipa'])
            total_numeric['cas'] += data['numeric']['cas']
            total_numeric['cps'] += data['numeric']['cps']
            total_numeric['vks'] += data['numeric']['vks']
            all_sylls_data.extend(data['sylls'])
    if not phrase_ipa:
        return None
    if len(all_sylls_data) > 1:
        # Syllable-transition (ST) points between adjacent syllables.
        for i in range(1, len(all_sylls_data)):
            prev, curr = all_sylls_data[i - 1], all_sylls_data[i]
            if prev['c'] == curr['c'] and prev['v'] == curr['v']:
                pass  # identical (reduplicated) syllable: no transition cost
            elif prev['c'] == curr['c'] or prev['v'] == curr['v']:
                total_numeric['st'] += 0.5
            else:
                total_numeric['st'] += 1.0
            if prev['shape'] != curr['shape']:
                total_numeric['st'] += 0.5
            if prev['v'] and prev['v'][-1] in ['aɪ', 'aʊ', 'ɔɪ', 'eɪ', 'oʊ']:
                if any(p in ['ɹ', 'ɝ', 'l'] for p in curr['c'] + curr['v']):
                    total_numeric['st'] += 0.75
    tms = round(total_numeric['cas'] + total_numeric['cps'] + total_numeric['vks'] + total_numeric['st'], 2)
    full_shape_str = "-".join([s['shape'] for s in all_sylls_data])
    return {
        'ipa': " ".join(phrase_ipa),
        'tms': tms,
        'shape': full_shape_str,
        'group': "Gr.2" if "CC" in full_shape_str.replace('-', '') else "Gr.1"
    }
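

# Optional spot check (illustrative helper, not part of the original pipeline;
# the name `_spot_check` and the sample items below are assumptions). It prints
# the phrase-level result for a few items; the exact IPA, TMS and shape values
# depend on the cmudict entries available at runtime.
def _spot_check(samples=("cake", "candy cane")):
    for item in samples:
        data = analyze_phrase_unit_v3(item)
        if data:
            print(f"{item}: /{data['ipa']}/ | TMS {data['tms']} | {data['group']} | {data['shape']}")
        else:
            print(f"{item}: no cmudict entry found")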
def process_document():
    """Read items from INPUT_FILE, score each phrase, and write a TMS-sorted report to OUTPUT_FILE."""
    doc_in = docx.Document(INPUT_FILE)
    results = []
    headers = ["Articulation", "Word List", "Syllable Count", "Position", "Words with"]
    for para in doc_in.paragraphs:
        line = para.text.strip()
        # Skip blank lines, table headers, and label-only lines (':' present but no comma list).
        if not line or any(h in line for h in headers) or (':' in line and ',' not in line):
            continue
        for item in [i.strip() for i in line.split(',')]:
            if len(item) < 2:
                continue
            data = analyze_phrase_unit_v3(item)
            if data:
                data['original'] = item
                results.append(data)
    results.sort(key=itemgetter('tms'))
    rep = docx.Document()
    for i, d in enumerate(results):
        line = f"{i + 1} | {d['original']} | /{d['ipa']}/ | TMS: {d['tms']} | {d['group']} | Shape: {d['shape']}"
        p = rep.add_paragraph(line)
        p.runs[0].font.size = Pt(9)
    rep.save(OUTPUT_FILE)


if __name__ == "__main__":
    process_document()
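
# Usage note (a sketch; package names inferred from the imports above):
#   pip install python-docx nltk
# Running this script reads INPUT_FILE (K.docx) from the working directory and
# writes the TMS-sorted report to OUTPUT_FILE (K-TMS-v3-FINAL.docx). The cmudict
# corpus is downloaded automatically on the first run if it is missing.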