-- Documentation for this module may be created at မဝ်ဂျူ:se-IPA/doc

-- Table of functions exported by this module; it is returned at the end of the file.
local export = {}

-- Language object for the code "se"; passed to the IPA and category formatters in export.IPA.
local lang = require("Module:languages").getByCode("se")

-- Combining breve (U+0306). Appended to vowel spellings to form the "shortened" keys
-- of letters_phonemes and by shorten(); stripped again by lengthen().
local BREVE = mw.ustring.char(0x0306)


-- Lookup table from spelling sequences to IPA strings.
-- The keys double as the token inventory for get_phoneme(), which always picks the
-- longest key that prefixes the remaining text; the values are substituted in to_IPA().
local letters_phonemes = {
	-- The overlength mark on its own is rendered as an IPA length mark
	["ˈ"] = "ː",
	
	-- Single vowels and consonants whose IPA differs from the spelling
	["a"] = "a", ["ạ"] = "a",
	["á"] = "aː", ["áˈ"] = "aˑ", ["á" .. BREVE ] = "a",
	["b"] = "b",
	["c"] = "t͡s",
	["č"] = "t͡ʃ",
	["d"] = "d",
	["đ"] = "ð",
	["ẹ"] = "e", ["ē"] = "eː",
	["g"] = "ɡ",
	["ī"] = "iː",
	["kh"] = "kʰ",
	["ọ"] = "o", ["ō"] = "oː",
	["ph"] = "pʰ",
	["š"] = "ʃ",
	["th"] = "tʰ",
	["ŧ"] = "θ",
	["ū"] = "uː",
	["z"] = "d͡z",
	["ž"] = "d͡ʒ",
	
	-- Diphthongs: plain, followed by "ˈ" (as produced by shift()), and with a breve (as produced by shorten())
	["ea"] = "ea̯", ["eaˈ"] = "e̯a", ["ea" .. BREVE] = "ĕă̯",
	["ie"] = "ie̯", ["ieˈ"] = "i̯e", ["ie" .. BREVE] = "ĭĕ̯",
	["oa"] = "oɑ̯", ["oaˈ"] = "o̯ɑ", ["oa" .. BREVE] = "ŏɑ̯̆",
	["uo"] = "uo̯", ["uoˈ"] = "u̯o", ["uo" .. BREVE] = "ŭŏ̯",
	
	-- Digraphs ending in j
	["dj"] = "ɟ",
	["lj"] = "ʎ",
	["nj"] = "ɲ",
	
	-- h + sonorant: single, doubled, and doubled with the overlength mark
	["hj"] = "j̥", ["hjj"] = "j̥.j̥", ["hjˈj"] = "j̥ː.j̥",
	["hl"] = "l̥", ["hll"] = "l̥.l̥", ["hlˈl"] = "l̥ː.l̥",
	["hm"] = "m̥", ["hmm"] = "m̥.m̥", ["hmˈm"] = "m̥ː.m̥",
	["hn"] = "n̥", ["hnn"] = "n̥.n̥", ["hnˈn"] = "n̥ː.n̥",
	["hr"] = "r̥", ["hrr"] = "r̥.r̥", ["hrˈr"] = "r̥ː.r̥",
}


--	This adds letters_phonemes["e"] = "e", letters_phonemes["i"] = "i", etc.
--	Each character of the string below is mapped to itself.
for letter in mw.ustring.gmatch("efhijklmnŋoprstuv", ".") do
	letters_phonemes[letter] = letter
end

-- Preaspirated
-- For each of p, t, c, č, k this adds an "h" + letter key and an "h" + doubled-letter key;
-- the doubled spelling maps to "hː" followed by the letter's phoneme.
-- This loop relies on the entries for p, t and k having been added by the loop above.
for letter in mw.ustring.gmatch("ptcčk", ".") do
	letters_phonemes["h" .. letter] = "h" .. letters_phonemes[letter]
	letters_phonemes["h" .. letter .. letter] = "hː" .. letters_phonemes[letter]
end


-- Split one token off the front of `remainder`.
-- The token is the longest key of letters_phonemes that is a prefix of `remainder`;
-- if no key matches, the token is simply the first character.
-- Returns two values: the token, and the text that follows it.
local function get_phoneme(remainder)
	local ulen, usub = mw.ustring.len, mw.ustring.sub
	local best, best_len = "", 0
	
	for candidate in pairs(letters_phonemes) do
		local candidate_len = ulen(candidate)
		
		-- Only a strictly longer key can replace the current best match
		if candidate_len > best_len and usub(remainder, 1, candidate_len) == candidate then
			best, best_len = candidate, candidate_len
		end
	end
	
	if best_len == 0 then
		-- Unknown character: pass it through as a one-character token
		return usub(remainder, 1, 1), usub(remainder, 2)
	end
	
	return best, usub(remainder, best_len + 1)
end


-- Read one syllable (a run of consonant tokens followed by at most one vowel token)
-- from the front of `remainder`.
-- Returns two values:
--   * a table {cons = {...}, vowel = "..."} where cons is the list of consonant tokens
--     and also carries a `quantity` field (1, 2 or 3);
--   * the unread text, or nil once the input is exhausted.
local function get_syllable(remainder)
	local syll = {cons = {}, vowel = ""}
	local onset = syll.cons
	
	-- Consume consonant tokens for as long as the text does not start with a vowel letter
	while mw.ustring.find(remainder, "^([^aạáeẹēiīoọōuū]+)") do
		local token
		token, remainder = get_phoneme(remainder)
		
		if token == "nˈnj" then
			require("Module:debug").track("se-IPA/nnj")
		end
		
		if token == "ˈ" then
			-- An explicit overlength mark forces quantity 3 and is not stored as a consonant
			onset.quantity = 3
		else
			local previous = onset[#onset]
			
			if token == "dj" or token == "lj" then
				if previous == string.sub(token, 1, 1) then
					-- "ddj"/"llj": fold the preceding d/l into the digraph and mark quantity 3
					onset[#onset] = token
					onset.quantity = 3
				else
					-- Plain "dj"/"lj" is stored twice (here and below)
					onset[#onset + 1] = token
				end
			elseif token == "nj" and previous == "n" then
				-- "nnj": the preceding n becomes a second "nj"
				onset[#onset] = "nj"
			end
			
			onset[#onset + 1] = token
		end
	end
	
	-- At most one vowel token follows the consonants
	if mw.ustring.find(remainder, "^([aạáeẹēiīoọōuū]+)") then
		syll.vowel, remainder = get_phoneme(remainder)
	end
	
	-- Signal end of input to the caller with nil rather than an empty string
	if remainder == "" then
		remainder = nil
	end
	
	-- Work out the consonant quantity unless it was already fixed above
	if not onset.quantity then
		local quantity = 1
		
		if onset[2] then
			local last, prev = onset[#onset], onset[#onset - 1]
			
			-- Doubled final token, or two identical tokens that are not a voiced stop/affricate letter
			local geminate = mw.ustring.find(last, "(.)%1$")
				or (last == prev and not mw.ustring.find(last, "^[bdgzž]$"))
			
			-- The pairs pm, tn, tnj and kŋ
			local stop_plus_nasal = (prev == "p" and last == "m")
				or (prev == "t" and (last == "n" or last == "nj"))
				or (prev == "k" and last == "ŋ")
			
			quantity = (geminate or stop_plus_nasal) and 2 or 3
		end
		
		onset.quantity = quantity
	end
	
	return syll, remainder
end


-- Break a word into a list of syllable tables as produced by get_syllable().
-- The word is lowercased first, and an "i" directly after one of the vowel letters
-- a á e ē i ī o ō u ū is rewritten as "j" so that it is read as a consonant.
-- The returned list also has a `count` field: the number of syllables that contain a
-- vowel (a trailing consonant-only entry is not counted).
local function split_syllables(remainder)
	local text = mw.ustring.lower(remainder)
	text = mw.ustring.gsub(text, "([aáeēiīoōuū])i", "%1j")
	
	local syllables = {}
	
	-- get_syllable() hands back nil as the rest once everything has been consumed
	repeat
		local syll
		syll, text = get_syllable(text)
		syllables[#syllables + 1] = syll
	until not text
	
	local total = #syllables
	
	if syllables[total].vowel == "" then
		syllables.count = total - 1
	else
		syllables.count = total
	end
	
	return syllables
end


-- Return the shortened spelling of a vowel: a lone ē/ī/ō/ū loses its macron, and
-- every occurrence of á, ea, ie, oa or uo gets a combining breve appended.
local function shorten(vowel)
	local without_macron = {["ē"] = "e", ["ī"] = "i", ["ō"] = "o", ["ū"] = "u"}
	local result = mw.ustring.gsub(vowel, "^[ēīōū]$", without_macron)
	
	local breve_targets = {"á", "ea", "ie", "oa", "uo"}
	
	for index = 1, #breve_targets do
		local target = breve_targets[index]
		result = mw.ustring.gsub(result, target, target .. BREVE)
	end
	
	return result
end


-- Return the vowel spelling with "ˈ" appended after every occurrence of
-- á, ea, ie, oa or uo, which selects the corresponding "…ˈ" entry of letters_phonemes.
local function shift(vowel)
	local targets = {"á", "ea", "ie", "oa", "uo"}
	local result = vowel
	
	for index = 1, #targets do
		local target = targets[index]
		result = mw.ustring.gsub(result, target, target .. "ˈ")
	end
	
	return result
end


-- Return the lengthened spelling of a vowel: a lone e/i/o/u gains a macron,
-- and any combining breve is removed.
local function lengthen(vowel)
	local with_macron = {["e"] = "ē", ["i"] = "ī", ["o"] = "ō", ["u"] = "ū"}
	local result = mw.ustring.gsub(vowel, "^[eiou]$", with_macron)
	
	result = mw.ustring.gsub(result, BREVE, "")
	
	return result
end

-- Decide whether the vowel of `syll` should be shortened because of the consonants
-- (and vowel) of the following syllable `nextsyll`.
-- Both arguments are syllable tables with `cons` (list of consonant tokens) and `vowel`.
-- Returns true or false.
local function should_shorten(syll, nextsyll)
	local find = mw.ustring.find
	local first, second = nextsyll.cons[1], nextsyll.cons[2]
	
	-- Nothing to condition on if the next syllable has no consonants
	if not first then
		return false
	end
	
	-- Long preaspirate
	if find(first, "^h([ptcčk])%1$") then
		return true
	end
	
	-- Overlong vowel
	if find(first, "^([đflmnŋrsšv])ˈ%1$") then
		return true
	end
	
	local ie_or_uo = syll.vowel == "ie" or syll.vowel == "uo"
	
	if ie_or_uo and find(nextsyll.vowel, "^[áīū]$") then
		if find(first, "^([bcčdgkptzž])%1$") then
			-- Geminate stop
			return true
		elseif first == "pm" or first == "tn" or first == "tnj" or first == "kŋ" then
			-- Glottalised nasal
			return true
		elseif second and not find(second, "^h[ptcčk]$") then
			-- Clusters, except when the second element is a strong-grade preaspirate
			return true
		end
	elseif ie_or_uo and nextsyll.vowel == "a" then
		if find(first, "^([bdgzž])%1$") then
			-- Geminate voiced stop
			return true
		elseif second
			and not find(second, "(.)%1$")
			and not find(second, "^h[ptcčk]$")
			and second ~= "pm" and second ~= "tn" and second ~= "tnj" and second ~= "kŋ" then
			-- Clusters, except when the second element is long, or a preaspirate, or a preglottalised nasal
			return true
		end
	end
	
	return false
end


-- Normalise the spelling of every syllable, in place, so that each token can later be
-- looked up in letters_phonemes by to_IPA().
--
-- `syllables` is the list returned by split_syllables(): each entry has `cons` (list of
-- consonant tokens, plus a `quantity` field) and `vowel`; the list has a `count` field.
-- Nothing is returned; the syllable tables are modified.
--
-- First pass, per syllable:
--   * a trailing consonant-only entry gets its final t -> ht and d -> t, then the pass stops;
--   * i/u are lengthened in the second syllable of a foot (unless j follows);
--   * a lone k/p/t gets a postaspiration "h";
--   * in clusters, a final voiceless stop/affricate is preaspirated after a voiced
--     consonant, and doubled consonants are simplified after a voiceless one;
--   * voiced stop/affricate letters are rewritten as their voiceless counterparts;
--   * the overlength mark "ˈ" and the secondary stress mark "ˌ" are inserted.
-- Second pass: vowels are shifted (see shift()) when the next syllable has ạ, ẹ or ọ.
local function convert_spelling(syllables)
	-- Position of the current syllable within its foot (1, 2, or 3 for a final odd syllable)
	local foot = 0
	
	for i, syll in ipairs(syllables) do
		if syll.vowel == "" then
			-- Consonant-only entry: adjust its last consonant and stop processing
			if syll.cons[#syll.cons] == "t" then
				syll.cons[#syll.cons] = "ht"
			elseif syll.cons[#syll.cons] == "d" then
				syll.cons[#syll.cons] = "t"
			end
			
			break
		end
		
		-- Empty stand-in so the last syllable can be handled like the others
		local nextsyll = syllables[i + 1] or {cons = {}, vowel = ""}
		
		foot = foot + 1
		
		-- A third syllable starts a new foot unless it is the last syllable with a vowel
		if foot == 3 and i ~= syllables.count then
			foot = 1
		end
		
		-- Make i and u long in even syllables
		if foot == 2 and (syll.vowel == "i" or syll.vowel == "u") and nextsyll.cons[1] ~= "j" then
			syll.vowel = lengthen(syll.vowel)
		end
		
		if #syll.cons == 1 and syll.vowel ~= "" then
			-- Postaspiration
			syll.cons[1] = mw.ustring.gsub(syll.cons[1], "^([kpt])$", "%1h")
		end
		
		if #syll.cons > 1 then
			if syll.cons[#syll.cons] == syll.cons[#syll.cons - 1] and syll.cons[#syll.cons - 2] and mw.ustring.find(syll.cons[#syll.cons - 2], "[cčkpsšt]$") then
				-- Ungeminate last consonant after voiceless
				syll.cons[#syll.cons] = nil
			elseif mw.ustring.find(syll.cons[#syll.cons - 1], "[cčkpsšt]$") then
				-- Ungeminate last consonant after voiceless.
				-- The test must look at the PRECEDING consonant: testing the last consonant
				-- itself made the preaspiration branch below unreachable.
				syll.cons[#syll.cons] = mw.ustring.gsub(syll.cons[#syll.cons], "(.)%1$", "%1")
			else
				-- Preaspirate final voiceless consonant after voiced
				syll.cons[#syll.cons] = mw.ustring.gsub(syll.cons[#syll.cons], "^([cčkpt])$", "h%1")
				syll.cons[#syll.cons] = mw.ustring.gsub(syll.cons[#syll.cons], "^([cčkpt])%1$", "h%1%1")
			end
			
			-- Devoice final geminates
			if syll.cons[#syll.cons] == "bb" then
				syll.cons[#syll.cons] = "pp"
			elseif syll.cons[#syll.cons] == "dd" then
				syll.cons[#syll.cons] = "tt"
			elseif syll.cons[#syll.cons] == "gg" then
				syll.cons[#syll.cons] = "kk"
			elseif syll.cons[#syll.cons] == "zz" then
				syll.cons[#syll.cons] = "cc"
			elseif syll.cons[#syll.cons] == "žž" then
				syll.cons[#syll.cons] = "čč"
			end
		end
		
		-- Devoice remaining single voiced consonants.
		-- A consonant is left alone when the previous token is the same letter, or when it
		-- is the first token and is followed by the same letter (or, for b/d/g, by m/n/nj/ŋ).
		for j, cons in ipairs(syll.cons) do
			if cons == "b" and syll.cons[j - 1] ~= "b" and (j ~= 1 or syll.cons[2] ~= "b" and syll.cons[2] ~= "m") then
				syll.cons[j] = "p"
			elseif cons == "d" and syll.cons[j - 1] ~= "d" and (j ~= 1 or syll.cons[2] ~= "d" and syll.cons[2] ~= "n" and syll.cons[2] ~= "nj") then
				syll.cons[j] = "t"
			elseif cons == "g" and syll.cons[j - 1] ~= "g" and (j ~= 1 or syll.cons[2] ~= "g" and syll.cons[2] ~= "ŋ") then
				syll.cons[j] = "k"
			elseif cons == "z" and syll.cons[j - 1] ~= "z" and (j ~= 1 or syll.cons[2] ~= "z") then
				syll.cons[j] = "c"
			elseif cons == "ž" and syll.cons[j - 1] ~= "ž" and (j ~= 1 or syll.cons[2] ~= "ž") then
				syll.cons[j] = "č"
			end
		end
		
		-- Regularise divergent spellings in clusters
		--if #syll.cons > 2 then
		--	error("Clusters with more than 2 consonants are not yet supported.")
		--end
		
		if foot == 2 and syll.cons.quantity == 3 then
			-- Lengthen initial sonorant in quantity 3
			table.insert(syll.cons, 2, "ˈ")
		end
		
		-- Secondary stress: marked on the first syllable of every foot after the first
		if foot == 1 and i > 1 then
			if #syll.cons == 1 then
				table.insert(syll.cons, 1, "ˌ")
			elseif #syll.cons == 2 then
				table.insert(syll.cons, 2, "ˌ")
			end
		end
	end
	
	-- This needs to be a separate pass because otherwise unstressed ī and ū won't have been lengthened yet
	for i, syll in ipairs(syllables) do
		local nextsyll = syllables[i + 1] or {cons = {}, vowel = ""}
		
	--	if should_shorten(syll, nextsyll) then
	--		syll.vowel = shorten(syll.vowel)
		if mw.ustring.find(nextsyll.vowel, "^[ạẹọ]$") then
			syll.vowel = shift(syll.vowel)
		end
	end
end


-- Dialect-specific conversions, applied in place to every syllable of `syllables`.
-- Nothing is returned.
--
-- Western Finnmark dialect: when a syllable's consonant list ends in ŋ, that ŋ is
-- rewritten as nj, and the consonant before it (skipping an overlength mark "ˈ")
-- is rewritten g -> d, k -> t, ŋ -> nj.
local function dialect(syllables)
	-- Replacement for the consonant preceding the rewritten ŋ
	local before_nj = {["g"] = "d", ["k"] = "t", ["ŋ"] = "nj"}
	
	for _, syll in ipairs(syllables) do
		local cons = syll.cons
		
		-- Western Finnmark dialect
		if cons[1] and cons[#cons] == "ŋ" then
			cons[#cons] = "nj"
			
			-- Index of the consonant before the final one, skipping the overlength mark
			local prev = #cons - 1
			
			if cons[prev] == "ˈ" then
				prev = prev - 1
			end
			
			if cons[prev] then
				-- The same pattern is used with and without "ˈ"; previously the plain case
				-- omitted ŋ from the pattern, so a doubled ŋ became ŋ + nj instead of nj + nj.
				cons[prev] = mw.ustring.gsub(cons[prev], "^[gkŋ]$", before_nj)
			end
		end
	end
end


-- Convert the normalised syllables to a single IPA string.
-- Each consonant token and vowel that has an entry in letters_phonemes is replaced by
-- that entry; unknown tokens are kept as they are. Every entry of `syllables` is
-- overwritten with its IPA string, and the result is the concatenation of all of them
-- prefixed with the primary stress mark.
local function to_IPA(syllables)
	for index, syllable in ipairs(syllables) do
		local segments = syllable.cons
		
		for pos, spelling in ipairs(segments) do
			local phoneme = letters_phonemes[spelling]
			
			if syllable.vowel == "" and spelling == "ht" then
				-- "ht" in a consonant-only entry is shown with an optional t
				segments[pos] = "h(t)"
			elseif phoneme then
				-- The token to compare against is the next one, skipping an overlength mark
				local following
				
				if segments[pos + 1] == "ˈ" then
					following = segments[pos + 2]
				else
					following = segments[pos + 1]
				end
				
				if string.find(phoneme, "͡", nil, true) and spelling == following then
					-- Drop the final part after the tie bar
					segments[pos] = mw.ustring.gsub(phoneme, "͡.*$", "")
				else
					segments[pos] = phoneme
				end
			end
		end
		
		syllable.vowel = letters_phonemes[syllable.vowel] or syllable.vowel
		
		syllables[index] = table.concat(segments) .. syllable.vowel
	end
	
	return "ˈ" .. table.concat(syllables)
end


-- Template entry point.
-- Reads the first positional argument of the parent frame (defaulting to the current
-- page title), converts it to IPA, and returns the formatted qualifier, the formatted
-- IPA transcription and a syllable-count category as one string.
function export.IPA(frame)
	local params = {
		[1] = {default = mw.title.getCurrentTitle().text},
	}
	local parent_args = frame:getParent().args
	local args = require("Module:parameters").process(parent_args, params)
	
	-- Run the pipeline: split, normalise the spelling, apply dialect rules
	local syllables = split_syllables(args[1])
	convert_spelling(syllables)
	dialect(syllables)
	
	local qualifier = require("Module:accent qualifier").format_qualifiers({"ခါတဝ်ခိုၚ်နဝ်"})
	
	local transcription = "/" .. to_IPA(syllables) .. "/"
	local ipa = require("Module:IPA").format_IPA_full(lang, {{pron = transcription}})
	
	-- syllables.count is still available after to_IPA() has overwritten the list entries
	local category = lang:getCanonicalName() .. "မအရေဝ်" .. tostring(syllables.count) .. "ပါင်ဂမၠိုင်"
	local categories = require("Module:utilities").format_categories({category}, lang)
	
	return qualifier .. " " .. ipa .. categories
end

return export