Skip to content

fixed phonemizer for language #131

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 20, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 42 additions & 37 deletions src/libs/transformers/utils/phonemize.ts
Original file line number Diff line number Diff line change
Expand Up @@ -243,15 +243,14 @@ const HINDI_PHONEME_MAP: { [key: string]: string } = {
};

// Add function to detect script
/**
 * Reports whether the given text contains at least one code point from the
 * Unicode range U+0900–U+097F.
 *
 * @param text - The string to inspect.
 * @returns `true` if any character falls inside that range, otherwise `false`
 *          (including for the empty string).
 */
function isDevanagari(text: string): boolean {
  // Non-global regex: `.test` carries no `lastIndex` state between calls.
  const devanagariRange = /[\u0900-\u097F]/;
  const found = devanagariRange.test(text);
  return found;
}
// function isDevanagari(text: string): boolean {
// return /[\u0900-\u097F]/.test(text);
// }

// Add Spanish phoneme detection
/**
 * Heuristic check for Spanish-looking text.
 *
 * Matches, case-insensitively, any of the characters `á é í ó ú ñ ¿ ¡` or the
 * digraph `ll` anywhere in the input.
 *
 * NOTE(review): because a bare `ll` is enough to match, non-Spanish words such
 * as "hello" are also reported as Spanish.
 *
 * @param text - The string to inspect.
 * @returns `true` if any of the markers above occurs in `text`, otherwise
 *          `false` (including for the empty string).
 */
function isSpanish(text: string): boolean {
  // Accented vowels, ñ, inverted punctuation, or the "ll" digraph.
  const spanishMarkers = /[áéíóúñ¿¡]|ll|ñ/i;
  const looksSpanish = spanishMarkers.test(text);
  return looksSpanish;
}
// function isSpanish(text: string): boolean {
// return /[áéíóúñ¿¡]|ll|ñ/i.test(text);
// }

// Add Spanish phoneme mappings
const SPANISH_PHONEME_MAP: { [key: string]: string } = {
Expand Down Expand Up @@ -323,9 +322,17 @@ export async function phonemize(text: string, language = "a", norm = true) {
text = normalize_text(text);
}

// 2. Check script type
const hasDevanagari = isDevanagari(text);
const hasSpanish = isSpanish(text);
// 2. Map language codes to processing types
const languageMap: { [key: string]: string } = {
'a': 'en-us', // American English
'b': 'en', // British English
'h': 'hindi', // Hindi
'e': 'spanish', // Spanish
'f': 'french', // French
'z': 'chinese' // Chinese
};

const targetLanguage = languageMap[language] || 'en-us';

// 3. Split into chunks, to ensure we preserve punctuation
const sections = split(text, PUNCTUATION_PATTERN);
Expand All @@ -335,35 +342,33 @@ export async function phonemize(text: string, language = "a", norm = true) {
sections.map(async ({ match, text }) => {
if (match) return text;

if (hasDevanagari) {
// Process text in chunks to maintain syllable structure
return text.split(/(?=[क-ह])/)
.map(chunk => processHindiSyllable(
Array.from(chunk)
.map(char => HINDI_PHONEME_MAP[char] || char)
.join('')
))
.join('');
} else if (hasSpanish) {
// Handle Spanish text
let result = text.toLowerCase();
switch (targetLanguage) {
case 'hindi':
// console.log('Hindi phonemization');
return text.split(/(?=[क-ह])/)
.map(chunk => processHindiSyllable(
Array.from(chunk)
.map(char => HINDI_PHONEME_MAP[char] || char)
.join('')
))
.join('');

// Handle special cases first
result = result
.replace(/ch/g, 'tʃ')
.replace(/ll/g, 'j')
.replace(/rr/g, 'r')
// Add rule for 'c' before 'i' and 'e'
.replace(/c([ie])/g, 's$1'); // Use 's' for Latin American Spanish
case 'spanish':
// console.log('Spanish phonemization');
let result = text.toLowerCase();
result = result
.replace(/ch/g, 'tʃ')
.replace(/ll/g, 'j')
.replace(/rr/g, 'r')
.replace(/c([ie])/g, 's$1');

return Array.from(result)
.map(char => SPANISH_PHONEME_MAP[char] || char)
.join('');

// Then handle individual characters
return Array.from(result)
.map(char => SPANISH_PHONEME_MAP[char] || char)
.join('');
} else {
// Use existing English phonemization
const lang = language === "a" ? "en-us" : "en";
return (await espeakng(text, lang)).join(" ");
default: // en-us or en
// console.log('Default phonemization');
return (await espeakng(text, targetLanguage)).join(" ");
}
})
)).join("");
Expand Down