Spaces:
Running
Running
p
committed on
Commit
·
ba0fb36
1
Parent(s):
7b6aa43
enable some langs supported by num2words
Browse files
- app.py +9 -5
- num2words_lang_map.json +29 -0
app.py
CHANGED
|
@@ -47,8 +47,12 @@ lang_codes = {key + " (" + lang_codes[key] + ")": lang_codes[key] for key in lan
|
|
| 47 |
# Extract language names
|
| 48 |
language_names = list(lang_codes.keys())
|
| 49 |
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
-
|
|
|
|
| 52 |
# Find all numbers in the text using regex
|
| 53 |
numbers = re.findall(r"\d+", text)
|
| 54 |
# Sort numbers in descending order of length
|
|
@@ -57,7 +61,7 @@ def convert_eng_numbers_to_words(text):
|
|
| 57 |
|
| 58 |
# Replace numbers with their word equivalents
|
| 59 |
for number in sorted_numbers:
|
| 60 |
-
number_word = num2words(int(number))
|
| 61 |
text = text.replace(number, number_word)
|
| 62 |
|
| 63 |
return text
|
|
@@ -82,9 +86,9 @@ def prepare_sentences(text, lang="mya"):
|
|
| 82 |
text = convert_mya_numbers_to_words(text)
|
| 83 |
text = text.replace("\u104A", ",").replace("\u104B", ".")
|
| 84 |
|
| 85 |
-
if lang
|
| 86 |
-
|
| 87 |
-
|
| 88 |
print("Processed text", text)
|
| 89 |
|
| 90 |
paragraphs = [paragraph for paragraph in text.split("\n") if paragraph.strip()]
|
|
|
|
| 47 |
# Extract language names
|
| 48 |
language_names = list(lang_codes.keys())
|
| 49 |
|
| 50 |
+
# Load num2words_lang_map
|
| 51 |
+
with open("num2words_lang_map.json") as f:
|
| 52 |
+
num2words_lang_map = json.load(f, object_pairs_hook=OrderedDict)
|
| 53 |
|
| 54 |
+
|
| 55 |
+
def convert_numbers_to_words_num2words(text, lang):
|
| 56 |
# Find all numbers in the text using regex
|
| 57 |
numbers = re.findall(r"\d+", text)
|
| 58 |
# Sort numbers in descending order of length
|
|
|
|
| 61 |
|
| 62 |
# Replace numbers with their word equivalents
|
| 63 |
for number in sorted_numbers:
|
| 64 |
+
number_word = num2words(int(number), lang=num2words_lang_map[lang][0])
|
| 65 |
text = text.replace(number, number_word)
|
| 66 |
|
| 67 |
return text
|
|
|
|
| 86 |
text = convert_mya_numbers_to_words(text)
|
| 87 |
text = text.replace("\u104A", ",").replace("\u104B", ".")
|
| 88 |
|
| 89 |
+
if lang in num2words_lang_map:
|
| 90 |
+
print("num2words supports this lang", lang)
|
| 91 |
+
text = convert_numbers_to_words_num2words(text, lang)
|
| 92 |
print("Processed text", text)
|
| 93 |
|
| 94 |
paragraphs = [paragraph for paragraph in text.split("\n") if paragraph.strip()]
|
num2words_lang_map.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"eng": ["en", "English, default"],
|
| 3 |
+
"amh": ["am", "Amharic"],
|
| 4 |
+
"ara": ["ar", "Arabic"],
|
| 5 |
+
"deu": ["de", "German"],
|
| 6 |
+
"spa": ["es", "Spanish"],
|
| 7 |
+
"fas": ["fa", "Farsi"],
|
| 8 |
+
"fin": ["fi", "Finnish"],
|
| 9 |
+
"fra": ["fr", "French"],
|
| 10 |
+
"heb": ["he", "Hebrew"],
|
| 11 |
+
"hun": ["hu", "Hungarian"],
|
| 12 |
+
"ind": ["id", "Indonesian"],
|
| 13 |
+
"isl": ["is", "Icelandic"],
|
| 14 |
+
"kan": ["kn", "Kannada"],
|
| 15 |
+
"kor": ["ko", "Korean"],
|
| 16 |
+
"kaz": ["kz", "Kazakh"],
|
| 17 |
+
"lav": ["lv", "Latvian"],
|
| 18 |
+
"pol": ["pl", "Polish"],
|
| 19 |
+
"swe": ["sv", "Swedish"],
|
| 20 |
+
"ron": ["ro", "Romanian"],
|
| 21 |
+
"rus": ["ru", "Russian"],
|
| 22 |
+
"tel": ["te", "Telugu"],
|
| 23 |
+
"tgk": ["tg", "Tajik"],
|
| 24 |
+
"tur": ["tr", "Turkish"],
|
| 25 |
+
"tha": ["th", "Thai"],
|
| 26 |
+
"vie": ["vi", "Vietnamese"],
|
| 27 |
+
"nld": ["nl", "Dutch"],
|
| 28 |
+
"ukr": ["uk", "Ukrainian"]
|
| 29 |
+
}
|