Odoo18-Base/odoo/tools/arabic_reshaper/__init__.py
2025-01-06 10:57:38 +07:00

144 lines
5.2 KiB
Python

# The following code was copied from the original author's repository
# at https://github.com/mpcabd/python-arabic-reshaper/tree/v3.0.0/arabic_reshaper
# Version: 3.0.0
# This work is licensed under the MIT License.
# To view a copy of this license, visit https://opensource.org/licenses/MIT
# Written by Abdullah Diab (mpcabd)
# Email: mpcabd@gmail.com
# Website: http://mpcabd.xyz
#
# This code was simplified by removing configuration (keeping only the default
# configuration) then constant-folding all the configuration items by hand.
import re
from itertools import repeat
from .letters import (UNSHAPED, ISOLATED, TATWEEL, ZWJ, LETTERS_ARABIC, FINAL,
INITIAL, MEDIAL, connects_with_letters_before_and_after,
connects_with_letter_before, connects_with_letter_after)
# Public API of this module: `reshape` is the only name exported by a
# star-import; the compiled patterns and the ligature table are internal.
__all__ = ['reshape']
# Single-character class of the code points this module treats as harakat:
# `reshape` skips any character matching it and strips them all before
# looking for ligatures.
# NOTE: '\u08d4-\u08ed' overlaps both of its neighbours and covers U+08E2, so
# the last three ranges together amount to U+08D4..U+08FF. The pattern text is
# kept exactly as it was copied from upstream.
HARAKAT_RE = re.compile(
    '[' + ''.join((
        '\u0610-\u061a',
        '\u064b-\u065f',
        '\u0670',
        '\u06d6-\u06dc',
        '\u06df-\u06e8',
        '\u06ea-\u06ed',
        '\u08d4-\u08e1',
        '\u08d4-\u08ed',
        '\u08e3-\u08ff',
    )) + ']',
    flags=re.VERBOSE | re.UNICODE,
)
# Matches the letter sequences that have a dedicated ligature glyph.
#
# Every alternative is wrapped in its own capturing group, and the groups are
# listed in the same order as the rows of GROUP_INDEX_TO_LIGATURE_FORMs: the
# index of the (single) non-empty entry of ``match.groups()`` is therefore the
# row to use. Without the groups ``match.groups()`` is always empty and the
# caller cannot tell the ligatures apart.
#
# The pattern is compiled with re.VERBOSE, so the whitespace and the trailing
# ``#`` comments below are not part of the expression.
LIGATURES_RE = re.compile("""
    (\u0627\u0644\u0644\u0647) # ARABIC LIGATURE ALLAH
  | (\u0644\u0627) # ARABIC LIGATURE LAM WITH ALEF
  | (\u0644\u0623) # ARABIC LIGATURE LAM WITH ALEF WITH HAMZA ABOVE
  | (\u0644\u0625) # ARABIC LIGATURE LAM WITH ALEF WITH HAMZA BELOW
  | (\u0644\u0622) # ARABIC LIGATURE LAM WITH ALEF WITH MADDA ABOVE
""", re.UNICODE | re.VERBOSE)
# Presentation glyphs of each ligature, one row per alternative of
# LIGATURES_RE and in the same order.
#
# `reshape` indexes a row with the form constants imported from `.letters`;
# going by the glyph names, slot 0 is the isolated form and slot 3 the final
# form (slots 1 and 2 presumably initial and medial - confirm against
# `.letters`). An empty slot means there is no glyph for that form, in which
# case `reshape` leaves the individual letters untouched.
GROUP_INDEX_TO_LIGATURE_FORMs = [
    (
        '\N{ARABIC LIGATURE ALLAH ISOLATED FORM}',
        '',
        '',
        '',
    ),
    (
        '\N{ARABIC LIGATURE LAM WITH ALEF ISOLATED FORM}',
        '',
        '',
        '\N{ARABIC LIGATURE LAM WITH ALEF FINAL FORM}',
    ),
    (
        '\N{ARABIC LIGATURE LAM WITH ALEF WITH HAMZA ABOVE ISOLATED FORM}',
        '',
        '',
        '\N{ARABIC LIGATURE LAM WITH ALEF WITH HAMZA ABOVE FINAL FORM}',
    ),
    (
        '\N{ARABIC LIGATURE LAM WITH ALEF WITH HAMZA BELOW ISOLATED FORM}',
        '',
        '',
        '\N{ARABIC LIGATURE LAM WITH ALEF WITH HAMZA BELOW FINAL FORM}',
    ),
    (
        '\N{ARABIC LIGATURE LAM WITH ALEF WITH MADDA ABOVE ISOLATED FORM}',
        '',
        '',
        '\N{ARABIC LIGATURE LAM WITH ALEF WITH MADDA ABOVE FINAL FORM}',
    ),
]
# Letter sequence of each ligature -> row of GROUP_INDEX_TO_LIGATURE_FORMs.
# Looking the row up from the matched text keeps `reshape` correct whether or
# not LIGATURES_RE exposes capturing groups.
_LIGATURE_ROW_BY_SEQUENCE = {
    '\u0627\u0644\u0644\u0647': 0,  # ARABIC LIGATURE ALLAH
    '\u0644\u0627': 1,  # ARABIC LIGATURE LAM WITH ALEF
    '\u0644\u0623': 2,  # ARABIC LIGATURE LAM WITH ALEF WITH HAMZA ABOVE
    '\u0644\u0625': 3,  # ARABIC LIGATURE LAM WITH ALEF WITH HAMZA BELOW
    '\u0644\u0622': 4,  # ARABIC LIGATURE LAM WITH ALEF WITH MADDA ABOVE
}


def reshape(text):
    """Return *text* with its Arabic letters replaced by contextual forms.

    The text is processed in three passes:

    1. every character is given a form (isolated / initial / medial / final)
       depending on whether it can connect to its neighbours; characters
       matching ``HARAKAT_RE`` are dropped and characters that are not in
       ``LETTERS_ARABIC`` are copied through unchanged;
    2. the letter sequences matched by ``LIGATURES_RE`` are collapsed into a
       single ligature glyph when one exists for the required form;
    3. the remaining letters are mapped to their glyph through
       ``LETTERS_ARABIC``.

    :param text: the string to reshape
    :return: the reshaped string, ``''`` when *text* is empty or ``None``
    """
    if not text:
        return ''

    # `output` holds one (letter, form) pair per character of the text once
    # harakat are removed. Ligature spans computed on the stripped text are
    # used as indices into it, so entries must never be removed from the
    # middle of the list: they are blanked with `placeholder` instead.
    output = []

    LETTER = 0
    FORM = 1
    NOT_SUPPORTED = -1
    placeholder = ('', NOT_SUPPORTED)

    for letter in text:
        if HARAKAT_RE.match(letter):
            # Harakat are not kept in the reshaped text.
            pass
        elif letter not in LETTERS_ARABIC:
            output.append((letter, NOT_SUPPORTED))
        elif not output:  # first letter
            output.append((letter, ISOLATED))
        else:
            previous_letter = output[-1]
            if (
                previous_letter[FORM] == NOT_SUPPORTED or
                not connects_with_letter_before(letter, LETTERS_ARABIC) or
                not connects_with_letter_after(previous_letter[LETTER], LETTERS_ARABIC) or
                (previous_letter[FORM] == FINAL and not connects_with_letters_before_and_after(previous_letter[LETTER], LETTERS_ARABIC))
            ):
                # No connection is possible between the two letters.
                output.append((letter, ISOLATED))
            elif previous_letter[FORM] == ISOLATED:
                # The previous letter starts a connected run.
                output[-1] = (previous_letter[LETTER], INITIAL)
                output.append((letter, FINAL))
            else:
                # Otherwise, we will change the previous letter to connect
                # to the current letter
                output[-1] = (previous_letter[LETTER], MEDIAL)
                output.append((letter, FINAL))

        # A ZWJ that is the second to last item has already influenced the
        # form of the letter after it and won't be useful anymore. Blank it
        # rather than popping it, so that `output` stays aligned with the
        # harakat-stripped text used for the ligature search below (popping
        # shifted every later index and could raise IndexError).
        if len(output) > 1 and output[-2][LETTER] == ZWJ:
            output[-2] = placeholder

    # A trailing ZWJ can be removed: nothing follows it, so no index shifts.
    if output and output[-1][LETTER] == ZWJ:
        output.pop()

    # Clean text from Harakat to be able to find ligatures
    stripped = HARAKAT_RE.sub('', text)

    for match in LIGATURES_RE.finditer(stripped):
        row = _LIGATURE_ROW_BY_SEQUENCE.get(match.group())
        if row is None:
            # Unknown sequence: keep the individual letters instead of
            # guessing a ligature.
            continue
        forms = GROUP_INDEX_TO_LIGATURE_FORMs[row]
        a, b = match.span()
        a_form = output[a][FORM]
        b_form = output[b - 1][FORM]

        # Form of the ligature, from the forms of its first (a) and last (b)
        # letters:
        # +-----------+----------+---------+---------+----------+
        # | a   \   b | ISOLATED | INITIAL | MEDIAL  | FINAL    |
        # +-----------+----------+---------+---------+----------+
        # | ISOLATED  | ISOLATED | INITIAL | INITIAL | ISOLATED |
        # | INITIAL   | ISOLATED | INITIAL | INITIAL | ISOLATED |
        # | MEDIAL    | FINAL    | MEDIAL  | MEDIAL  | FINAL    |
        # | FINAL     | FINAL    | MEDIAL  | MEDIAL  | FINAL    |
        # +-----------+----------+---------+---------+----------+
        if a_form in (ISOLATED, INITIAL):
            if b_form in (ISOLATED, FINAL):
                ligature_form = ISOLATED
            else:
                ligature_form = INITIAL
        else:
            if b_form in (ISOLATED, FINAL):
                ligature_form = FINAL
            else:
                ligature_form = MEDIAL

        # No glyph for that form: leave the letters as they are.
        if not forms[ligature_form]:
            continue

        # The ligature glyph takes the slot of the first letter, the slots of
        # the other letters are blanked so the indices stay valid.
        output[a] = (forms[ligature_form], NOT_SUPPORTED)
        output[a + 1:b] = repeat(placeholder, b - 1 - a)

    # Blank slots are skipped; unsupported/unshaped entries are emitted as
    # they are, everything else is looked up in the letters table.
    return ''.join(
        letter if form in (NOT_SUPPORTED, UNSHAPED) else LETTERS_ARABIC[letter][form]
        for letter, form in output
        if letter
    )