|
Package Tashaphyne ::
Module normalize
|
|
1
2
3 """
4 Utility functions used to prepare Arabic text for searching and indexing.
5 """
6 import re, string,sys
7 from arabic_const import *
8
9
# Character-class patterns compiled once at import time and shared by the
# normalization functions of this module.  Each class is the concatenation of
# constants imported from arabic_const (assumed to be single characters that
# need no escaping inside a [...] class -- TODO confirm against arabic_const).
#
# The literals use the u"" prefix rather than ur"": "ur" is a syntax error on
# Python 3, and since none of these literals contains a backslash the raw flag
# made no difference on Python 2.

# Vowel marks (tashkeel) deleted by the tashkeel-stripping routine.
HARAKAT_pat = re.compile(
    u"[" + u"".join([FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA,
                     SUKUN, SHADDA]) + u"]")

# Hamza carried by WAW or YEH; replaced by a bare HAMZA during normalization.
HAMZAT_pat = re.compile(u"[" + u"".join([WAW_HAMZA, YEH_HAMZA]) + u"]")

# ALEF variants and combining hamza signs; replaced by a bare ALEF.
ALEFAT_pat = re.compile(
    u"[" + u"".join([ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW,
                     HAMZA_ABOVE, HAMZA_BELOW]) + u"]")

# LAM-ALEF ligatures; expanded to the two letters LAM + ALEF.
LAMALEFAT_pat = re.compile(
    u"[" + u"".join([LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW,
                     LAM_ALEF_MADDA_ABOVE]) + u"]")
14
15
16
17
18
19
def strip_tashkeel(text):
    """Remove the Arabic vowel marks (tashkeel) from a text.

    Every character matched by the module-level C{HARAKAT_pat} is deleted.
    That pattern is built from:
        - FATHA, DAMMA, KASRA
        - SUKUN
        - SHADDA
        - FATHATAN, DAMMATAN, KASRATAN

    Example:
        >>> text = u"الْعَرَبِيّةُ"
        >>> print(strip_tashkeel(text))
        العربية

    @param text: arabic text.
    @type text: unicode.
    @return: the text with the marks listed above removed.
    @rtype: unicode.
    """
    unvocalized = HARAKAT_pat.sub('', text)
    return unvocalized
38
39
40
41
def strip_tatweel(text):
    """Remove every TATWEEL (elongation) character from a text.

    Example:
        >>> text = u"العـــــربية"
        >>> print(strip_tatweel(text))
        العربية

    @param text: arabic text.
    @type text: unicode.
    @return: the text with all TATWEEL characters removed.
    @rtype: unicode.
    """
    # u'' rather than ur'': the "ur" prefix is a syntax error on Python 3,
    # and the pattern has no backslash, so the raw flag changed nothing.
    return re.sub(u'[%s]' % TATWEEL, '', text)
57
58
59
def normalize_hamza(text):
    """Reduce the Hamza forms of a text to bare ALEF or bare HAMZA.

    Two substitutions are applied, in this order:
        - characters matched by C{ALEFAT_pat} (ALEF_MADDA, ALEF_HAMZA_ABOVE,
          ALEF_HAMZA_BELOW, HAMZA_ABOVE, HAMZA_BELOW) become ALEF;
        - characters matched by C{HAMZAT_pat} (WAW_HAMZA, YEH_HAMZA)
          become HAMZA.

    Example:
        >>> text = u"أهؤلاء من أولئكُ"
        >>> print(normalize_hamza(text))
        اهءلاء من اولءكُ

    @param text: arabic text.
    @type text: unicode.
    @return: the converted text.
    @rtype: unicode.
    """
    return HAMZAT_pat.sub(HAMZA, ALEFAT_pat.sub(ALEF, text))
78
79
def normalize_lamalef(text):
    """Expand Lam-Alef ligatures into the two letters LAM and ALEF.

    Some systems encode a Lam-Alef ligature as one character. Each character
    matched by C{LAMALEFAT_pat} is replaced by LAM followed by ALEF.
    The ligatures covered are:
        - LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW,
          LAM_ALEF_MADDA_ABOVE

    All four forms map to the same LAM + ALEF pair, so any hamza or madda
    carried by the ligature is not kept.

    @param text: arabic text.
    @type text: unicode.
    @return: the converted text.
    @rtype: unicode.
    """
    two_letters = u'%s%s' % (LAM, ALEF)
    return LAMALEFAT_pat.sub(two_letters, text)
97
98
def normalize_spellerrors(text):
    """Fold two commonly confused letter pairs into a single form.

    In some contexts users do not distinguish TEH_MARBUTA from HEH, or
    ALEF_MAKSURA from YEH. The conversions applied are:
        - TEH_MARBUTA into HEH
        - ALEF_MAKSURA into YEH

    Example:
        >>> text = u"اشترت سلمى دمية وحلوى"
        >>> print(normalize_spellerrors(text))
        اشترت سلمي دميه وحلوي

    @param text: arabic text.
    @type text: unicode.
    @return: the converted text.
    @rtype: unicode.
    """
    # u'' rather than ur'': the "ur" prefix is a syntax error on Python 3,
    # and these patterns have no backslash, so the raw flag changed nothing.
    text = re.sub(u'[%s]' % TEH_MARBUTA, HEH, text)
    return re.sub(u'[%s]' % ALEF_MAKSURA, YEH, text)
118
119
120
121
122
def normalize_searchtext(text):
    """Apply every normalization of this module to a text.

    The steps run in this order, each one receiving the previous result:
        - strip tashkeel
        - strip tatweel
        - normalize Lam Alef
        - normalize Hamza
        - normalize Teh Marbuta and Alef Maksura

    Example:
        >>> text = u'أستشتري دمـــى آلية لأبنائك قبل الإغلاق'
        >>> print(normalize_searchtext(text))
        استشتري دمي اليه لابناءك قبل الاغلاق

    @param text: arabic text.
    @type text: unicode.
    @return: the normalized text.
    @rtype: unicode.
    """
    # Lam-Alef ligatures are expanded before Hamza normalization, matching
    # the order in which the steps have always been applied.
    pipeline = (
        strip_tashkeel,
        strip_tatweel,
        normalize_lamalef,
        normalize_hamza,
        normalize_spellerrors,
    )
    for step in pipeline:
        text = step(text)
    return text
147