Package Tashaphyne :: Module normalize
[hide private]
[frames] | no frames]

Source Code for Module Tashaphyne.normalize

  1  #!/usr/bin/python 
  2  # -*- coding=utf-8 -*- 
  3  """ 
  4  Utility functions used to prepare an Arabic text for searching and indexing. 
  5  """ 
  6  import re, string,sys 
  7  from arabic_const import * 
  8   
  9   
 10  HARAKAT_pat =re.compile(ur"["+u"".join([FATHATAN,DAMMATAN,KASRATAN,FATHA,DAMMA,KASRA,SUKUN,SHADDA])+u"]") 
 11  HAMZAT_pat =re.compile(ur"["+u"".join([WAW_HAMZA,YEH_HAMZA])+u"]"); 
 12  ALEFAT_pat =re.compile(ur"["+u"".join([ALEF_MADDA,ALEF_HAMZA_ABOVE,ALEF_HAMZA_BELOW,HAMZA_ABOVE,HAMZA_BELOW])+u"]"); 
 13  LAMALEFAT_pat =re.compile(ur"["+u"".join([LAM_ALEF,LAM_ALEF_HAMZA_ABOVE,LAM_ALEF_HAMZA_BELOW,LAM_ALEF_MADDA_ABOVE])+u"]"); 
 14   
 15  ###################################################################### 
 16  #{ Individual Functions 
 17  ###################################################################### 
 18   
 19  #-------------------------------------- 
def strip_tashkeel(text):
    """Remove the vowel marks (tashkeel) from a text.

    Every character matched by ``HARAKAT_pat`` is deleted, i.e.:
        - FATHA, DAMMA, KASRA
        - SUKUN
        - SHADDA
        - FATHATAN, DAMMATAN, KASRATAN

    Example:
        >>> text = u"الْعَرَبِيّةُ"
        >>> print(strip_tashkeel(text))
        العربية

    @param text: arabic text.
    @type text: unicode.
    @return: the text with all vowel marks removed.
    @rtype: unicode.
    """
    unvocalized = HARAKAT_pat.sub('', text)
    return unvocalized
38 39 40 #strip tatweel from a text and return a result text 41 #--------------------------------------
def strip_tatweel(text):
    """Remove every TATWEEL (elongation) character from a text.

    Example:
        >>> text = u"العـــــربية"
        >>> print(strip_tatweel(text))
        العربية

    @param text: arabic text.
    @type text: unicode.
    @return: the text with all TATWEEL characters removed.
    @rtype: unicode.
    """
    # A plain u'' literal replaces the former ur'' one: the pattern text
    # has no backslashes, so the value is identical, and the ``ur`` prefix
    # is rejected by Python 3.
    return re.sub(u'[%s]' % TATWEEL, '', text)
57 58 59 #--------------------------------------
def normalize_hamza(text):
    """Reduce the different Hamza forms of a text to two plain letters.

    Two substitutions are applied in sequence:
        - characters matched by ``ALEFAT_pat`` (ALEF_MADDA,
          ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW, HAMZA_ABOVE, HAMZA_BELOW)
          become ALEF;
        - characters matched by ``HAMZAT_pat`` (WAW_HAMZA, YEH_HAMZA)
          become HAMZA.

    Example:
        >>> text = u"أهؤلاء من أولئكُ"
        >>> print(normalize_hamza(text))
        اهءلاء من اولءكُ

    @param text: arabic text.
    @type text: unicode.
    @return: the converted text.
    @rtype: unicode.
    """
    alef_unified = ALEFAT_pat.sub(ALEF, text)
    return HAMZAT_pat.sub(HAMZA, alef_unified)
78 79 #--------------------------------------
def normalize_lamalef(text):
    """Expand Lam-Alef ligatures into the two letters LAM and ALEF.

    Some systems encode the Lam-Alef ligature as one character; this
    function rewrites each such character as LAM followed by ALEF.
    The ligatures handled are those matched by ``LAMALEFAT_pat``:
        - LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW,
          LAM_ALEF_MADDA_ABOVE

    Every ligature gets the same LAM + ALEF replacement, so the hamza or
    madda carried by a ligature is not kept.

    Example:
        >>> normalize_lamalef(LAM_ALEF) == LAM + ALEF
        True

    @param text: arabic text.
    @type text: unicode.
    @return: the converted text.
    @rtype: unicode.
    """
    lam_then_alef = u'%s%s' % (LAM, ALEF)
    return LAMALEFAT_pat.sub(lam_then_alef, text)
97 98 #--------------------------------------
def normalize_spellerrors(text):
    """Normalize two common spelling variations and return the result.

    Writers often do not distinguish TEH_MARBUTA from HEH, nor
    ALEF_MAKSURA from YEH.  To make such spellings compare equal, the
    following conversions are applied:
        - TEH_MARBUTA into HEH
        - ALEF_MAKSURA into YEH

    Example:
        >>> text = u"اشترت سلمى دمية وحلوى"
        >>> print(normalize_spellerrors(text))
        اشترت سلمي دميه وحلوي

    @param text: arabic text.
    @type text: unicode.
    @return: the converted text.
    @rtype: unicode.
    """
    # Plain u'' literals replace the former ur'' ones: the pattern text
    # has no backslashes, so the values are identical, and the ``ur``
    # prefix is rejected by Python 3.
    text = re.sub(u'[%s]' % TEH_MARBUTA, HEH, text)
    return re.sub(u'[%s]' % ALEF_MAKSURA, YEH, text)
118 119 ###################################################################### 120 #{ Normalize One Function 121 ###################################################################### 122
def normalize_searchtext(text):
    """Apply every normalization of this module to a text.

    The steps run in this order:
        - strip tashkeel
        - strip tatweel
        - normalize Lam Alef
        - normalize Hamza
        - normalize Teh Marbuta and Alef Maksura

    Example:
        >>> text = u'أستشتري دمـــى آلية لأبنائك قبل الإغلاق'
        >>> print(normalize_searchtext(text))
        استشتري دمي اليه لابناءك قبل الاغلاق

    @param text: arabic text.
    @type text: unicode.
    @return: the normalized text.
    @rtype: unicode.
    """
    pipeline = (
        strip_tashkeel,
        strip_tatweel,
        normalize_lamalef,
        normalize_hamza,
        normalize_spellerrors,
    )
    # Feed the output of each step into the next one.
    for step in pipeline:
        text = step(text)
    return text
147