Package Tashaphyne :: Module stemming
[hide private]
[frames | no frames]

Source Code for Module Tashaphyne.stemming

  1  # -*- coding: UTF-8 -*- 
  2  """ 
  3  Arabic Light Stemmer: a class which provides a configurable stemmer and segmentor for arabic text. 
  4   
  5  Features: 
  6  ========= 
  7          - Arabic word Light Stemming. 
  8          - Root Extraction. 
  9          - Word Segmentation  
 10          - Word normalization 
 11          - Default Arabic Affixes list. 
 12          - A customizable light stemmer: the stemmer options and data can be changed. 
 13          - Data independent stemmer 
 14  Licence: 
 15  ======== 
 16          Author 2010, Taha Zerrouki <taha_zerrouki at gawab dot com> 
 17          Released under terms of Gnu Public License. 
 18          The Latest version of the license can be found on 
 19          "www.gnu.org/copyleft/gpl.html" 
 20   
 21  """ 
 22   
 23  import  re 
 24  from normalize import * 
 25  from stem_const import * 
 26   
 27   
class ArabicLightStemmer:
    """
    ArabicLightStemmer: a class which provides a configurable stemmer and
    segmentor for Arabic text.

    An instance keeps the state of the last treated word (original,
    unvocalized, normalized and "star" forms, stemming positions, root and
    segmentation list). The calculated attributes can therefore be queried
    after a call to L{lightStem}, L{transformToStars} or L{segment}.
    """

    def __init__(self):
        # Affix configuration, initialised from the module-level DEFAULT_*
        # constants (brought into scope by the file's star imports).
        self.prefix_letters = DEFAULT_PREFIX_LETTERS
        self.suffix_letters = DEFAULT_SUFFIX_LETTERS
        self.infix_letters = DEFAULT_INFIX_LETTERS
        self.max_prefix_length = DEFAULT_MAX_PREFIX
        self.max_suffix_length = DEFAULT_MAX_SUFFIX
        self.min_stem_length = DEFAULT_MIN_STEM
        self.joker = DEFAULT_JOKER
        self.prefix_list = DEFAULT_PREFIX_LIST
        self.suffix_list = DEFAULT_SUFFIX_LIST
        # State of the last treated word.
        self.word = u""
        self.unvocalized = u""
        self.normalized = u""
        self.starword = u""
        self.root = u""
        self.left = 0
        self.right = 0
        self.segment_list = []

    ######################################################################
    #{ Attribute Functions
    ######################################################################
    def get_prefix_letters(self, ):
        """ Return the prefixation letters.
        This attribute takes DEFAULT_PREFIX_LETTERS by default.
        @return: the prefixation letters.
        @rtype: unicode.
        """
        return self.prefix_letters

    def set_prefix_letters(self, new_prefix_letters):
        """ Set the prefixation letters.
        This attribute takes DEFAULT_PREFIX_LETTERS by default.
        @param new_prefix_letters: letters to be stripped from the beginning
            of a word, e.g. new_prefix_letters=u"وف".
        @type new_prefix_letters: unicode.
        """
        self.prefix_letters = new_prefix_letters

    def get_suffix_letters(self, ):
        """ Return the suffixation letters.
        This attribute takes DEFAULT_SUFFIX_LETTERS by default.
        @return: the suffixation letters.
        @rtype: unicode.
        """
        return self.suffix_letters

    def set_suffix_letters(self, new_suffix_letters):
        """ Set the suffixation letters.
        This attribute takes DEFAULT_SUFFIX_LETTERS by default.
        @param new_suffix_letters: letters to be stripped from the end of a
            word, e.g. new_suffix_letters=u"ةون".
        @type new_suffix_letters: unicode.
        """
        self.suffix_letters = new_suffix_letters

    def get_infix_letters(self, ):
        """ Return the infixation letters.
        This attribute takes DEFAULT_INFIX_LETTERS by default.
        @return: the infixation letters.
        @rtype: unicode.
        """
        return self.infix_letters

    def set_infix_letters(self, new_infix_letters):
        """ Set the infixation letters.
        This attribute takes DEFAULT_INFIX_LETTERS by default.
        @param new_infix_letters: letters to be stripped from the middle of
            a word, e.g. new_infix_letters=u"أوي".
        @type new_infix_letters: unicode.
        """
        self.infix_letters = new_infix_letters

    def get_joker(self, ):
        """ Return the joker letter.
        This attribute takes DEFAULT_JOKER by default.
        @return: joker letter.
        @rtype: unicode.
        """
        return self.joker

    def set_joker(self, new_joker):
        """ Set the joker letter.
        This attribute takes DEFAULT_JOKER by default.
        Only the first character of new_joker is kept.
        @param new_joker: joker letter.
        @type new_joker: unicode.
        """
        # The joker stands for exactly one letter, so longer values are
        # truncated to their first character.
        if len(new_joker) > 1:
            new_joker = new_joker[0]
        self.joker = new_joker

    def get_max_prefix_length(self, ):
        """ Return the max length of the prefix used by the stemmer.
        This attribute takes DEFAULT_MAX_PREFIX by default.
        @return: the max prefix length.
        @rtype: integer.
        """
        return self.max_prefix_length

    def set_max_prefix_length(self, new_max_prefix_length):
        """ Set the max length of the prefix used by the stemmer.
        This attribute takes DEFAULT_MAX_PREFIX by default.
        @param new_max_prefix_length: the new max prefix length.
        @type new_max_prefix_length: integer.
        """
        self.max_prefix_length = new_max_prefix_length

    def get_max_suffix_length(self, ):
        """ Return the max length of the suffix used by the stemmer.
        This attribute takes DEFAULT_MAX_SUFFIX by default.
        @return: the max suffix length.
        @rtype: integer.
        """
        return self.max_suffix_length

    def set_max_suffix_length(self, new_max_suffix_length):
        """ Set the max length of the suffix used by the stemmer.
        This attribute takes DEFAULT_MAX_SUFFIX by default.
        @param new_max_suffix_length: the new max suffix length.
        @type new_max_suffix_length: integer.
        """
        self.max_suffix_length = new_max_suffix_length

    def get_min_stem_length(self, ):
        """ Return the min length of the stem used by the stemmer.
        This attribute takes DEFAULT_MIN_STEM by default.
        @return: the min stem length.
        @rtype: integer.
        """
        return self.min_stem_length

    def set_min_stem_length(self, new_min_stem_length):
        """ Set the min length of the stem used by the stemmer.
        This attribute takes DEFAULT_MIN_STEM by default.
        @param new_min_stem_length: the new min stem length.
        @type new_min_stem_length: integer.
        """
        self.min_stem_length = new_min_stem_length

    def get_prefix_list(self, ):
        """ Return the prefixes list used by the stemmer.
        This attribute takes DEFAULT_PREFIX_LIST by default.
        @return: prefixes list.
        @rtype: set().
        """
        return self.prefix_list

    def set_prefix_list(self, new_prefix_list):
        """ Set the prefixes list used by the stemmer.
        This attribute takes DEFAULT_PREFIX_LIST by default.
        @param new_prefix_list: a set of prefixes.
        @type new_prefix_list: set of unicode string.
        """
        self.prefix_list = new_prefix_list

    def get_suffix_list(self, ):
        """ Return the suffixes list used by the stemmer.
        This attribute takes DEFAULT_SUFFIX_LIST by default.
        @return: suffixes list.
        @rtype: set().
        """
        return self.suffix_list

    def set_suffix_list(self, new_suffix_list):
        """ Set the suffixes list used by the stemmer.
        This attribute takes DEFAULT_SUFFIX_LIST by default.
        @param new_suffix_list: a set of suffixes.
        @type new_suffix_list: set of unicode string.
        """
        self.suffix_list = new_suffix_list

    def set_word(self, new_word):
        """ Set the word to be treated by the stemmer.
        @param new_word: the new word.
        @type new_word: unicode.
        """
        self.word = new_word

    def get_word(self):
        """ Return the last word treated by the stemmer.
        @return: word.
        @rtype: unicode.
        """
        return self.word

    #########################################################
    #{ Calculated Attribute Functions
    #########################################################
    def get_starword(self):
        """ Return the star-like form of the word treated by the stemmer.
        All non affix letters are converted to a joker.
        The joker takes DEFAULT_JOKER by default.
        Example:
            >>> ArListem=ArabicLightStemmer()
            >>> word=u'أفتضربونني'
            >>> stem=ArListem.lightStem(word)
            >>> print ArListem.get_starword()
            أفت***ونني

        @return: the star form of the word.
        @rtype: unicode.
        """
        return self.starword

    def get_root(self, prefix_index=-1, suffix_index=-1):
        """ Return the root of the word treated by the stemmer.
        All letters found at the joker places of the star stem are part of
        the root. When at least one index is given, the root is recomputed
        (and stored) for these stemming positions; otherwise the root stored
        by the last treatment is returned.
        Example:
            >>> ArListem=ArabicLightStemmer()
            >>> word=u'أفتضربونني'
            >>> stem=ArListem.lightStem(word)
            >>> print ArListem.get_starword()
            أفت***ونني
            >>> print ArListem.get_root()
            ضرب

        @param prefix_index: the left stemming position;
            if negative, the stored left position is used.
        @type prefix_index: integer.
        @param suffix_index: the right stemming position;
            if negative, the stored right position is used.
        @type suffix_index: integer.
        @return: root.
        @rtype: unicode.
        """
        if prefix_index >= 0 or suffix_index >= 0:
            self.extract_root(prefix_index, suffix_index)
        return self.root

    def get_normalized(self):
        """ Return the normalized form stored by the last call to
        L{normalize}. It is empty until normalize() has been called.
        Example:
            >>> word=u"استؤجرُ"
            >>> ArListem=ArabicLightStemmer()
            >>> normalized=ArListem.normalize(word)
            >>> print ArListem.get_normalized()
            استءجر

        @return: normalized word.
        @rtype: unicode.
        """
        return self.normalized

    def get_unvocalized(self):
        """ Return the unvocalized form of the word treated by the stemmer.
        Harakat are stripped.
        Example:
            >>> word=u"الْعَرَبِيّةُ"
            >>> ArListem=ArabicLightStemmer()
            >>> stem=ArListem.lightStem(word)
            >>> print ArListem.get_unvocalized()
            العربية

        @return: unvocalized word.
        @rtype: unicode.
        """
        return self.unvocalized

    def get_left(self):
        """ Return the left position of stemming (prefix end position) in
        the word treated by the stemmer.
        Example:
            >>> ArListem=ArabicLightStemmer()
            >>> word=u'أفتضربونني'
            >>> stem=ArListem.lightStem(word)
            >>> print ArListem.get_starword()
            أفت***ونني
            >>> print ArListem.get_left()
            3

        @return: the left position of stemming.
        @rtype: integer.
        """
        return self.left

    def get_right(self):
        """ Return the right position of stemming (suffix start position)
        in the word treated by the stemmer.
        Example:
            >>> ArListem=ArabicLightStemmer()
            >>> word=u'أفتضربونني'
            >>> stem=ArListem.lightStem(word)
            >>> print ArListem.get_starword()
            أفت***ونني
            >>> print ArListem.get_right()
            6

        @return: the right position of stemming.
        @rtype: integer.
        """
        return self.right

    def get_stem(self, prefix_index=-1, suffix_index=-1):
        """ Return the stem of the word treated by the stemmer.
        The stem is the slice of the unvocalized word between the left and
        right stemming positions.
        Example:
            >>> ArListem=ArabicLightStemmer()
            >>> word=u'أفتكاتبانني'
            >>> stem=ArListem.lightStem(word)
            >>> print ArListem.get_stem()
            كاتب

        @param prefix_index: the left stemming position;
            if negative, the stored left position is used.
        @type prefix_index: integer.
        @param suffix_index: the right stemming position;
            if negative, the stored right position is used.
        @type suffix_index: integer.
        @return: stem.
        @rtype: unicode.
        """
        left = self.left if prefix_index < 0 else prefix_index
        right = self.right if suffix_index < 0 else suffix_index
        return self.unvocalized[left:right]

    def get_starstem(self, prefix_index=-1, suffix_index=-1):
        """ Return the star form of the stem of the treated word.
        All non infix letters are converted to a joker.
        The joker takes DEFAULT_JOKER by default.
        Example:
            >>> ArListem=ArabicLightStemmer()
            >>> word=u'أفتكاتبانني'
            >>> stem=ArListem.lightStem(word)
            >>> print ArListem.get_stem()
            كاتب
            >>> print ArListem.get_starstem()
            *ات*

        @param prefix_index: the left stemming position;
            if negative, the stored left position is used.
        @type prefix_index: integer.
        @param suffix_index: the right stemming position;
            if negative, the stored right position is used.
        @type suffix_index: integer.
        @return: star form of the stem.
        @rtype: unicode.
        """
        if prefix_index < 0 and suffix_index < 0:
            # Default positions: the stored star word is already starred
            # for this slice.
            return self.starword[self.left:self.right]
        left = prefix_index if prefix_index >= 0 else self.left
        right = suffix_index if suffix_index >= 0 else self.right
        starstem = self.starword[left:right]
        # With custom positions, affix letters may now lie inside the stem:
        # star every letter that is not an infix letter. An empty infix set
        # would build the invalid pattern "[^]", so the slice is returned
        # unchanged, as transformToStars does for empty infixes.
        if self.infix_letters == u"":
            return starstem
        return re.sub(u"[^%s]" % self.infix_letters, self.joker, starstem)

    def get_prefix(self, prefix_index=-1):
        """ Return the prefix of the word treated by the stemmer.
        Example:
            >>> ArListem=ArabicLightStemmer()
            >>> word=u'أفتكاتبانني'
            >>> stem=ArListem.lightStem(word)
            >>> print ArListem.get_prefix()
            أفت

        @param prefix_index: the left stemming position;
            if negative, the stored left position is used.
        @type prefix_index: integer.
        @return: prefix.
        @rtype: unicode.
        """
        if prefix_index < 0:
            return self.unvocalized[:self.left]
        return self.unvocalized[:prefix_index]

    def get_suffix(self, suffix_index=-1):
        """ Return the suffix of the word treated by the stemmer.
        Example:
            >>> ArListem=ArabicLightStemmer()
            >>> word=u'أفتكاتبانني'
            >>> stem=ArListem.lightStem(word)
            >>> print ArListem.get_suffix()
            انني

        @param suffix_index: the right stemming position;
            if negative, the stored right position is used.
        @type suffix_index: integer.
        @return: suffix.
        @rtype: unicode.
        """
        if suffix_index < 0:
            return self.unvocalized[self.right:]
        return self.unvocalized[suffix_index:]

    def get_affix(self, prefix_index=-1, suffix_index=-1):
        """ Return the affix (prefix and suffix joined by '-') of the word
        treated by the stemmer.
        Example:
            >>> ArListem=ArabicLightStemmer()
            >>> word=u'أفتكاتبانني'
            >>> stem=ArListem.lightStem(word)
            >>> print ArListem.get_affix()
            أفت-انني

        @param prefix_index: the left stemming position;
            if negative, the stored left position is used.
        @type prefix_index: integer.
        @param suffix_index: the right stemming position;
            if negative, the stored right position is used.
        @type suffix_index: integer.
        @return: affix.
        @rtype: unicode.
        """
        return u"-".join([self.get_prefix(prefix_index),
                          self.get_suffix(suffix_index)])

    def get_affix_tuple(self, prefix_index=-1, suffix_index=-1):
        """ Return the affix tuple of the word treated by the stemmer.
        Example:
            >>> ArListem=ArabicLightStemmer()
            >>> word=u'أفتضاربانني'
            >>> stem=ArListem.lightStem(word)
            >>> print ArListem.get_affix_tuple()
            {'prefix': u'أفت', 'root': u'ضرب', 'suffix': u'انني', 'stem': u'ضارب'}

        @param prefix_index: the left stemming position;
            if negative, the stored left position is used.
        @type prefix_index: integer.
        @param suffix_index: the right stemming position;
            if negative, the stored right position is used.
        @type suffix_index: integer.
        @return: a dict with the keys 'prefix', 'suffix', 'stem' and 'root'.
        @rtype: dict.
        """
        # NOTE: suffix_index defaults to -1 like every sibling method. A
        # default of 0 made a no-argument call return the whole word as
        # suffix and an empty stem, and reset the stored root.
        return {
            'prefix': self.get_prefix(prefix_index),
            'suffix': self.get_suffix(suffix_index),
            'stem': self.get_stem(prefix_index, suffix_index),
            'root': self.get_root(prefix_index, suffix_index),
        }

    #########################################################
    #{ Stemming Functions
    #########################################################
    def lightStem(self, word):
        """
        Stemming function: stem an Arabic word and return its stem.
        The stemming positions (left, right) are stored in the instance, so
        other calculated attributes (stem, prefix, suffix, root) can be
        queried afterwards. An empty word returns an empty stem and leaves
        the stored state untouched.
        Example:
            >>> ArListem=ArabicLightStemmer()
            >>> word=u'أفتضاربانني'
            >>> stem=ArListem.lightStem(word)
            >>> print ArListem.get_stem()
            ضارب
            >>> print ArListem.get_starstem()
            *ا**
            >>> print ArListem.get_root()
            ضرب

        @param word: the input word.
        @type word: unicode.
        @return: stem.
        @rtype: unicode.
        """
        if word == u'':
            return u''
        # transformToStars stores the positions and the star word, and
        # already extracts the root.
        self.transformToStars(word)
        return self.get_stem()

    def transformToStars(self, word):
        """
        Transform all non affixation letters into a star.
        The star is a joker (by default '*') which indicates that the
        corresponding letter is an original one. This function is used by
        the stemmer to identify original letters; it stores the word, its
        unvocalized form, the star word, the stemming positions and the
        root in the instance.
        Example:
            >>> ArListem=ArabicLightStemmer()
            >>> word=u'أفتضاربانني'
            >>> starword,left,right=ArListem.transformToStars(word)
            >>> print starword
            أفت*ا**انني

        @param word: the input word.
        @type word: unicode
        @return: (starword, left, right):
            - starword: all original letters converted into a star.
            - left: the greater possible left stemming position.
            - right: the greater possible right stemming position.
            For a word shorter than two letters made only of affix letters
            no affix is trimmed and the positions stay negative.
        @rtype: tuple.
        """
        self.word = word
        word = strip_tashkeel(word)
        self.unvocalized = word
        # Alef madda is expanded into hamza + alef. This lengthens the star
        # word relative to self.unvocalized (extract_root copes with the
        # length mismatch).
        word = re.sub("[%s]" % (ALEF_MADDA), HAMZA + ALEF, word)
        # First pass: any letter that can be neither a prefix nor a suffix
        # letter is an original letter.
        word = re.sub("[^%s%s]" % (self.prefix_letters, self.suffix_letters),
                      self.joker, word)
        left = word.find(self.joker)
        right = word.rfind(self.joker)
        if left >= 0:
            # Bound the affix zones by the max affix lengths, then star the
            # letters that are not valid in the zone they fall in.
            left = min(left, self.max_prefix_length - 1)
            right = max(right + 1, len(word) - self.max_suffix_length)
            prefix = word[:left]
            stem = word[left:right]
            suffix = word[right:]
            prefix = re.sub("[^%s]" % self.prefix_letters, self.joker, prefix)
            # avoid null infixes
            if self.infix_letters != u"":
                stem = re.sub("[^%s]" % self.infix_letters, self.joker, stem)
            suffix = re.sub("[^%s]" % self.suffix_letters, self.joker, suffix)
            word = prefix + stem + suffix

        left = word.find(self.joker)
        right = word.rfind(self.joker)

        if left < 0:
            # No original letter found: fall back on the max prefix length,
            # keeping at least two letters out of the prefix.
            left = min(self.max_prefix_length, len(word) - 2)
        if left >= 0:
            # Shrink the candidate prefix until it is a known prefix.
            prefix = word[:left]
            while prefix != "" and prefix not in self.prefix_list:
                prefix = prefix[:-1]
            if right < 0:
                right = max(len(prefix), len(word) - self.max_suffix_length)
            # Shrink the candidate suffix until it is a known suffix.
            suffix = word[right:]
            while suffix != "" and suffix not in self.suffix_list:
                suffix = suffix[1:]
            left = len(prefix)
            right = len(word) - len(suffix)
            stem = word[left:right]
            # Convert the stem into stars: substitute all non infix letters.
            if self.infix_letters != "":
                stem = re.sub("[^%s]" % self.infix_letters, self.joker, stem)
            word = prefix + stem + suffix
        # store result
        self.left = left
        self.right = right
        self.starword = word
        self.extract_root()
        # return starword, left, right position of stem
        return (word, left, right)

    def extract_root(self, prefix_index=-1, suffix_index=-1):
        """ Compute, store and return the root of the treated word.
        All letters of the stem found at the joker places of the star stem
        are part of the root. If the stem and the star stem do not have the
        same length, the whole stem is taken as root.
        Example:
            >>> ArListem=ArabicLightStemmer()
            >>> word=u'أفتضربونني'
            >>> stem=ArListem.lightStem(word)
            >>> print ArListem.get_starword()
            أفت***ونني
            >>> print ArListem.get_root()
            ضرب

        @param prefix_index: the left stemming position;
            if negative, the stored left position is used.
        @type prefix_index: integer.
        @param suffix_index: the right stemming position;
            if negative, the stored right position is used.
        @type suffix_index: integer.
        @return: root.
        @rtype: unicode.
        """
        starstem = self.get_starstem(prefix_index, suffix_index)
        stem = self.get_stem(prefix_index, suffix_index)
        if len(starstem) == len(stem):
            # Keep the stem letters aligned with a joker in the star stem.
            root = u"".join([letter for mark, letter in zip(starstem, stem)
                             if mark == self.joker])
        else:
            # Positions are not aligned: fall back on the whole stem.
            root = stem
        self.root = root
        return root

    #########################################################
    #{ Segmentation Functions
    #########################################################
    def segment(self, word):
        """ Generate the set of all possible segmentation positions
        (left, right) of the word, store it in the instance and return it.
        The trivial segmentation (0, length of the star word) is always
        included.
        Example:
            >>> ArListem=ArabicLightStemmer()
            >>> word=u'فتضربين'
            >>> print ArListem.segment(word)
            set([(1, 5), (2, 5), (0, 7)])

        @param word: the input word.
        @type word: unicode.
        @return: set of segmentations.
        @rtype: set of tuple of integer.
        """
        # transformToStars stores the word and its stemming positions.
        word, left, right = self.transformToStars(word)
        ln = len(word)
        list_seg = set([(0, ln)])
        for i in range(left + 1):
            # A position pair is only kept when its prefix is a known one.
            if word[:i] not in self.prefix_list:
                continue
            if right < ln:
                # Start at 0 at least: a negative start would index from
                # the end of the word and yield a meaningless (i, -1) pair.
                for j in range(max(right - 1, 0), ln + 1):
                    if word[j:] in self.suffix_list:
                        list_seg.add((i, j))
            else:
                list_seg.add((i, right))
        # store list_seg
        self.segment_list = list_seg
        return list_seg

    def get_segment_list(self):
        """ Return the segmentation positions (left, right) stored by the
        last call to L{segment}.
        Example:
            >>> ArListem=ArabicLightStemmer()
            >>> word=u'فتضربين'
            >>> segments=ArListem.segment(word)
            >>> print ArListem.get_segment_list()
            set([(1, 5), (2, 5), (0, 7)])

        @return: set of segmentations.
        @rtype: set of tuple of integer.
        """
        return self.segment_list

    def get_affix_list(self, ):
        """ Return the list of affix tuples of the treated word, one for
        each stored segmentation.
        Note that each tuple is built with L{get_affix_tuple}, which
        recomputes and overwrites the stored root.
        Example:
            >>> ArListem=ArabicLightStemmer()
            >>> word=u'فتضربين'
            >>> segments=ArListem.segment(word)
            >>> print ArListem.get_affix_list()
            [{'prefix': u'ف', 'root': u'ضرب', 'suffix': u'ين', 'stem': u'تضرب'},
            {'prefix': u'فت', 'root': u'ضرب', 'suffix': u'ين', 'stem': u'ضرب'},
            {'prefix': u'', 'root': u'فضربن', 'suffix': u'', 'stem': u'فتضربين'}]

        @return: list of affix tuples.
        @rtype: list of dict.
        """
        return [self.get_affix_tuple(left, right)
                for (left, right) in self.segment_list]

    ###############################################################
    #{ General Functions
    ###############################################################
    def normalize(self, word=u""):
        """
        Normalize a word: convert some letter forms into a unified form.
        The result is stored in the instance. A non-empty word also
        replaces the stored word.
        @param word: the input word; if empty, the stored word is
            normalized.
        @type word: unicode.
        @return: normalized word (empty if there is no word to normalize).
        @rtype: unicode.
        """
        if word != u'':
            self.word = word
        elif self.word == u"":
            # Nothing to normalize: the stored normalized form is kept.
            return u""
        else:
            word = self.word
        self.normalized = normalize_searchtext(word)
        return self.normalized