|
Package Tashaphyne ::
Module stemming
|
|
1
2 """
3 Arabic Light Stemmer: a class which provides a configurable stemmer and segmentor for arabic text.
4
5 Features:
6 =========
7 - Arabic word Light Stemming.
8 - Root Extraction.
9 - Word Segmentation
10 - Word normalization
11 - Default Arabic Affixes list.
12 - A customizable light stemmer: the stemmer options and data can be changed.
13 - Data independent stemmer
14 Licence:
15 ========
16 Author 2010, Taha Zerrouki <taha_zerrouki at gawab dot com>
17 Released under terms of Gnu Public License.
18 The Latest version of the license can be found on
19 "www.gnu.org/copyleft/gpl.html"
20
21 """
22
23 import re
24 from normalize import *
25 from stem_const import *
26
27
29 """
30 ArabicLightStemmer: a class which provides a configurable stemmer and segmentor for Arabic text.
31 """
52
53
54
56 """ return the prefixation letters.
57 This constant take DEFAULT_PREFIX_LETTERS by default.
58 @return: return a letters.
59 @rtype: unicode.
60 """
61 return self.prefix_letters;
62
64 """ set the prefixation letters.
65 This constant take DEFAULT_PREFIX_LETTERS by default.
66 @param new_prefix_letters: letters to be striped from a word, e.g.new_prefix_letters=u"وف":.
67 @type new_prefix_letters: unicode.
68 """
69 self.prefix_letters=new_prefix_letters;
70
72 """ return the suffixation letters.
73 This constant take DEFAULT_SUFFIX_LETTERS by default.
74 @return: return a letters.
75 @rtype: unicode.
76 """
77 return self.suffix_letters;
78
80 """ set the suffixation letters.
81 This constant take DEFAULT_SUFFIX_LETTERS by default.
82 @param new_suffix_letters: letters to be striped from the end of a word, e.g.new_suffix_letters=u"ةون":.
83 @type new_suffix_letters: unicode.
84 """
85 self.suffix_letters=new_suffix_letters;
86
88 """ get the inffixation letters.
89 This constant take DEFAULT_INFIX_LETTERS by default.
90 @return: infixes letters.
91 @rtype: unicode.
92 """
93 return self.infix_letters;
94
96 """ set the inffixation letters.
97 This constant take DEFAULT_INFIX_LETTERS by default.
98 @param new_infix_letters: letters to be striped from the middle of a word, e.g.new_infix_letters=u"أوي":.
99 @type new_infix_letters: unicode.
100 """
101 self.infix_letters=new_infix_letters;
102
103
105 """ get the joker letter.
106 This constant take DEFAULT_JOKER by default.
107 @return: joker letter.
108 @rtype: unicode.
109 """
110 return self.joker;
111
def set_joker(self, new_joker):
    """Set the joker letter.

    The joker is stored in C{self.joker}. Only one character is kept: when
    a longer string is given, its first character is used.

    NOTE(review): the C{def} line is missing from this listing; the method
    name is inferred from the docstring and the sibling accessors -- confirm.

    @param new_joker: joker letter.
    @type new_joker: unicode.
    """
    # The original tested and truncated an undefined name ("joker"), which
    # raised NameError on every call; the parameter is what must be checked.
    if len(new_joker) > 1:
        new_joker = new_joker[0]
    self.joker = new_joker
120
122 """ return the constant of max length of the prefix used by the stemmer.
123 This constant take DEFAULT_MAX_PREFIX_LENGTH by default.
124 @return: return a number.
125 @rtype: integer.
126 """
127 return self.max_prefix_length;
128
130 """ Set the constant of max length of the prefix used by the stemmer.
131 This constant take DEFAULT_MAX_PREFIX_LENGTH by default.
132 @param new_max_prefix_length: the new max prefix length constant.
133 @type new_max_prefix_length: integer.
134 """
135 self.max_prefix_length=new_max_prefix_length;
136
138 """ return the constant of max length of the suffix used by the stemmer.
139 This constant take DEFAULT_MAX_SUFFIX_LENGTH by default.
140 @return: return a number.
141 @rtype: integer.
142 """
143 return self.max_suffix_length;
144
146 """ Set the constant of max length of the suffix used by the stemmer.
147 This constant take DEFAULT_MAX_SUFFIX_LENGTH by default.
148 @param new_max_suffix_length: the new max suffix length constant.
149 @type new_max_suffix_length: integer.
150 """
151 self.max_suffix_length=new_max_suffix_length;
152
154 """ return the constant of min length of the stem used by the stemmer.
155 This constant take DEFAULT_MIN_STEM_LENGTH by default.
156 @return: return a number.
157 @rtype: integer.
158 """
159 return self.min_stem_length;
160
162 """ Set the constant of min length of the stem used by the stemmer.
163 This constant take DEFAULT__MIN_STEM_LENGTH by default.
164 @param new_min_stem_length: the min stem length constant.
165 @type new_min_stem_length: integer.
166 """
167 self.min_stem_length=new_min_stem_length;
168
170 """ return the prefixes list used by the stemmer.
171 This constant take DEFAULT_PREFIX_LIST by default.
172 @return: prefixes list.
173 @rtype: set().
174 """
175 return self.prefix_list;
177 """ Set prefixes list used by the stemmer.
178 This constant take DEFAULT_PREFIX_LIST by default.
179 @param new_prefix_list: a set of prefixes.
180 @type new_prefix_list: set of unicode string.
181 """
182 self.prefix_list=new_prefix_list;
183
185 """ return the suffixes list used by the stemmer.
186 This constant take DEFAULT_SUFFIX_LIST by default.
187 @return: suffixes list.
188 @rtype: set().
189 """
190 return self.suffix_list;
191
193 """ Set suffixes list used by the stemmer.
194 This constant take DEFAULT_SUFFIX_LIST by default.
195 @param new_suffix_list: a set of suffixes.
196 @type new_suffix_list: set of unicode string.
197 """
198 self.suffix_list=new_suffix_list;
199
201 """ Set the word to treat by the stemmer.
202 @param new_word: the new word.
203 @type new_word: unicode.
204 """
205 self.word=new_word;
206
208 """ return the last word treated by the stemmer.
209 @return: word.
210 @rtype: unicode.
211 """
212 return self.word;
213
214
215
216
def get_starword(self):
    """Return the star-form word kept by the stemmer.

    In the star form, as documented for this stemmer, every letter that is
    not an affix letter is replaced by the joker (DEFAULT_JOKER="*" by
    default). This accessor only reads C{self.starword}.

    Example:
        >>> ArListem = ArabicLightStemmer()
        >>> stem = ArListem.lightStem(u'أفتصربونني')
        >>> print ArListem.get_starword()
        أفت***ونني

    @return: the star-form word.
    @rtype: unicode.
    """
    star_form = self.starword
    return star_form
232
def get_root(self, prefix_index=-1, suffix_index=-1):
    """Return the root of the word treated by the stemmer.

    With both indexes left negative, the root already stored in
    C{self.root} is returned unchanged. As soon as one index is given
    (>= 0), C{extract_root} is called with both indexes first, and the
    root stored afterwards is returned.

    Example:
        >>> ArListem = ArabicLightStemmer()
        >>> stem = ArListem.lightStem(u'أفتصربونني')
        >>> print ArListem.get_starword()
        أفت***ونني
        >>> print ArListem.get_root()
        ضرب

    @param prefix_index: the left stemming position;
        if negative, it is not considered.
    @type prefix_index: integer.
    @param suffix_index: the right stemming position;
        if negative, it is not considered.
    @type suffix_index: integer.
    @return: root.
    @rtype: unicode.
    """
    keep_stored_root = prefix_index < 0 and suffix_index < 0
    if not keep_stored_root:
        # An explicit position was requested: recompute before reading.
        self.extract_root(prefix_index, suffix_index)
    return self.root
259
def get_normalized(self):
    """Return the normalized form kept by the stemmer.

    As documented for this stemmer, normalization unifies some letter
    forms (for instance the Hamza variants). This accessor only reads
    C{self.normalized}.

    Example:
        >>> ArListem = ArabicLightStemmer()
        >>> stem = ArListem.lightStem(u"استؤجرُ")
        >>> print ArListem.get_normalized()
        استءجر

    @return: normalized word.
    @rtype: unicode.
    """
    normalized_form = self.normalized
    return normalized_form
274
def get_unvocalized(self):
    """Return the unvocalized form kept by the stemmer.

    As documented for this stemmer, the unvocalized form is the word with
    its harakat (vowel marks) stripped. This accessor only reads
    C{self.unvocalized}.

    Example:
        >>> ArListem = ArabicLightStemmer()
        >>> stem = ArListem.lightStem(u"الْعَرَبِيّةُ")
        >>> print ArListem.get_unvocalized()
        العربية

    @return: unvocalized word.
    @rtype: unicode.
    """
    bare_form = self.unvocalized
    return bare_form
289
def get_left(self):
    """Return the left stemming position (where the prefix ends).

    This accessor only reads C{self.left}.

    Example:
        >>> ArListem = ArabicLightStemmer()
        >>> stem = ArListem.lightStem(u'أفتصربونني')
        >>> print ArListem.get_starword()
        أفت***ونني
        >>> print ArListem.get_left()
        3

    @return: the left position of stemming.
    @rtype: integer.
    """
    left_position = self.left
    return left_position
305
def get_right(self):
    """Return the right stemming position (where the suffix starts).

    This accessor only reads C{self.right}.

    Example:
        >>> ArListem = ArabicLightStemmer()
        >>> stem = ArListem.lightStem(u'أفتصربونني')
        >>> print ArListem.get_starword()
        أفت***ونني
        >>> print ArListem.get_right()
        6

    @return: the right position of stemming.
    @rtype: integer.
    """
    right_position = self.right
    return right_position
322
def get_stem(self, prefix_index=-1, suffix_index=-1):
    """Return the stem of the word treated by the stemmer.

    The stem is the slice of C{self.unvocalized} between the left and the
    right stemming positions. A negative index means "use the position
    stored on the instance" (C{self.left} / C{self.right}).

    Example:
        >>> ArListem = ArabicLightStemmer()
        >>> stem = ArListem.lightStem(u'أفتكاتبانني')
        >>> print ArListem.get_stem()
        كاتب

    @param prefix_index: the left stemming position;
        if negative, the stored left position is used.
    @type prefix_index: integer.
    @param suffix_index: the right stemming position;
        if negative, the stored right position is used.
    @type suffix_index: integer.
    @return: stem.
    @rtype: unicode.
    """
    start = prefix_index if prefix_index >= 0 else self.left
    end = suffix_index if suffix_index >= 0 else self.right
    return self.unvocalized[start:end]
346
def get_starstem(self, prefix_index=-1, suffix_index=-1):
    """Return the star form of the stem of the treated word.

    With both indexes negative, the stored star word is sliced between the
    stored positions (C{self.left}, C{self.right}) and returned as-is.
    Otherwise the slice is taken between the requested positions (a
    negative index falls back to the stored one) and every character that
    is not one of C{self.infix_letters} is replaced by C{self.joker}.

    Membership is tested per character instead of building a regular
    expression character class from C{self.infix_letters}: an empty letter
    set no longer raises, and characters such as '-' or '^' are taken
    literally rather than as regex syntax.

    Example:
        >>> ArListem = ArabicLightStemmer()
        >>> stem = ArListem.lightStem(u'أفتكاتبانني')
        >>> print ArListem.get_stem()
        كاتب
        >>> print ArListem.get_starstem()
        *ات*

    @param prefix_index: the left stemming position;
        if negative, the stored left position is used.
    @type prefix_index: integer.
    @param suffix_index: the right stemming position;
        if negative, the stored right position is used.
    @type suffix_index: integer.
    @return: star form of the stem.
    @rtype: unicode.
    """
    if prefix_index < 0 and suffix_index < 0:
        # Default positions: the stored star word is sliced without changes.
        return self.starword[self.left:self.right]
    left = prefix_index if prefix_index >= 0 else self.left
    right = suffix_index if suffix_index >= 0 else self.right
    infixes = self.infix_letters
    joker = self.joker
    # Letters pulled in from outside the default stem (e.g. prefix letters)
    # are masked unless they are infix letters; jokers stay jokers.
    return u"".join(
        [char if char in infixes else joker for char in self.starword[left:right]]
    )
380
381
382
383
def get_prefix(self, prefix_index=-1):
    """Return the prefix of the word treated by the stemmer.

    The prefix is the part of C{self.unvocalized} before the left stemming
    position. A negative index means "use the stored C{self.left}".

    Example:
        >>> ArListem = ArabicLightStemmer()
        >>> stem = ArListem.lightStem(u'أفتكاتبانني')
        >>> print ArListem.get_prefix()
        أفت

    @param prefix_index: the left stemming position;
        if negative, the stored left position is used.
    @type prefix_index: integer.
    @return: prefix.
    @rtype: unicode.
    """
    cut = self.left if prefix_index < 0 else prefix_index
    return self.unvocalized[:cut]
403
404
def get_suffix(self, suffix_index=-1):
    """Return the suffix of the word treated by the stemmer.

    The suffix is the part of C{self.unvocalized} from the right stemming
    position to the end. A negative index means "use the stored
    C{self.right}".

    Example:
        >>> ArListem = ArabicLightStemmer()
        >>> stem = ArListem.lightStem(u'أفتكاتبانني')
        >>> print ArListem.get_suffix()
        انني

    @param suffix_index: the right stemming position;
        if negative, the stored right position is used.
    @type suffix_index: integer.
    @return: suffix.
    @rtype: unicode.
    """
    cut = self.right if suffix_index < 0 else suffix_index
    return self.unvocalized[cut:]
424
def get_affix(self, prefix_index=-1, suffix_index=-1):
    """Return the affix of the treated word as "prefix-suffix".

    The prefix and the suffix are obtained from C{get_prefix} and
    C{get_suffix} and joined with a hyphen.

    Example:
        >>> ArListem = ArabicLightStemmer()
        >>> stem = ArListem.lightStem(u'أفتكاتبانني')
        >>> print ArListem.get_affix()
        أفت-انني

    @param prefix_index: the left stemming position, passed to get_prefix;
        if negative, the stored left position is used.
    @type prefix_index: integer.
    @param suffix_index: the right stemming position, passed to get_suffix;
        if negative, the stored right position is used.
    @type suffix_index: integer.
    @return: the prefix and the suffix separated by "-".
    @rtype: unicode.
    """
    prefix = self.get_prefix(prefix_index)
    suffix = self.get_suffix(suffix_index)
    return u"-".join((prefix, suffix))
444
def get_affix_tuple(self, prefix_index=-1, suffix_index=-1):
    """Return prefix, suffix, stem and root of the treated word as a dict.

    The four parts come from C{get_prefix}, C{get_suffix}, C{get_stem} and
    C{get_root}, called in that order with the given positions.

    Example:
        >>> ArListem = ArabicLightStemmer()
        >>> stem = ArListem.lightStem(u'أفتضاربانني')
        >>> print ArListem.get_affix_tuple()
        {'prefix': u'أفت', 'root': u'ضرب', 'suffix': u'انني', 'stem': u'ضارب'}

    @param prefix_index: the left stemming position;
        if negative, the stored left position is used.
    @type prefix_index: integer.
    @param suffix_index: the right stemming position;
        if negative, the stored right position is used.
    @type suffix_index: integer.
    @return: dict with the keys 'prefix', 'suffix', 'stem' and 'root'.
    @rtype: dict.
    """
    prefix = self.get_prefix(prefix_index)
    suffix = self.get_suffix(suffix_index)
    stem = self.get_stem(prefix_index, suffix_index)
    # get_root goes last, as in the dict literal it replaces.
    root = self.get_root(prefix_index, suffix_index)
    return {
        'prefix': prefix,
        'suffix': suffix,
        'stem': stem,
        'root': root,
    }
468
469
470
def lightStem(self, word):
    """Stem an Arabic word and return its stem.

    An empty word yields an empty string without touching the instance.
    Otherwise the word is run through C{transformToStars}, then
    C{extract_root}, and the result of C{get_stem()} is returned. As
    documented for this stemmer, the stemming positions (left, right) are
    kept on the instance, so the other parts (prefix, suffix, root, star
    forms) can be read afterwards.
    NOTE(review): transformToStars is not visible here; presumably it is
    the step that stores those positions -- confirm.

    Example:
        >>> ArListem = ArabicLightStemmer()
        >>> stem = ArListem.lightStem(u'أفتضاربانني')
        >>> print ArListem.get_stem()
        ضارب
        >>> print ArListem.get_root()
        ضرب

    @param word: the input word.
    @type word: unicode.
    @return: stem.
    @rtype: unicode.
    """
    if word == u'':
        return u''
    # The three returned values are not used here; the call is kept for
    # what it does to the instance.
    _star_form, _left, _right = self.transformToStars(word)
    self.extract_root()
    return self.get_stem()
500
579
def extract_root(self, prefix_index=-1, suffix_index=-1):
    """Compute the root of the treated word, store it and return it.

    The star stem (C{get_starstem}) and the stem (C{get_stem}) are taken
    for the given positions. When both have the same length, the root is
    made of the stem letters found where the star stem holds the joker.
    When the lengths differ, the whole stem is used as the root. The
    result is stored in C{self.root}.

    Example:
        >>> ArListem = ArabicLightStemmer()
        >>> stem = ArListem.lightStem(u'أفتصربونني')
        >>> print ArListem.get_starword()
        أفت***ونني
        >>> print ArListem.get_root()
        ضرب

    @param prefix_index: the left stemming position;
        if negative, the stored left position is used.
    @type prefix_index: integer.
    @param suffix_index: the right stemming position;
        if negative, the stored right position is used.
    @type suffix_index: integer.
    @return: root.
    @rtype: unicode.
    """
    star_form = self.get_starstem(prefix_index, suffix_index)
    plain_form = self.get_stem(prefix_index, suffix_index)
    if len(star_form) != len(plain_form):
        # The two forms cannot be aligned letter by letter.
        extracted = plain_form
    else:
        extracted = u""
        for mark, letter in zip(star_form, plain_form):
            if mark == self.joker:
                extracted += letter
    self.root = extracted
    return extracted
614
615
616
617
618
619
620
621
623 """ Generate the set of all possible segmentation positions (left, right) of the given word.
624 Example:
625 >>> ArListem=ArabicLightStemmer();
626 >>> word=u'فتضربين'
627 >>> print ArListem.segment(word);
628 set([(1, 5), (2, 5), (0, 7)])
629 @param word: the word to segment; it is also stored in self.word.
630 @return: set of segmentation positions; it is also stored in self.segment_list.
631 @rtype: set of tuple of integer.
632 """
633 self.word=word;
634 word,left,right=self.transformToStars(word);
635 # From here on, "word" is the first value returned by transformToStars, not the raw input.
636 # NOTE(review): indentation was lost in this listing, so which statement the "else" below pairs with cannot be told from here -- confirm against the real source.
637 # NOTE(review): "affix" below is assigned but never read in the visible lines.
638 ln=len(word)
639 list_seg=set([(0,ln)]);
640 for i in range(left+1):
641 if right<ln:
642 for j in range(right-1,ln+1):
643 suffix=word[j:];
644 prefix=word[:i];
645 affix='-'.join([word[:i],word[j:]]);
646 if suffix in self.suffix_list and prefix in self.prefix_list:
647 list_seg.add((i,j));
648 else:
649 prefix=word[:i];
650 if prefix in self.prefix_list:
651 list_seg.add((i,right));
652
653 self.segment_list=list_seg;
654 return list_seg;
655
def get_segment_list(self):
    """Return the segmentation positions (left, right) kept by the stemmer.

    This accessor only reads C{self.segment_list}, the value stored by
    C{segment}.

    Example:
        >>> ArListem = ArabicLightStemmer()
        >>> ArListem.segment(u'فتضربين')
        >>> print ArListem.get_segment_list()
        set([(1, 5), (2, 5), (0, 7)])

    @return: set of segmentation positions.
    @rtype: set of tuple of integer.
    """
    segments = self.segment_list
    return segments
669
670
def get_affix_list(self):
    """Return one affix dict per stored segmentation of the treated word.

    For every (left, right) entry of C{self.segment_list},
    C{get_affix_tuple} is called with these two positions; the results are
    collected in iteration order.

    Example:
        >>> ArListem = ArabicLightStemmer()
        >>> ArListem.segment(u'فتضربين')
        >>> print ArListem.get_affix_list()
        [{'prefix': u'ف', 'root': u'ضرب', 'suffix': u'\u064aن', 'stem': u'تضرب'},
        {'prefix': u'فت', 'root': u'ضرب', 'suffix': u'\u064aن', 'stem': u'ضرب'},
        {'prefix': u'', 'root': u'فضربن', 'suffix': u'', 'stem': u'فتضرب\u064aن'}]

    @return: list of affix dicts.
    @rtype: list of dict.
    """
    return [
        self.get_affix_tuple(positions[0], positions[1])
        for positions in self.segment_list
    ]
689
690
691
692
693
694
696 """
697 Normalize a word.
698 Convert some leters forms into unified form.
699 @param word: the input word, if word is empty, the word member of the class is normalized.
700 @type word: unicode.
701 @return: normalized word.
702 @rtype: unicode.
703 """
704
705 if word==u'' and self.word==u"":
706 return u"";
707 elif word!= u'':
708 self.word=word;
709 else:
710 word=self.word;
711 self.normalized=normalize_searchtext(word);
712 return self.normalized;
713