class Cassiopee::Crawler

Base class to index and search through a string

Constants

FILE_SUFFIX_EXT
FILE_SUFFIX_POS
METHOD_DIRECT
METHOD_SUFFIX
SUFFIXLEN

Attributes

ambiguous[RW]

Ambiguity map (Hash)

comments[RW]

Array of comment characters to skip lines in input sequence file

file_suffix[RW]

Suffix files name/path

maxthread[RW]

Max number fo threads to use (not yet used)

method[RW]

Method for search FORCE or SUFFIX

  • SUFFIX loads all suffixes and search through them afterwards, interesting for multiple searches (suffixes are reused)

  • FORCE checks matches while crossing the suffixes. Does not keep parsed data for later search FORCE method does not yet support optimal filters

useAmbiguity[RW]

Use alphabet ambiguity (dna/rna) in search, automatically set with loadAmbiguityFile

useCache[RW]

Manage basic cache to store previous match

use_store[RW]

Use persistent suffix file ?

Public Class Methods

new() click to toggle source
# File lib/cassiopee.rb, line 299
def initialize
    @useAmbiguity = false
    @ambiguous = nil
                @useCache = false
    @file_suffix = "crawler"
                
                @method = 0
                
                @prev_min_position = 0
                @prev_max_position = 0
        
                
    @suffix = nil
    @suffixmd5 = nil
    @position = 0
    
    @suffixes = Hash.new
    
    @matches = Array.new
    @curmatch = 0
    @use_store = false
    
    @sequence = nil
                
                @comments = Array["#"]
                
                @cache = Cassiopee::CrawlerCache.new

end

Public Instance Methods

clear() click to toggle source

Clear suffixes in memory If using #use_store, clear the store too

# File lib/cassiopee.rb, line 340
def clear
        @suffixes = Hash.new
                @matches.clear
                @pattern = nil
                @prev_max_position = 0
                @prev_min_position = 0
                @cache.clearCache()
        File.delete(@file_suffix+FILE_SUFFIX_POS) unless !File.exists?(@file_suffix+FILE_SUFFIX_POS)
end
extractSuffix(start,len) click to toggle source

Extract un suffix from suffix file based on md5 match

# File lib/cassiopee.rb, line 586
def extractSuffix(start,len)
        sequence = ''
        begin
                file = File.new(@file_suffix+FILE_SUFFIX_EXT, "r")
                      file.pos = start
                                sequence = file.read(len)
                file.close
        rescue => err
                puts "Exception: #{err}"
                return nil
        end
        return sequence
end
filter(posArray) click to toggle source

Filter the array of positions with defined position filter

# File lib/cassiopee.rb, line 563
def filter(posArray)
        $log.debug("filter the position with " << @min_position.to_s << " and " << @max_position.to_s)
        if(@min_position==0 && @max_position==0)
                return posArray
        end
        filteredArray = Array.new
        i = 0
        posArray.each do |pos|
                if(i==0)
                        # First elt of array is match length
                        filteredArray << pos
                end
                if(i>0 && pos>=@min_position && pos<=@max_position)
                        filteredArray << pos
                end
                i +=1
        end
        return filteredArray
end
filterCost() click to toggle source
# File lib/cassiopee.rb, line 333
def filterCost
  filterOptimal(1)
end
filterLength() click to toggle source
# File lib/cassiopee.rb, line 329
def filterLength
  filterOptimal(0)
end
filter_position(min,max) click to toggle source

Filter matches to be between min and max start position If not using #use_store, search speed is improved but existing indexes are cleared If max=0, then max is string length Must be called after index creation or load

# File lib/cassiopee.rb, line 435
def filter_position(min,max)
    if(!use_store)
                clear()
        end
                @prev_min_position = @min_position
                @prev_max_position = @max_position
        @min_position = min
        @max_position = max
end
indexFile(f) click to toggle source

Index an input file Clear existing indexes

# File lib/cassiopee.rb, line 359
def indexFile(f)
 # Parse file, map letters to reduced alphabet
 # Later on, use binary map instead of ascii map
 # Take all suffix, order by length, link to position map on other file
 # Store md5 for easier compare? + 20 bytes per suffix
    @sequence = readSequence(f)
    clear()
    @min_position = 0
        @max_position = 0
end
indexString(s) click to toggle source

Index an input string Clear existing indexes

# File lib/cassiopee.rb, line 373
def indexString(s)
    @sequence = s
    File.open(@file_suffix+FILE_SUFFIX_EXT, 'w') do |data|
        data.puts(@sequence)
    end
    clear()
    @min_position = 0
        @max_position = 0
end
loadAmbiguityFile(f) click to toggle source

Load ambiguity rules from a file File format should be:

  • A=B,C D=E,F …

# File lib/cassiopee.rb, line 390
def loadAmbiguityFile(f)
  if(!File.exists?(f))
     $log.error("File "<< f << "does not exists")
         exit(1)
  end
  @ambiguous = Hash.new
  file = File.new(f, "r")
  while (line = file.gets)
    definition = line.downcase.chomp
        ambdef = definition.split('=')
        ambequal = ambdef[1].split(',')
        @ambiguous[ambdef[0]] = ambequal
  end
  @useAmbiguity = true
  $log.debug("loaded ambiguity rules: " << @ambiguous.inspect())
  file.close

end
loadIndex() click to toggle source

Load sequence from a previous index command

# File lib/cassiopee.rb, line 411
    def loadIndex
            seq = ''
            begin
                    file = File.new(@file_suffix+FILE_SUFFIX_EXT, "r")
                    while (line = file.gets)
                            input = line.downcase.chomp
                            seq << input
                    end
                    file.close
            rescue => err
                    $log.error("Exception: #{err}")
                    exit()
            end
            @sequence = seq
            clear()
@min_position = 0
    @max_position = 0
    end
next() click to toggle source

Iterates over matches

# File lib/cassiopee.rb, line 602
def next
        if(@curmatch<@matches.length)
                @curmatch = @curmatch + 1
                return @matches[@curmatch-1]
        else
                @curmatch = 0
                return nil
        end
end
searchApproximate(s,edit) click to toggle source

Search an approximate string

  • support insertion, deletion, substitution

  • If edit > 0, use Hamming

  • Else use Levenshtein

# File lib/cassiopee.rb, line 494
def searchApproximate(s,edit)
        
        if(edit==0 && !@useAmbiguity) 
                return searchExact(s)
        end
                allowederrors = edit
        if(edit>=0)
          useHamming = true
          minmatchsize = s.length
          maxmatchsize = s.length
                  updateCache(1,edit)
                  @matches = @cache.loadCache()
        else
          useHamming = false
          edit = edit * (-1)
      minmatchsize = s.length - edit
      maxmatchsize = s.length + edit
                  updateCache(2,edit)
                  @matches = @cache.loadCache()
    end
                
                if(@matches.length>0)
                        return @matches
                end
                
                s = s.downcase
    
                
    #@matches.clear
                @pattern = Digest::MD5.hexdigest(s)
                
                parseSuffixes(@sequence,minmatchsize,maxmatchsize,allowederrors,s)
    
                return cache?(@matches) unless(method == METHOD_SUFFIX)
                
 
    
               
        @suffixes.each do |md5val,posArray|
                if(md5val == SUFFIXLEN)
                        next
                end
                if (md5val == @pattern)
                        filteredPosArray = filter(posArray)
            match = Array[md5val, 0, filteredPosArray]
                      $log.debug "Match: " << match.inspect
                      @matches << match
              else
                      if(posArray[0]>= minmatchsize && posArray[0] <= maxmatchsize)
                              # Get string
                              seq = extractSuffix(posArray[1],posArray[0])
                                        errors = isApproximateEqual?(seq,s,useHamming,edit)
                                        
                              if(errors>=0)
                                      filteredPosArray = filter(posArray)
                                  match = Array[md5val, errors, filteredPosArray]
                                      $log.debug "Match: " << match.inspect
                                      @matches << match
                              end
                      end
        end
        
        end
        
        return cache?(@matches) 
end
searchExact(s) click to toggle source

Search exact match

# File lib/cassiopee.rb, line 447
def searchExact(s)
        
        if(@useAmbiguity)
          return searchApproximate(s,0)
        end
        
s = s.downcase

        updateCache(0,0)
        @matches = @cache.loadCache()
        
        if(@matches.length>0)
                return cache?(@matches)
        end
        
        #@matches.clear
        
        @pattern = Digest::MD5.hexdigest(s)
        
        parseSuffixes(@sequence,s.length,s.length,0,s)

        return @matches unless(method == METHOD_SUFFIX)
        
 # Search required length, compare (compare md5?)
 # MD5 = 128 bits, easier to compare for large strings
    
                
                matchsize = @pattern.length
                
    @suffixes.each do |md5val,posArray|
        if (isMatchEqual?(md5val))
            match = Array[md5val, 0, posArray]
                      $log.debug "Match: " << match.inspect
                      @matches << match
        end
    end
return cache?(@matches) 

end
setLogLevel(level) click to toggle source

Set Logger level

# File lib/cassiopee.rb, line 352
def setLogLevel(level)
    $log.level = level
end
to_pos() click to toggle source
# File lib/cassiopee.rb, line 612
def to_pos
        positions = Hash.new
        @matches.each do |match|
          # match = Array[md5val, errors, posArray]
          i=0
                  len = 0
          match[2].each do |pos|
            if(i==0)
              len = pos
            else
              if(positions.has_key?(pos))
               posmatch = positions[pos]
               posmatch << Array[len,match[1]]

              
              else
                posmatch = Array.new
                posmatch << Array[len,match[1]]
                positions[pos] = posmatch
              end
            end
            i += 1
          end
         
        end
    return positions.sort
end
to_s() click to toggle source
# File lib/cassiopee.rb, line 640
def to_s
        puts '{ matches: "' << @matches.length << '" }'
end

Private Instance Methods

cache?(results) click to toggle source

If cache is used, store results for later retrieval, else return matches directly

# File lib/cassiopee.rb, line 647
def cache?(results)
        if(@useCache)
                @cache.saveCache(results)
        end
        
        return results
end
isApproximateEqual?(s,pattern,useHamming,edit) click to toggle source

check if string is approximatly equal to pattern s: string to compare pattern: base pattern used useHamming: use Hamming or edit distance edit : allowed errors

# File lib/cassiopee.rb, line 679
def isApproximateEqual?(s,pattern,useHamming,edit)
        errors = -1
                        s.extend(Cassiopee)
              if(useHamming)
                          if(@useAmbiguity && @ambiguous!=nil)
                            errors = s.computeHammingAmbiguous(pattern,edit,@ambiguous)
                          else
                  errors = s.computeHamming(pattern,edit)
                          end
              else
                          if(@useAmbiguity && @ambiguous!=nil)
                            errors = s.computeLevenshteinAmbiguous(pattern,edit,@ambigous)
                          else
                  errors = s.computeLevenshtein(pattern,edit)
                          end                                             
              end
end
isMatchEqual?(s) click to toggle source

check if md5 is equal to pattern

# File lib/cassiopee.rb, line 667
def isMatchEqual?(s)
        if(@pattern == s)
                return true
        end
        return false
end
parseSuffixes(s,minlen,maxlen,edit=0,pat=nil) click to toggle source

Parse input string

  • creates a suffix file

  • creates a suffix position file

# File lib/cassiopee.rb, line 705
            def parseSuffixes(s,minlen,maxlen,edit=0,pat=nil)
                                                        
            # Controls
            if(minlen<=0) 
                minlen = 1
            end
            if(maxlen>@sequence.length)
                maxlen = @sequence.length
            end
            
            if(!use_store)
                minpos = @min_position
                if(@max_position==0)
                        maxpos = @sequence.length
                else
                        maxpos = @max_position
                end
            else
                minpos = 0
                maxpos = @sequence.length - minlen
            end
            
                suffixlen = nil
                $log.info('Start indexing')
                loaded = false
                # Hash in memory already contain suffixes for searched lengths
                if(@suffixes != nil && !@suffixes.empty?)
                        suffixlen = @suffixes[SUFFIXLEN]
                        if(suffixlen!=nil && !suffixlen.empty?)
                                loaded = true
                                (maxlen).downto(minlen)  do |len|
                                        if(suffixlen.index(len)==nil)
                                                loaded = false
                                                break
                                        end
                                end
                        end
                end
                
                if(@use_store && loaded)
                        $log.debug('already in memory, skip file loading')
                end
                
                # If not already in memory
                if(@use_store && !loaded)
                        @suffixes = loadSuffixes(@file_suffix+FILE_SUFFIX_POS)
                        suffixlen = @suffixes[SUFFIXLEN]
                end
                
                nbSuffix = 0
                changed = false
                
                # Load suffix between maxlen and minlen
                (maxlen).downto(minlen)  do |i|
                        $log.debug('parse for length ' << i.to_s)
                        if(suffixlen!=nil && suffixlen.index(i)!=nil)
                                $log.debug('length '<<i "                                next
                        end
                        changed = true
                                        prev_progress = -1
                         (minpos..(maxpos)).each do |j|
                               # if position+length longer than sequence length, skip it
                               if(j+i>@sequence.length)
                                       next
                               end
                        @suffix = s[j,i]
                        @suffixmd5 = Digest::MD5.hexdigest(@suffix)
                        @position = j
                                                progress = (@position * 100).div(@sequence.length)
                                                if((progress % 10) == 0 && progress > prev_progress)
                                                        prev_progress = progress
                                                        $log.debug("progress: " << progress.to_s)
                                                end
                                        
                                                if(method==METHOD_DIRECT)
                                                
                                                        if(edit==0 && !@useAmbiguity)
                                                                if(isMatchEqual?(@suffixmd5))
                                                                        errors = 0
                                                                else
                                                                        errors = -1
                                                                end
                                                        else
                                                        
                                                        if(edit>=0)
                                                                useHamming = true
                                                                allowederrors = edit
                                                        else
                                                                useHamming = false
                                                                allowederrors = edit * (-1)
                                                        end
                                                                errors = isApproximateEqual?(@suffix,pat,useHamming,allowederrors)
                                                        end
                                                        
                                                        
                                                        if(errors>=0)
                                                                match = Array[@suffixmd5, errors, Array[i,j]]
                                                                $log.debug "Match: " << match.inspect
                                                                @matches << match
                                                        end
                                                        
                                                        
                                                        
                                                else
                          nbSuffix += addSuffix(@suffixmd5, @position,i)
                                                end
                    end
                    $log.debug("Nb suffix found: " << nbSuffix.to_s << ' for length ' << i.to_s) unless method==METHOD_DIRECT
                end
            
                
                if(@use_store && changed)
                        $log.info("Store suffixes")
                        marshal_dump = Marshal.dump(@suffixes)
                        sfxpos = File.new(@file_suffix+FILE_SUFFIX_POS,'w')
                        sfxpos = Zlib::GzipWriter.new(sfxpos)
                        sfxpos.write marshal_dump
                        sfxpos.close
                end
                $log.info('End of indexing')
            end
          
          
                # Add a suffix in Hashmap
                
                def addSuffix(md5val,position,len)
                        if(@suffixes.has_key?(md5val))
                    # Add position
                        @suffixes[md5val] << position
                else
                    # Add position, write new suffix
                    # First elt is size of elt
                        @suffixes[md5val] = Array[len, position]
                        if(@suffixes.has_key?(SUFFIXLEN))
                                @suffixes[SUFFIXLEN] << len
                        else
                                @suffixes[SUFFIXLEN] = Array[len]
                        end
                end
                        return 1
                end
          
                # read input string, and concat content
                
            def readSequence(s)
                $log.debug('read input sequence')
                 counter = 1
                 sequence = ''
                    begin
                        file = File.new(s, "r")
                        File.delete(@file_suffix+FILE_SUFFIX_EXT) unless !File.exists?(@file_suffix+FILE_SUFFIX_EXT)
                                                File.open(@file_suffix+FILE_SUFFIX_EXT, 'w') do |data|
                          while (line = file.gets)
                                        counter = counter + 1
                                input = line.downcase.chomp
                                                                skip = false
                                                                comments.each do |c|
                                                                        if(input[0] == c[0])
                                                                                # Line start with a comment char, skip it
                                                                                $log.debug("skip line")
                                                                                skip = true
                                                                                break
                                                                        end
                                                                end
                                                                if(!skip)
                                                                        sequence << input
                                                                        data.puts input
                                                                end
                          end
                        
                                                end
                        file.close
                    rescue => err
                        puts "Exception: #{err}"
                        err
                    end
                    $log.debug('data file created')
                    return sequence
             end
             
            # Load suffix position file in memory 
            
            def loadSuffixes(file_name)
                return Hash.new unless File.exists?(@file_suffix+FILE_SUFFIX_POS)
                begin
                  file = Zlib::GzipReader.open(file_name)
                rescue Zlib::GzipFile::Error
                  file = File.open(file_name, 'r')
                ensure
                    obj = Marshal.load file.read
                    file.close
                    return obj
                end
            end
                        
                        # Filter @matches to keep only the longest or the error less matches for a same start position
                        
                        def filterOptimal(type)
        
                positions = Hash.new
                @matches.each do |match|
                  # match = Array[md5val, errors, posArray]
                  i=0
                          len = 0
                  match[2].each do |pos|
                    if(i==0)
                      len = pos
                    else
                      if(positions.has_key?(pos))
                       posmatch = positions[pos]
                       posmatch << Array[len,match[1],match[0]]
                       #positions[pos] << posmatch
                      
                      else
                        posmatch = Array.new
                        posmatch << Array[len,match[1],match[0]]
                        positions[pos] = posmatch
                      end
                    end
                    i += 1
                  end
                end
                        
            matchtoremove = Array.new
                        positions.each do |pos,posmatch|
                         
                            optimal = nil
                                match = nil
                            count = 0
                                newoptimal = nil
                                newmatch = nil
                                
                            (0..posmatch.length-1).each do |i|
                                solution = posmatch[i]                                
                                  if(i==0)
                                    if(type==0)
                                # length
                                  optimal = solution[0]
                                else
                                # cost
                                  optimal = solution[1]      
                                end                          
                                match = solution[2].to_s
                                        #count += 1
                                        next
                                  end
                                  
                              newmatch = solution[2].to_s
                                  if(type==0)
                              # length
                                newoptimal = solution[0]
                                    if(newoptimal.to_i>optimal.to_i)
                                      optimal = newoptimal
                                          matchtoremove << match
                                      match = newmatch
                                        else
                                          matchtoremove << newmatch
                                    end
                              else
                              # cost
                                    newoptimal = solution[1]
                                    if(newoptimal<optimal)
                                      optimal = newoptimal
                                          matchtoremove << match
                                       match = newmatch
                                        else
                                          matchtoremove << newmatch
                                    end             
                              end
                                  count += 1
                                        
                                end                 
                          
                        end
                        
                        newmatches = Array.new
                        @matches.each do |match|
                          found = false
                          matchtoremove.each do |item|
                            if(match[0]==item)
                                  found = true
                                  break
                                end
                          end
                          if(!found)
                           newmatches << match
                          end
                        end
                        @matches = newmatches
                        
                        end
             
    end


end
")
updateCache(method,errors) click to toggle source

Update cache object with current object parameters

  • method: 0 -> exact, 1 -> hamming, 2 -> edit

# File lib/cassiopee.rb, line 657
def updateCache(method,errors)
        @cache.file_suffix = @file_suffix
        @cache.min_position = @min_position
        @cache.max_position = @max_position
        @cache.method = method
        @cache.errors = errors
end