Base class to index and search through a string
Ambiguity map (Hash)
Array of comment characters to skip lines in input sequence file
Suffix files name/path
Max number fo threads to use (not yet used)
Method for search FORCE or SUFFIX
SUFFIX loads all suffixes and search through them afterwards, interesting for multiple searches (suffixes are reused)
FORCE checks matches while crossing the suffixes. Does not keep parsed data for later search FORCE method does not yet support optimal filters
Use alphabet ambiguity (dna/rna) in search, automatically set with loadAmbiguityFile
Manage basic cache to store previous match
Use persistent suffix file ?
# File lib/cassiopee.rb, line 299 def initialize @useAmbiguity = false @ambiguous = nil @useCache = false @file_suffix = "crawler" @method = 0 @prev_min_position = 0 @prev_max_position = 0 @suffix = nil @suffixmd5 = nil @position = 0 @suffixes = Hash.new @matches = Array.new @curmatch = 0 @use_store = false @sequence = nil @comments = Array["#"] @cache = Cassiopee::CrawlerCache.new end
Clear suffixes in memory If using #use_store, clear the store too
# File lib/cassiopee.rb, line 340 def clear @suffixes = Hash.new @matches.clear @pattern = nil @prev_max_position = 0 @prev_min_position = 0 @cache.clearCache() File.delete(@file_suffix+FILE_SUFFIX_POS) unless !File.exists?(@file_suffix+FILE_SUFFIX_POS) end
Extract un suffix from suffix file based on md5 match
# File lib/cassiopee.rb, line 586 def extractSuffix(start,len) sequence = '' begin file = File.new(@file_suffix+FILE_SUFFIX_EXT, "r") file.pos = start sequence = file.read(len) file.close rescue => err puts "Exception: #{err}" return nil end return sequence end
Filter the array of positions with defined position filter
# File lib/cassiopee.rb, line 563 def filter(posArray) $log.debug("filter the position with " << @min_position.to_s << " and " << @max_position.to_s) if(@min_position==0 && @max_position==0) return posArray end filteredArray = Array.new i = 0 posArray.each do |pos| if(i==0) # First elt of array is match length filteredArray << pos end if(i>0 && pos>=@min_position && pos<=@max_position) filteredArray << pos end i +=1 end return filteredArray end
# File lib/cassiopee.rb, line 333 def filterCost filterOptimal(1) end
# File lib/cassiopee.rb, line 329 def filterLength filterOptimal(0) end
Filter matches to be between min and max start position If not using #use_store, search speed is improved but existing indexes are cleared If max=0, then max is string length Must be called after index creation or load
# File lib/cassiopee.rb, line 435 def filter_position(min,max) if(!use_store) clear() end @prev_min_position = @min_position @prev_max_position = @max_position @min_position = min @max_position = max end
Index an input file Clear existing indexes
# File lib/cassiopee.rb, line 359 def indexFile(f) # Parse file, map letters to reduced alphabet # Later on, use binary map instead of ascii map # Take all suffix, order by length, link to position map on other file # Store md5 for easier compare? + 20 bytes per suffix @sequence = readSequence(f) clear() @min_position = 0 @max_position = 0 end
Index an input string Clear existing indexes
# File lib/cassiopee.rb, line 373 def indexString(s) @sequence = s File.open(@file_suffix+FILE_SUFFIX_EXT, 'w') do |data| data.puts(@sequence) end clear() @min_position = 0 @max_position = 0 end
Load ambiguity rules from a file File format should be:
A=B,C D=E,F …
# File lib/cassiopee.rb, line 390 def loadAmbiguityFile(f) if(!File.exists?(f)) $log.error("File "<< f << "does not exists") exit(1) end @ambiguous = Hash.new file = File.new(f, "r") while (line = file.gets) definition = line.downcase.chomp ambdef = definition.split('=') ambequal = ambdef[1].split(',') @ambiguous[ambdef[0]] = ambequal end @useAmbiguity = true $log.debug("loaded ambiguity rules: " << @ambiguous.inspect()) file.close end
Load sequence from a previous index command
# File lib/cassiopee.rb, line 411 def loadIndex seq = '' begin file = File.new(@file_suffix+FILE_SUFFIX_EXT, "r") while (line = file.gets) input = line.downcase.chomp seq << input end file.close rescue => err $log.error("Exception: #{err}") exit() end @sequence = seq clear() @min_position = 0 @max_position = 0 end
Iterates over matches
# File lib/cassiopee.rb, line 602 def next if(@curmatch<@matches.length) @curmatch = @curmatch + 1 return @matches[@curmatch-1] else @curmatch = 0 return nil end end
Search an approximate string
support insertion, deletion, substitution
If edit > 0, use Hamming
Else use Levenshtein
# File lib/cassiopee.rb, line 494 def searchApproximate(s,edit) if(edit==0 && !@useAmbiguity) return searchExact(s) end allowederrors = edit if(edit>=0) useHamming = true minmatchsize = s.length maxmatchsize = s.length updateCache(1,edit) @matches = @cache.loadCache() else useHamming = false edit = edit * (-1) minmatchsize = s.length - edit maxmatchsize = s.length + edit updateCache(2,edit) @matches = @cache.loadCache() end if(@matches.length>0) return @matches end s = s.downcase #@matches.clear @pattern = Digest::MD5.hexdigest(s) parseSuffixes(@sequence,minmatchsize,maxmatchsize,allowederrors,s) return cache?(@matches) unless(method == METHOD_SUFFIX) @suffixes.each do |md5val,posArray| if(md5val == SUFFIXLEN) next end if (md5val == @pattern) filteredPosArray = filter(posArray) match = Array[md5val, 0, filteredPosArray] $log.debug "Match: " << match.inspect @matches << match else if(posArray[0]>= minmatchsize && posArray[0] <= maxmatchsize) # Get string seq = extractSuffix(posArray[1],posArray[0]) errors = isApproximateEqual?(seq,s,useHamming,edit) if(errors>=0) filteredPosArray = filter(posArray) match = Array[md5val, errors, filteredPosArray] $log.debug "Match: " << match.inspect @matches << match end end end end return cache?(@matches) end
Search exact match
# File lib/cassiopee.rb, line 447 def searchExact(s) if(@useAmbiguity) return searchApproximate(s,0) end s = s.downcase updateCache(0,0) @matches = @cache.loadCache() if(@matches.length>0) return cache?(@matches) end #@matches.clear @pattern = Digest::MD5.hexdigest(s) parseSuffixes(@sequence,s.length,s.length,0,s) return @matches unless(method == METHOD_SUFFIX) # Search required length, compare (compare md5?) # MD5 = 128 bits, easier to compare for large strings matchsize = @pattern.length @suffixes.each do |md5val,posArray| if (isMatchEqual?(md5val)) match = Array[md5val, 0, posArray] $log.debug "Match: " << match.inspect @matches << match end end return cache?(@matches) end
Set Logger level
# File lib/cassiopee.rb, line 352 def setLogLevel(level) $log.level = level end
# File lib/cassiopee.rb, line 612 def to_pos positions = Hash.new @matches.each do |match| # match = Array[md5val, errors, posArray] i=0 len = 0 match[2].each do |pos| if(i==0) len = pos else if(positions.has_key?(pos)) posmatch = positions[pos] posmatch << Array[len,match[1]] else posmatch = Array.new posmatch << Array[len,match[1]] positions[pos] = posmatch end end i += 1 end end return positions.sort end
# File lib/cassiopee.rb, line 640 def to_s puts '{ matches: "' << @matches.length << '" }' end
If cache is used, store results for later retrieval, else return matches directly
# File lib/cassiopee.rb, line 647 def cache?(results) if(@useCache) @cache.saveCache(results) end return results end
check if string is approximatly equal to pattern s: string to compare pattern: base pattern used useHamming: use Hamming or edit distance edit : allowed errors
# File lib/cassiopee.rb, line 679 def isApproximateEqual?(s,pattern,useHamming,edit) errors = -1 s.extend(Cassiopee) if(useHamming) if(@useAmbiguity && @ambiguous!=nil) errors = s.computeHammingAmbiguous(pattern,edit,@ambiguous) else errors = s.computeHamming(pattern,edit) end else if(@useAmbiguity && @ambiguous!=nil) errors = s.computeLevenshteinAmbiguous(pattern,edit,@ambigous) else errors = s.computeLevenshtein(pattern,edit) end end end
check if md5 is equal to pattern
# File lib/cassiopee.rb, line 667 def isMatchEqual?(s) if(@pattern == s) return true end return false end
Parse input string
creates a suffix file
creates a suffix position file
# File lib/cassiopee.rb, line 705 def parseSuffixes(s,minlen,maxlen,edit=0,pat=nil) # Controls if(minlen<=0) minlen = 1 end if(maxlen>@sequence.length) maxlen = @sequence.length end if(!use_store) minpos = @min_position if(@max_position==0) maxpos = @sequence.length else maxpos = @max_position end else minpos = 0 maxpos = @sequence.length - minlen end suffixlen = nil $log.info('Start indexing') loaded = false # Hash in memory already contain suffixes for searched lengths if(@suffixes != nil && !@suffixes.empty?) suffixlen = @suffixes[SUFFIXLEN] if(suffixlen!=nil && !suffixlen.empty?) loaded = true (maxlen).downto(minlen) do |len| if(suffixlen.index(len)==nil) loaded = false break end end end end if(@use_store && loaded) $log.debug('already in memory, skip file loading') end # If not already in memory if(@use_store && !loaded) @suffixes = loadSuffixes(@file_suffix+FILE_SUFFIX_POS) suffixlen = @suffixes[SUFFIXLEN] end nbSuffix = 0 changed = false # Load suffix between maxlen and minlen (maxlen).downto(minlen) do |i| $log.debug('parse for length ' << i.to_s) if(suffixlen!=nil && suffixlen.index(i)!=nil) $log.debug('length '<<i " next end changed = true prev_progress = -1 (minpos..(maxpos)).each do |j| # if position+length longer than sequence length, skip it if(j+i>@sequence.length) next end @suffix = s[j,i] @suffixmd5 = Digest::MD5.hexdigest(@suffix) @position = j progress = (@position * 100).div(@sequence.length) if((progress % 10) == 0 && progress > prev_progress) prev_progress = progress $log.debug("progress: " << progress.to_s) end if(method==METHOD_DIRECT) if(edit==0 && !@useAmbiguity) if(isMatchEqual?(@suffixmd5)) errors = 0 else errors = -1 end else if(edit>=0) useHamming = true allowederrors = edit else useHamming = false allowederrors = edit * (-1) end errors = isApproximateEqual?(@suffix,pat,useHamming,allowederrors) end if(errors>=0) match = Array[@suffixmd5, errors, Array[i,j]] $log.debug "Match: " << match.inspect @matches << match end else nbSuffix += addSuffix(@suffixmd5, @position,i) end end $log.debug("Nb suffix found: " << nbSuffix.to_s << ' for length ' << i.to_s) unless method==METHOD_DIRECT end if(@use_store && changed) $log.info("Store suffixes") marshal_dump = Marshal.dump(@suffixes) sfxpos = File.new(@file_suffix+FILE_SUFFIX_POS,'w') sfxpos = Zlib::GzipWriter.new(sfxpos) sfxpos.write marshal_dump sfxpos.close end $log.info('End of indexing') end # Add a suffix in Hashmap def addSuffix(md5val,position,len) if(@suffixes.has_key?(md5val)) # Add position @suffixes[md5val] << position else # Add position, write new suffix # First elt is size of elt @suffixes[md5val] = Array[len, position] if(@suffixes.has_key?(SUFFIXLEN)) @suffixes[SUFFIXLEN] << len else @suffixes[SUFFIXLEN] = Array[len] end end return 1 end # read input string, and concat content def readSequence(s) $log.debug('read input sequence') counter = 1 sequence = '' begin file = File.new(s, "r") File.delete(@file_suffix+FILE_SUFFIX_EXT) unless !File.exists?(@file_suffix+FILE_SUFFIX_EXT) File.open(@file_suffix+FILE_SUFFIX_EXT, 'w') do |data| while (line = file.gets) counter = counter + 1 input = line.downcase.chomp skip = false comments.each do |c| if(input[0] == c[0]) # Line start with a comment char, skip it $log.debug("skip line") skip = true break end end if(!skip) sequence << input data.puts input end end end file.close rescue => err puts "Exception: #{err}" err end $log.debug('data file created') return sequence end # Load suffix position file in memory def loadSuffixes(file_name) return Hash.new unless File.exists?(@file_suffix+FILE_SUFFIX_POS) begin file = Zlib::GzipReader.open(file_name) rescue Zlib::GzipFile::Error file = File.open(file_name, 'r') ensure obj = Marshal.load file.read file.close return obj end end # Filter @matches to keep only the longest or the error less matches for a same start position def filterOptimal(type) positions = Hash.new @matches.each do |match| # match = Array[md5val, errors, posArray] i=0 len = 0 match[2].each do |pos| if(i==0) len = pos else if(positions.has_key?(pos)) posmatch = positions[pos] posmatch << Array[len,match[1],match[0]] #positions[pos] << posmatch else posmatch = Array.new posmatch << Array[len,match[1],match[0]] positions[pos] = posmatch end end i += 1 end end matchtoremove = Array.new positions.each do |pos,posmatch| optimal = nil match = nil count = 0 newoptimal = nil newmatch = nil (0..posmatch.length-1).each do |i| solution = posmatch[i] if(i==0) if(type==0) # length optimal = solution[0] else # cost optimal = solution[1] end match = solution[2].to_s #count += 1 next end newmatch = solution[2].to_s if(type==0) # length newoptimal = solution[0] if(newoptimal.to_i>optimal.to_i) optimal = newoptimal matchtoremove << match match = newmatch else matchtoremove << newmatch end else # cost newoptimal = solution[1] if(newoptimal<optimal) optimal = newoptimal matchtoremove << match match = newmatch else matchtoremove << newmatch end end count += 1 end end newmatches = Array.new @matches.each do |match| found = false matchtoremove.each do |item| if(match[0]==item) found = true break end end if(!found) newmatches << match end end @matches = newmatches end end end ")
Update cache object with current object parameters
method: 0 -> exact, 1 -> hamming, 2 -> edit
# File lib/cassiopee.rb, line 657 def updateCache(method,errors) @cache.file_suffix = @file_suffix @cache.min_position = @min_position @cache.max_position = @max_position @cache.method = method @cache.errors = errors end