Package Gnumed :: Package pycommon :: Module gmMatchProvider
[frames | no frames]

Source Code for Module Gnumed.pycommon.gmMatchProvider

  1  """Base classes for match providers. 
  2   
  3  They are used by business objects to give 
  4  phrasewheels the ability to guess phrases. 
  5   
  6  Copyright (C) GNUMed developers 
  7  license: GPL 
  8  """ 
  9  __version__ = "$Revision: 1.34 $" 
 10  __author__  = "K.Hilbert <Karsten.Hilbert@gmx.net>, I.Haywood <ihaywood@gnu.org>, S.J.Tan <sjtan@bigpond.com>" 
 11   
 12  # std lib 
 13  import string, types, time, sys, re as regex, logging 
 14   
 15   
 16  # GNUmed 
 17  from Gnumed.pycommon import gmPG2 
 18   
 19   
 20  _log = logging.getLogger('gm.ui') 
 21  _log.info(__version__) 
 22   
 23   
 24  default_ignored_chars = "[?!.'\\(){}\[\]<>~#*$%^_]+" + '"' 
 25  default_word_separators = '[- \t=+&:@]+' 
 26  #============================================================ 
class cMatchProvider(object):
    """Base class for match providing objects.

    Match sources might be:
    - database tables
    - flat files
    - previous input
    - config files
    - in-memory list created on the fly

    Subclasses implement getAllMatches() and the getMatchesBy*()
    methods. getMatches() normalizes the fragment and picks one of
    them based on the fragment length and the configured thresholds.
    """
    print_queries = False
    #--------------------------------------------------------
    def __init__(self):
        self.setThresholds()

        # values for context placeholders, keyed by placeholder name
        self._context_vals = {}
        self.__ignored_chars = regex.compile(default_ignored_chars)
        # used to normalize word boundaries:
        self.__word_separators = regex.compile(default_word_separators)
    #--------------------------------------------------------
    # actions
    #--------------------------------------------------------
    def getMatches(self, aFragment = None):
        """Return matches according to aFragment and matching thresholds.

        The fragment is lower-cased, stripped of ignored characters and
        has its word separators normalized to single blanks before its
        length is compared against the thresholds.

        Returns whatever the selected getMatchesBy*()/getAllMatches()
        implementation returns, or (False, []) if the normalized
        fragment is shorter than the phrase threshold.

        Raises ValueError if aFragment is None.

        FIXME: design decision: we dont worry about data source changes
               during the lifetime of a MatchProvider
        FIXME: append _("*get all items*") on truncation
        """
        # sanity check
        if aFragment is None:
            raise ValueError('Cannot find matches without a fragment.')

        # user explicitly wants all matches
        if aFragment == u'*':
            return self.getAllMatches()

        # case insensitivity
        tmpFragment = aFragment.lower()
        # remove ignored chars
        if self.__ignored_chars is not None:
            tmpFragment = self.__ignored_chars.sub('', tmpFragment)
        # normalize word separators
        if self.__word_separators is not None:
            tmpFragment = u' '.join(self.__word_separators.split(tmpFragment))
        # length in number of significant characters only
        lngFragment = len(tmpFragment)

        # order is important: most permissive search first
        if lngFragment >= self.__threshold_substring:
            return self.getMatchesBySubstr(tmpFragment)
        if lngFragment >= self.__threshold_word:
            return self.getMatchesByWord(tmpFragment)
        if lngFragment >= self.__threshold_phrase:
            return self.getMatchesByPhrase(tmpFragment)
        return (False, [])
    #--------------------------------------------------------
    def getAllMatches(self):
        """Return all available items. Must be overridden."""
        raise NotImplementedError
    #--------------------------------------------------------
    def getMatchesByPhrase(self, aFragment):
        """Return items matching aFragment at the start of the phrase. Must be overridden."""
        raise NotImplementedError
    #--------------------------------------------------------
    def getMatchesByWord(self, aFragment):
        """Return items matching aFragment at the start of any word. Must be overridden."""
        raise NotImplementedError
    #--------------------------------------------------------
    def getMatchesBySubstr(self, aFragment):
        """Return items containing aFragment anywhere. Must be overridden."""
        raise NotImplementedError
    #--------------------------------------------------------
    # configuration
    #--------------------------------------------------------
    def setThresholds(self, aPhrase = 1, aWord = 3, aSubstring = 5):
        """Set match location thresholds.

        - the fragment passed to getMatches() must contain at least this many
          characters before it triggers a match search at:
          1) phrase_start - start of phrase (first word)
          2) word_start - start of any word within phrase
          3) in_word - _inside_ any word within phrase

        Returns True if the thresholds were applied, False (after
        logging an error, thresholds untouched) if they are not
        ordered aPhrase <= aWord <= aSubstring.
        """
        # sanity checks
        if aSubstring < aWord:
            _log.error(
                'Setting substring threshold (%s) lower than word-start threshold (%s) does not make sense. Retaining original thresholds (%s:%s, respectively).',
                aSubstring, aWord, self.__threshold_substring, self.__threshold_word
            )
            return False
        if aWord < aPhrase:
            # report the two values actually being compared here
            _log.error(
                'Setting word-start threshold (%s) lower than phrase-start threshold (%s) does not make sense. Retaining original thresholds (%s:%s, respectively).',
                aWord, aPhrase, self.__threshold_word, self.__threshold_phrase
            )
            return False

        # now actually reassign thresholds
        self.__threshold_phrase = aPhrase
        self.__threshold_word = aWord
        self.__threshold_substring = aSubstring

        return True
    #--------------------------------------------------------
    def _set_word_separators(self, word_separators=None):
        # None disables word separator normalization
        if word_separators is None:
            self.__word_separators = None
        else:
            self.__word_separators = regex.compile(word_separators)

    def _get_word_separators(self):
        if self.__word_separators is None:
            return None
        return self.__word_separators.pattern

    # regex source (string) of the word separators, or None if disabled
    word_separators = property(_get_word_separators, _set_word_separators)
    #--------------------------------------------------------
    def _set_ignored_chars(self, ignored_chars=None):
        # None disables removal of ignored characters
        if ignored_chars is None:
            self.__ignored_chars = None
        else:
            self.__ignored_chars = regex.compile(ignored_chars)

    def _get_ignored_chars(self):
        if self.__ignored_chars is None:
            return None
        return self.__ignored_chars.pattern

    # regex source (string) of the ignored characters, or None if disabled
    ignored_chars = property(_get_ignored_chars, _set_ignored_chars)
    #--------------------------------------------------------
    def set_context (self, context=None, val=None):
        """Set value to provide context information for matches.

        The matching code may ignore it depending on its exact
        implementation. Names and values of the context depend
        on what is being matched.

        <context> -- the *placeholder* key *inside* the context
                     definition, not the context *definition* key

        Returns False if context is None, True otherwise.
        """
        if context is None:
            return False
        self._context_vals[context] = val
        return True
    #--------------------------------------------------------
    def unset_context(self, context=None):
        """Remove a context value; unknown keys are silently ignored."""
        try:
            del self._context_vals[context]
        except KeyError:
            pass
169 #------------------------------------------------------------ 170 # usable instances 171 #------------------------------------------------------------
class cMatchProvider_FixedList(cMatchProvider):
    """Match provider where all possible options can be held
    in a reasonably sized, pre-allocated list.

    Each item is a dict. The matching code reads the key
    'list_label' (compared case-insensitively against the fragment)
    and the key 'weight' (results are ordered by descending weight).
    """
    def __init__(self, aSeq = None):
        """aSeq must be a list or tuple of dicts, each holding at
        least the keys 'list_label' and 'weight'.

        Raises TypeError if aSeq is neither a list nor a tuple.
        """
        if not isinstance(aSeq, (list, tuple)):
            _log.error('fixed list match provider argument must be a list or tuple of dicts')
            raise TypeError('fixed list match provider argument must be a list or tuple of dicts')

        self.__items = aSeq
        cMatchProvider.__init__(self)
    #--------------------------------------------------------
    # internal matching algorithms
    #
    # if we end up here:
    #	- aFragment will not be "None"
    #   - aFragment will be lower case
    #	- we _do_ deliver matches (whether we find any is a different story)
    #--------------------------------------------------------
    def getMatchesByPhrase(self, aFragment):
        """Return matches for aFragment at start of phrases."""
        matches = [
            item for item in self.__items
            if item['list_label'].lower().startswith(aFragment)
        ]
        return self.__ranked(matches)
    #--------------------------------------------------------
    def getMatchesByWord(self, aFragment):
        """Return matches for aFragment at start of words inside phrases.

        A word start is either the start of the label or any position
        directly preceded by a blank. Every occurrence of the fragment
        is considered, not just the first one.
        """
        word_start = u' ' + aFragment
        matches = []
        for item in self.__items:
            label = item['list_label'].lower()
            if label.startswith(aFragment) or (word_start in label):
                matches.append(item)
        return self.__ranked(matches)
    #--------------------------------------------------------
    def getMatchesBySubstr(self, aFragment):
        """Return matches for aFragment as a true substring."""
        matches = [
            item for item in self.__items
            if aFragment in item['list_label'].lower()
        ]
        return self.__ranked(matches)
    #--------------------------------------------------------
    def getAllMatches(self):
        """Return all items.

        The stored sequence is left untouched, a sorted copy is
        returned (which also makes this work for tuples).
        """
        return self.__ranked(self.__items)
    #--------------------------------------------------------
    def set_items(self, items):
        """items must be a list or tuple of dicts, each holding at
        least the keys 'list_label' and 'weight'."""
        self.__items = items
    #--------------------------------------------------------
    def __ranked(self, matches):
        """Turn a sequence of matching items into the (found, matches) result.

        Returns (False, []) for an empty sequence, else (True, <new
        list ordered by descending weight>). The sort is stable, so
        items of equal weight keep their relative order.
        """
        if len(matches) == 0:
            return (False, [])
        ranked = sorted(matches, key = lambda item: item['weight'], reverse = True)
        return (True, ranked)
    #--------------------------------------------------------
    def __cmp_items(self, item1, item2):
        """Compare items based on weight (heavier items sort first).

        Kept for code still using cmp-style sorting, the methods of
        this class order their results via a sort key instead.
        """
        # do it the wrong way round to do sorting/reversing at once
        if item1['weight'] < item2['weight']:
            return 1
        if item1['weight'] > item2['weight']:
            return -1
        return 0
267 # ===========================================================
class cMatchProvider_Func(cMatchProvider):
    """Match provider which searches matches
    in the results of a function call.

    The candidates function is called anew for every search.
    """
    def __init__(self, get_candidates = None):
        """get_candidates() must return a list of dicts, each holding
        at least the key 'list_label' (which is what the fragment is
        compared against).

        Raises ValueError if get_candidates is None.
        """
        if get_candidates is None:
            _log.error('must define function to retrieve match candidates list')
            raise ValueError('must define function to retrieve match candidates list')

        self._get_candidates = get_candidates
        cMatchProvider.__init__(self)
    #--------------------------------------------------------
    # internal matching algorithms
    #
    # if we end up here:
    #	- aFragment will not be "None"
    #   - aFragment will be lower case
    #	- we _do_ deliver matches (whether we find any is a different story)
    #--------------------------------------------------------
    def getMatchesByPhrase(self, aFragment):
        """Return matches for aFragment at start of phrases."""
        _log.debug('getting phrase matches')
        # the *label* must start with the fragment, not vice versa
        matches = [
            candidate for candidate in self._get_candidates()
            if candidate['list_label'].lower().startswith(aFragment)
        ]
        return self.__as_result(matches)
    #--------------------------------------------------------
    def getMatchesByWord(self, aFragment):
        """Return matches for aFragment at start of words inside phrases.

        A word start is either the start of the label or any position
        directly preceded by a blank. Labels not containing the
        fragment at all never match.
        """
        _log.debug('getting word matches')
        # FIXME: use word seps
        word_start = u' ' + aFragment
        matches = []
        for candidate in self._get_candidates():
            label = candidate['list_label'].lower()
            if label.startswith(aFragment) or (word_start in label):
                matches.append(candidate)
        return self.__as_result(matches)
    #--------------------------------------------------------
    def getMatchesBySubstr(self, aFragment):
        """Return matches for aFragment as a true substring."""
        matches = [
            candidate for candidate in self._get_candidates()
            if aFragment in candidate['list_label'].lower()
        ]
        return self.__as_result(matches)
    #--------------------------------------------------------
    def getAllMatches(self):
        """Return all candidates as a (found, matches) tuple,
        just like the getMatchesBy*() methods do."""
        return self.__as_result(self._get_candidates())
    #--------------------------------------------------------
    def __as_result(self, matches):
        """Wrap matches into the (found, matches) tuple callers expect.

        Candidates are returned in the order get_candidates()
        delivered them.
        """
        if not matches:
            return (False, [])
        return (True, matches)
    #--------------------------------------------------------
    def __cmp_candidates(self, candidate1, candidate2):
        """naive ordering: all candidates are considered equal"""
        # FIXME: do ordering
        return 0

# ===========================================================
class cMatchProvider_SQL2(cMatchProvider):
    """Match provider which searches matches
    in possibly several database tables.

    queries:
        - a list of unicode strings
        - each string is a query
        - each string must contain: "... where <column> %(fragment_condition)s ..."
        - each string can contain in the where clause: "... %(<context_key>)s ..."
        - each query must return the columns "data" and "list_label"
          (or deliver them as the first and second column) and may
          return "field_label" (or a third column); without it the
          list label is reused as field label

    context definitions to be used in the queries
    example: {'ctxt_country': {'where_part': 'and country = %(country)s', 'placeholder': 'country'}}
    """
    def __init__(self, queries = None, context = None):
        """queries -- one query or a list of queries, see class docstring
        context -- dict of context definitions, see class docstring
        """
        if not isinstance(queries, list):
            queries = [queries]

        self._queries = queries

        if context is None:
            self._context = {}
        else:
            self._context = context

        # arguments handed to the database layer along with each query
        self._args = {}
        cMatchProvider.__init__(self)
    #--------------------------------------------------------
    # internal matching algorithms
    #
    # if we end up here:
    #	- aFragment will not be "None"
    #   - aFragment will be lower case
    #	- we _do_ deliver matches (whether we find any is a different story)
    #--------------------------------------------------------
    def getMatchesByPhrase(self, aFragment):
        """Return matches for aFragment at start of phrases."""

        fragment_condition = u"ILIKE %(fragment)s"
        self._args['fragment'] = u"%s%%" % aFragment

        return self._find_matches(fragment_condition)
    #--------------------------------------------------------
    def getMatchesByWord(self, aFragment):
        """Return matches for aFragment at start of words inside phrases."""

        fragment_condition = u"~* %(fragment)s"
        aFragment = gmPG2.sanitize_pg_regex(expression = aFragment, escape_all = False)
        # either preceded by a blank or at the very start
        self._args['fragment'] = u"( %s)|(^%s)" % (aFragment, aFragment)

        return self._find_matches(fragment_condition)
    #--------------------------------------------------------
    def getMatchesBySubstr(self, aFragment):
        """Return matches for aFragment as a true substring."""

        fragment_condition = u"ILIKE %(fragment)s"
        self._args['fragment'] = u"%%%s%%" % aFragment

        return self._find_matches(fragment_condition)
    #--------------------------------------------------------
    def getAllMatches(self):
        """Return all items."""
        return self.getMatchesBySubstr(u'')
    #--------------------------------------------------------
    def _find_matches(self, fragment_condition):
        """Run the queries in order and return the first non-empty result.

        fragment_condition -- SQL snippet substituted for
            %(fragment_condition)s in each query

        Returns (True, <list of match dicts>) for the first query
        delivering rows, (False, []) if no query delivered any.

        A query which fails to run is logged and removed from the
        list of queries, the search is then aborted with (False, []).
        """
        if self.print_queries:
            print("----------------------")

        for query in self._queries:
            where_fragments = {'fragment_condition': fragment_condition}

            for context_key, context_def in self._context.items():
                try:
                    placeholder = context_def['placeholder']
                    where_part = context_def['where_part']
                    self._args[placeholder] = self._context_vals[placeholder]
                    # we do have a context value for this key, so add the where condition
                    where_fragments[context_key] = where_part
                    if self.print_queries:
                        print("ctxt ph: %s" % (placeholder,))
                        print("ctxt where: %s" % (where_part,))
                        print("ctxt val: %s" % (self._context_vals[placeholder],))
                except KeyError:
                    # we don't have a context value for this key, so skip the where condition
                    where_fragments[context_key] = u''

            cmd = query % where_fragments

            if self.print_queries:
                print("class: %s" % (self.__class__.__name__,))
                print("ctxt: %s" % (self._context_vals,))
                print("args: %s" % (self._args,))
                print("query: %s" % (cmd,))

            try:
                rows, idx = gmPG2.run_ro_queries(queries = [{'cmd': cmd, 'args': self._args}])
            except Exception:
                _log.exception('[%s]: error running match provider SQL, dropping query', self.__class__.__name__)
                # leaving the loop right away makes it safe to
                # modify the list being iterated over
                self._queries.remove(query)
                break

            # no matches found: try next query
            if len(rows) == 0:
                continue

            return (True, [ self.__row2match(row) for row in rows ])

        # none found whatsoever
        return (False, [])
    #--------------------------------------------------------
    def __row2match(self, row):
        """Convert one result row into a match dict.

        Columns are looked up by name first and by position as a
        fallback. The resulting dict has the keys 'weight' (always 0),
        'data', 'list_label' and 'field_label'.
        """
        match = {'weight': 0}

        try:
            match['data'] = row['data']
        except KeyError:
            match['data'] = row[0]

        try:
            match['list_label'] = row['list_label']
        except KeyError:
            match['list_label'] = row[1]

        # explicit "field_label" in result ?
        try:
            match['field_label'] = row['field_label']
        # no
        except KeyError:
            # but does row[2] exist ?
            try:
                match['field_label'] = row[2]
            # no: reuse "list_label"
            except IndexError:
                match['field_label'] = match['list_label']

        return match
499 #================================================================ 500 if __name__ == '__main__': 501 pass 502 503 #================================================================ 504