| Trees | Indices | Help |
|
|---|
|
|
1 """Base classes for match providers.
2
3 They are used by business objects to give
4 phrasewheels the ability to guess phrases.
5
6 Copyright (C) GNUMed developers
7 license: GPL
8 """
9 ############################################################################
10 # $Source: /cvsroot/gnumed/gnumed/gnumed/client/pycommon/gmMatchProvider.py,v $
11 # $Id: gmMatchProvider.py,v 1.34 2009/12/21 15:02:17 ncq Exp $
12 __version__ = "$Revision: 1.34 $"
13 __author__ = "K.Hilbert <Karsten.Hilbert@gmx.net>, I.Haywood <ihaywood@gnu.org>, S.J.Tan <sjtan@bigpond.com>"
14
15 # std lib
16 import string, types, time, sys, re as regex, logging
17
18
19 # GNUmed
20 from Gnumed.pycommon import gmPG2
21
22
23 _log = logging.getLogger('gm.ui')
24 _log.info(__version__)
25
26
27 default_ignored_chars = "[?!.'\\(){}\[\]<>~#*$%^_]+" + '"'
28 default_word_separators = '[- \t=+&:@]+'
29 #============================================================
31 """Base class for match providing objects.
32
33 Match sources might be:
34 - database tables
35 - flat files
36 - previous input
37 - config files
38 - in-memory list created on the fly
39 """
40 print_queries = False
41 #--------------------------------------------------------
43 self.setThresholds()
44
45 self._context_vals = {}
46 self.__ignored_chars = regex.compile(default_ignored_chars)
47 self.__word_separators = regex.compile(default_word_separators)
48 #--------------------------------------------------------
49 # actions
#--------------------------------------------------------
# NOTE(review): the "def" line of this method is missing from this listing;
# the header is reconstructed from the body and from the "getMatches()"
# reference in the threshold documentation -- confirm against the real file.
def getMatches(self, aFragment=None):
    """Return matches according to aFragment and matching thresholds.

    The fragment is lower-cased, stripped of ignored characters and has
    each run of word separators replaced by a single blank. The length
    of the normalized fragment then selects the widest search it
    qualifies for: substring, word-start or phrase-start matching.

    aFragment -- the text to match; u'*' explicitly requests all items

    Returns the result of the selected matcher, or (False, []) when the
    normalized fragment is shorter than every threshold.

    Raises ValueError if aFragment is None.

    FIXME: design decision: we don't worry about data source changes
           during the lifetime of a MatchProvider
    FIXME: append _("*get all items*") on truncation
    """
    # sanity check (call form of raise: valid on Python 2 and 3)
    if aFragment is None:
        raise ValueError('Cannot find matches without a fragment.')

    # user explicitly wants all matches
    if aFragment == u'*':
        return self.getAllMatches()

    # case insensitivity
    normalized = aFragment.lower()
    # remove ignored chars
    if self.__ignored_chars is not None:
        normalized = self.__ignored_chars.sub('', normalized)
    # normalize word separators
    if self.__word_separators is not None:
        normalized = u' '.join(self.__word_separators.split(normalized))
    # length in number of significant characters only
    significant_length = len(normalized)

    # order is important: the highest threshold must be tested first
    if significant_length >= self.__threshold_substring:
        return self.getMatchesBySubstr(normalized)
    if significant_length >= self.__threshold_word:
        return self.getMatchesByWord(normalized)
    if significant_length >= self.__threshold_phrase:
        return self.getMatchesByPhrase(normalized)
    return (False, [])
86 #--------------------------------------------------------
89 #--------------------------------------------------------
92 #--------------------------------------------------------
95 #--------------------------------------------------------
98 #--------------------------------------------------------
99 # configuration
#--------------------------------------------------------
# NOTE(review): the "def" line of this method is missing from this listing.
# The name comes from the self.setThresholds() call in the constructor and
# the parameter names from the body; the parameter ORDER and the DEFAULTS
# (1/3/5) are not visible here -- confirm against the real file.
def setThresholds(self, aPhrase=1, aWord=3, aSubstring=5):
    """Set match location thresholds.

    - the fragment passed to getMatches() must contain at least this many
      characters before it triggers a match search at:
      1) phrase_start - start of phrase (first word)
      2) word_start - start of any word within phrase
      3) in_word - _inside_ any word within phrase

    aPhrase    -- minimum length for phrase-start matching
    aWord      -- minimum length for word-start matching
    aSubstring -- minimum length for substring matching

    Returns True if the thresholds were applied. Returns False, leaving
    the current thresholds untouched, unless
    aPhrase <= aWord <= aSubstring.
    """
    # The thresholds do not exist yet when this runs for the first time,
    # so the error messages must not assume they can be read.
    try:
        current_phrase = self.__threshold_phrase
        current_word = self.__threshold_word
        current_substring = self.__threshold_substring
    except AttributeError:
        current_phrase = current_word = current_substring = None

    # sanity checks
    if aSubstring < aWord:
        _log.error(
            'Setting substring threshold (%s) lower than word-start threshold (%s) does not make sense. Retaining original thresholds (%s:%s, respectively).',
            aSubstring, aWord, current_substring, current_word
        )
        return False
    if aWord < aPhrase:
        # report the two values actually being compared (word vs phrase)
        _log.error(
            'Setting word-start threshold (%s) lower than phrase-start threshold (%s) does not make sense. Retaining original thresholds (%s:%s, respectively).',
            aWord, aPhrase, current_word, current_phrase
        )
        return False

    # now actually reassign thresholds
    self.__threshold_phrase = aPhrase
    self.__threshold_word = aWord
    self.__threshold_substring = aSubstring

    return True
124 #--------------------------------------------------------
126 if word_separators is None:
127 self.__word_separators = None
128 else:
129 self.__word_separators = regex.compile(word_separators)
130
135
136 word_separators = property(_get_word_separators, _set_word_separators)
137 #--------------------------------------------------------
139 if ignored_chars is None:
140 self.__ignored_chars = None
141 else:
142 self.__ignored_chars = regex.compile(ignored_chars)
143
148
149 ignored_chars = property(_get_ignored_chars, _set_ignored_chars)
150 #--------------------------------------------------------
152 """Set value to provide context information for matches.
153
154 The matching code may ignore it depending on its exact
155 implementation. Names and values of the context depend
156 on what is being matched.
157
158 <context> -- the *placeholder* key *inside* the context
159 definition, not the context *definition* key
160 """
161 if context is None:
162 return False
163 self._context_vals[context] = val
164 return True
165 #--------------------------------------------------------
171 #------------------------------------------------------------
172 # usable instances
173 #------------------------------------------------------------
175 """Match provider where all possible options can be held
176 in a reasonably sized, pre-allocated list.
177 """
179 """aSeq must be a list of dicts. Each dict must have the keys (data, label, weight)
180 """
181 if not type(aSeq) in [types.ListType, types.TupleType]:
182 _log.error('fixed list match provider argument must be a list or tuple of dicts')
183 raise TypeError('fixed list match provider argument must be a list or tuple of dicts')
184
185 self.__items = aSeq
186 cMatchProvider.__init__(self)
187 #--------------------------------------------------------
188 # internal matching algorithms
189 #
190 # if we end up here:
191 # - aFragment will not be "None"
192 # - aFragment will be lower case
193 # - we _do_ deliver matches (whether we find any is a different story)
#--------------------------------------------------------
# NOTE(review): the "def" line is missing from this listing; the header is
# reconstructed from the self.getMatchesByPhrase(<fragment>) call made by
# getMatches() -- confirm against the real file.
def getMatchesByPhrase(self, aFragment):
    """Return matches for aFragment at start of phrases.

    aFragment -- the fragment, already lower-cased by the caller

    Returns (True, <items>) with every item whose lower-cased label
    starts with aFragment, ordered by self.__cmp_items, or (False, [])
    when nothing matches.
    """
    # sort(cmp) is Python 2 only, cmp_to_key works on 2.7 and 3.x
    from functools import cmp_to_key

    matches = [
        item for item in self.__items
        if item['label'].lower().startswith(aFragment)
    ]
    # no matches found
    if not matches:
        return (False, [])

    matches.sort(key=cmp_to_key(self.__cmp_items))
    return (True, matches)
#--------------------------------------------------------
# NOTE(review): the "def" line is missing from this listing; the header is
# reconstructed from the self.getMatchesByWord(<fragment>) call made by
# getMatches() -- confirm against the real file.
def getMatchesByWord(self, aFragment):
    """Return matches for aFragment at start of words inside phrases.

    aFragment -- the fragment, already lower-cased by the caller

    An item matches when its lower-cased label starts with aFragment or
    contains aFragment directly after a blank. Every occurrence is
    considered, not just the first one, so a label like u'toucan can'
    matches u'can'.

    Returns (True, <items>) ordered by self.__cmp_items, or (False, [])
    when nothing matches.
    """
    # sort(cmp) is Python 2 only, cmp_to_key works on 2.7 and 3.x
    from functools import cmp_to_key

    word_start = u' ' + aFragment
    matches = []
    for item in self.__items:
        label = item['label'].lower()
        # found at start of phrase or at start of any later word
        if label.startswith(aFragment) or (word_start in label):
            matches.append(item)
    # no matches found
    if not matches:
        return (False, [])

    matches.sort(key=cmp_to_key(self.__cmp_items))
    return (True, matches)
#--------------------------------------------------------
# NOTE(review): the "def" line is missing from this listing; the header is
# reconstructed from the self.getMatchesBySubstr(<fragment>) call made by
# getMatches() -- confirm against the real file.
def getMatchesBySubstr(self, aFragment):
    """Return matches for aFragment as a true substring.

    aFragment -- the fragment, already lower-cased by the caller

    Returns (True, <items>) with every item whose lower-cased label
    contains aFragment anywhere, ordered by self.__cmp_items, or
    (False, []) when nothing matches.
    """
    # sort(cmp) is Python 2 only, cmp_to_key works on 2.7 and 3.x
    from functools import cmp_to_key

    matches = [
        item for item in self.__items
        if aFragment in item['label'].lower()
    ]
    # no matches found
    if not matches:
        return (False, [])

    matches.sort(key=cmp_to_key(self.__cmp_items))
    return (True, matches)
#--------------------------------------------------------
# NOTE(review): the "def" line is missing from this listing; the header is
# reconstructed from the self.getAllMatches() call made by getMatches()
# -- confirm against the real file.
def getAllMatches(self):
    """Return all items.

    Returns (True, <items>) holding every stored item ordered by
    self.__cmp_items, or (False, []) when there are no items.

    The result is a new list. The stored sequence is neither reordered
    nor handed out, and it may be a tuple (which the constructor accepts
    but which has no sort() method).
    """
    # sort(cmp) is Python 2 only, cmp_to_key works on 2.7 and 3.x
    from functools import cmp_to_key

    matches = list(self.__items)
    # no matches found
    if not matches:
        return (False, [])

    matches.sort(key=cmp_to_key(self.__cmp_items))
    return (True, matches)
254 #--------------------------------------------------------
256 """items must be a list of dicts. Each dict must have the keys (data, label, weight)"""
257 self.__items = items
258 #--------------------------------------------------------
269 # ===========================================================
271 """Match provider which searches matches
272 in the results of a function call.
273 """
# NOTE(review): the "def" line is missing from this listing; the header is
# reconstructed from the body (the None default is an assumption suggested
# by the "is None" check) -- confirm against the real file.
def __init__(self, get_candidates=None):
    """Set up a provider which pulls its candidates from a function.

    get_candidates -- callable invoked without arguments on every
        search; the matchers index each candidate as
        candidate['label'], so it must return a list of dicts having
        a 'label' key (not plain strings)

    Raises ValueError if get_candidates is None.
    """
    if get_candidates is None:
        _log.error('must define function to retrieve match candidates list')
        # "ArgumentError" is not defined anywhere in this module, raising
        # it would surface as a NameError instead of the intended message
        raise ValueError('must define function to retrieve match candidates list')

    self._get_candidates = get_candidates
    cMatchProvider.__init__(self)
282 #--------------------------------------------------------
283 # internal matching algorithms
284 #
285 # if we end up here:
286 # - aFragment will not be "None"
287 # - aFragment will be lower case
288 # - we _do_ deliver matches (whether we find any is a different story)
#--------------------------------------------------------
# NOTE(review): the "def" line is missing from this listing; the header is
# reconstructed from the self.getMatchesByPhrase(<fragment>) call made by
# getMatches() -- confirm against the real file.
def getMatchesByPhrase(self, aFragment):
    """Return matches for aFragment at start of phrases.

    aFragment -- the fragment, already lower-cased by the caller

    Returns (True, <candidates>) with every candidate whose lower-cased
    label starts with aFragment, ordered by self.__cmp_candidates, or
    (False, []) when nothing matches.
    """
    # sort(cmp) is Python 2 only, cmp_to_key works on 2.7 and 3.x
    from functools import cmp_to_key

    # the label must start with the fragment, not the other way round
    matches = [
        candidate for candidate in self._get_candidates()
        if candidate['label'].lower().startswith(aFragment)
    ]
    # no matches found
    if not matches:
        return (False, [])

    matches.sort(key=cmp_to_key(self.__cmp_candidates))
    return (True, matches)
#--------------------------------------------------------
# NOTE(review): the "def" line is missing from this listing; the header is
# reconstructed from the self.getMatchesByWord(<fragment>) call made by
# getMatches() -- confirm against the real file.
def getMatchesByWord(self, aFragment):
    """Return matches for aFragment at start of words inside phrases.

    aFragment -- the fragment, already lower-cased by the caller

    A candidate matches when its lower-cased label starts with aFragment
    or contains aFragment directly after a blank. Labels which do not
    contain the fragment at all never match.

    Returns (True, <candidates>) ordered by self.__cmp_candidates, or
    (False, []) when nothing matches.
    """
    # sort(cmp) is Python 2 only, cmp_to_key works on 2.7 and 3.x
    from functools import cmp_to_key

    # FIXME: use word seps
    word_start = u' ' + aFragment
    matches = []
    for candidate in self._get_candidates():
        label = candidate['label'].lower()
        # Testing containment avoids inspecting label[pos-1] when find()
        # returns -1, and also sees occurrences after the first one.
        if label.startswith(aFragment) or (word_start in label):
            matches.append(candidate)
    # no matches found
    if not matches:
        return (False, [])

    matches.sort(key=cmp_to_key(self.__cmp_candidates))
    return (True, matches)
#--------------------------------------------------------
# NOTE(review): the "def" line is missing from this listing; the header is
# reconstructed from the self.getMatchesBySubstr(<fragment>) call made by
# getMatches() -- confirm against the real file.
def getMatchesBySubstr(self, aFragment):
    """Return matches for aFragment as a true substring.

    aFragment -- the fragment, already lower-cased by the caller

    Returns (True, <candidates>) with every candidate whose lower-cased
    label contains aFragment anywhere, ordered by
    self.__cmp_candidates, or (False, []) when nothing matches.
    """
    # sort(cmp) is Python 2 only, cmp_to_key works on 2.7 and 3.x
    from functools import cmp_to_key

    matches = [
        candidate for candidate in self._get_candidates()
        if aFragment in candidate['label'].lower()
    ]
    # no matches found
    if not matches:
        return (False, [])

    matches.sort(key=cmp_to_key(self.__cmp_candidates))
    return (True, matches)
343 #--------------------------------------------------------
347 #--------------------------------------------------------
351 # FIXME: do ordering
352 # if candidate1 < candidate2:
353 # return -1
354 # if candidate1 == candidate2:
355 # return 0
356 # return 1
357
358 # ===========================================================
360 """Match provider which searches matches
361 in possibly several database tables.
362
363 queries:
364 - a list of unicode strings
365 - each string is a query
366 - each string must contain: "... where <column> %(fragment_condition)s ..."
367 - each string can contain in the where clause: "... %(<context_key>)s ..."
368
369 context definitions to be used in the queries
370 example: {'ctxt_country': {'where_part': 'and country = %(country)s', 'placeholder': 'country'}}
371 """
373 if type(queries) != types.ListType:
374 queries = [queries]
375
376 self._queries = queries
377
378 if context is None:
379 self._context = {}
380 else:
381 self._context = context
382
383 self._args = {}
384 cMatchProvider.__init__(self)
385 #--------------------------------------------------------
386 # internal matching algorithms
387 #
388 # if we end up here:
389 # - aFragment will not be "None"
390 # - aFragment will be lower case
391 # - we _do_ deliver matches (whether we find any is a different story)
392 #--------------------------------------------------------
394 """Return matches for aFragment at start of phrases."""
395 fragment_condition = u"ilike %(fragment)s"
396 self._args['fragment'] = u"%s%%" % aFragment
397 return self.__find_matches(fragment_condition)
398 #--------------------------------------------------------
400 """Return matches for aFragment at start of words inside phrases."""
401 fragment_condition = u"~* %(fragment)s"
402 aFragment = gmPG2.sanitize_pg_regex(expression = aFragment, escape_all = False)
403 self._args['fragment'] = u"( %s)|(^%s)" % (aFragment, aFragment)
404 return self.__find_matches(fragment_condition)
405 #--------------------------------------------------------
407 """Return matches for aFragment as a true substring."""
408 fragment_condition = u"ilike %(fragment)s"
409 self._args['fragment'] = u"%%%s%%" % aFragment
410 return self.__find_matches(fragment_condition)
411 #--------------------------------------------------------
#--------------------------------------------------------
# NOTE(review): the "def" line is missing from this listing; the header is
# reconstructed from the self.__find_matches(fragment_condition) calls made
# by the matchers of this class -- confirm against the real file.
def __find_matches(self, fragment_condition):
    """Run the configured queries until one of them yields rows.

    fragment_condition -- SQL snippet substituted for the
        %(fragment_condition)s placeholder of each query; the value it
        refers to is expected in self._args['fragment']

    For each context definition the 'where_part' is substituted into
    the query only if a context value was set for its 'placeholder';
    otherwise an empty string is substituted.

    Returns (True, <matches>) for the first query delivering rows, each
    match being {'data': <column 0>, 'label': <column 1>, 'weight': 0}.
    Returns (False, []) if no query delivers rows. A query which raises
    is logged and removed from self._queries, and the search stops.
    """
    for query in self._queries:
        where_fragments = {'fragment_condition': fragment_condition}

        for context_key, context_def in self._context.items():
            try:
                placeholder = context_def['placeholder']
                where_part = context_def['where_part']
                self._args[placeholder] = self._context_vals[placeholder]
                # we do have a context value for this key, so add the where condition
                where_fragments[context_key] = where_part
            except KeyError:
                # we don't have a context value for this key, so skip the where condition
                where_fragments[context_key] = u''

        cmd = query % where_fragments

        if self.print_queries:
            print(self.__class__.__name__)
            print(self._context_vals)
            print(self._args)
            print(cmd)

        try:
            rows, idx = gmPG2.run_ro_queries(queries=[{'cmd': cmd, 'args': self._args}])
        except Exception:
            # not a bare "except:" so KeyboardInterrupt/SystemExit still propagate
            _log.exception('Error running match provider SQL, dropping query.')
            self._queries.remove(query)
            # the list being iterated was just modified, so stop here
            break

        # no matches found: try next query
        if len(rows) == 0:
            continue

        return (True, [
            {'data': row[0], 'label': row[1], 'weight': 0}
            for row in rows
        ])

    # none found whatsoever
    return (False, [])
458 #================================================================
459 if __name__ == '__main__':
460 pass
461
462 #================================================================
463 # $Log: gmMatchProvider.py,v $
464 # Revision 1.34 2009/12/21 15:02:17 ncq
465 # - fix typo
466 #
467 # Revision 1.33 2009/04/05 17:58:27 ncq
468 # - improved docs
469 #
470 # Revision 1.32 2009/04/03 09:34:06 ncq
471 # - some exception cleanup
472 #
473 # Revision 1.31 2009/03/01 18:07:14 ncq
474 # - factor out default ignored chars/word separators onto module level
475 #
476 # Revision 1.30 2009/01/21 22:34:09 ncq
477 # - make FixedList match provider work nicely again
478 #
479 # Revision 1.29 2008/06/16 15:02:35 ncq
480 # - cleanup
481 # - remove unneeded methods
482 #
483 # Revision 1.28 2008/06/15 20:31:10 ncq
484 # - make match provider derive from object
485 # - turn ignored chars and word separators into properties
486 # - raise NotImplementedError in base match provider
487 # - remove dis/enableMatching
488 #
489 # Revision 1.27 2008/06/09 15:28:21 ncq
490 # - .print_queries and support it in sql provider
491 #
492 # Revision 1.26 2008/04/29 18:29:29 ncq
493 # - remove increaseScore
494 #
495 # Revision 1.25 2007/12/12 16:17:15 ncq
496 # - better logger names
497 #
498 # Revision 1.24 2007/12/11 14:31:11 ncq
499 # - use std logging
500 #
501 # Revision 1.23 2007/12/02 20:59:13 ncq
502 # - drop failing queries
503 #
504 # Revision 1.22 2007/07/03 15:57:24 ncq
505 # - use gmPG2.sanitize_pg_regex()
506 # - ignore failing match retrieval queries such
507 # that we don't freak out in the phrasewheel
508 #
509 # Revision 1.21 2007/01/07 23:02:11 ncq
510 # - more documentation on context
511 #
512 # Revision 1.20 2006/11/06 09:59:42 ncq
513 # - when allowing non-list strings to turn into query list do not
514 # str() them or else we may lose unicodity
515 # - more u''ing
516 #
517 # Revision 1.19 2006/11/05 16:07:31 ncq
518 # - *_SQL2 now really handles context values, tested, too
519 # - some u''-ification
520 # - don't sort items in *_SQL2, rely on in-query ORDER BY instead
521 #
522 # Revision 1.18 2006/10/24 13:18:29 ncq
523 # - switch to gmPG2
524 # - remove cMatchProvider_SQL()
525 #
526 # Revision 1.17 2006/05/25 22:13:30 ncq
527 # - robustify set_context()
528 #
529 # Revision 1.16 2006/05/01 18:46:05 ncq
530 # - cleanup
531 #
532 # Revision 1.15 2005/06/14 18:54:40 ncq
533 # - don't sort in SQL2 matcher - queries should ORDER BY
534 #
535 # Revision 1.14 2005/06/12 21:20:55 ncq
536 # - make SQL2 match provider more robust regarding query list
537 #
538 # Revision 1.13 2005/06/12 21:16:55 ncq
539 # - make SQL2 match provider accept a query list
540 #
541 # Revision 1.12 2005/06/10 17:07:34 cfmoro
542 # Fixed set_context in SQL2
543 #
544 # Revision 1.11 2005/06/08 01:27:12 cfmoro
545 # Renamed function to make parent set_context work
546 #
547 # Revision 1.10 2005/06/07 10:16:37 ncq
548 # - setContext -> set_context
549 #
550 # Revision 1.9 2005/05/08 21:40:57 ncq
551 # - cleanup
552 #
553 # Revision 1.8 2005/04/14 18:24:57 ncq
554 # - some cleanup of funky magic so we are faster
555 #
556 # Revision 1.7 2005/04/11 18:00:54 ncq
557 # - cleanup
558 #
559 # Revision 1.6 2005/03/14 14:35:27 ncq
560 # - add match provider class cMatchProvider_Func which pulls
561 # match candidates through a function
562 #
563 # Revision 1.5 2004/07/17 21:08:51 ncq
564 # - gmPG.run_query() now has a verbosity parameter, so use it
565 #
566 # Revision 1.4 2004/05/02 22:54:43 ncq
567 # - cleanup
568 #
569 # Revision 1.3 2004/04/30 09:10:57 ncq
570 # - label needs to be str()ed in list.append()
571 #
572 # Revision 1.2 2004/03/10 12:56:01 ihaywood
573 # fixed sudden loss of main.shadow
574 # more work on referrals,
575 #
576 # Revision 1.1 2004/02/25 09:30:13 ncq
577 # - moved here from python-common
578 #
579 # Revision 1.13 2004/01/12 13:10:27 ncq
580 # - remove debugging code
581 #
582 # Revision 1.12 2004/01/06 10:02:47 ncq
583 # - add _SQL2 match provider that operates on queries rather than tables/columns
584 #
585 # Revision 1.11 2003/12/29 16:28:04 uid66147
586 # - I think we got the indentation level wrong when
587 # applying the extra condition default context
588 #
589 # Revision 1.10 2003/11/20 08:55:05 ncq
590 # - some internal cleanup/renaming
591 #
592 # Revision 1.9 2003/11/20 02:16:03 sjtan
593 #
594 # make __context_val in base class gmMatchProvider protected instead of class private, so subclasses can
595 # access it.
596 #
597 # Revision 1.8 2003/11/20 01:37:05 sjtan
598 #
599 # syntax correction.
600 #
601 # Revision 1.7 2003/11/20 00:33:12 ncq
602 # - improve comments on extra conditions in __find_matches()
603 #
604 # Revision 1.6 2003/11/19 23:18:37 ncq
605 # - some cleanup
606 #
607
| Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0.1 on Tue Feb 9 04:01:52 2010 | http://epydoc.sourceforge.net |