| Home | Trees | Indices | Help |
|
|---|
|
|
1 """Base classes for match providers.
2
3 They are used by business objects to give
4 phrasewheels the ability to guess phrases.
5
6 Copyright (C) GNUMed developers
7 license: GPL v2 or later
8 """
9 __version__ = "$Revision: 1.34 $"
10 __author__ = "K.Hilbert <Karsten.Hilbert@gmx.net>, I.Haywood <ihaywood@gnu.org>, S.J.Tan <sjtan@bigpond.com>"
11
12 # std lib
13 import re as regex, logging
14
15
16 # GNUmed
17 from Gnumed.pycommon import gmPG2
18
19
20 _log = logging.getLogger('gm.ui')
21 _log.info(__version__)
22
23
# Characters which are stripped from the fragment passed to the
# match provider before looking for matches.
# The double quote is spliced in from a separately quoted literal and
# must sit *inside* the character class: appended after the closing
# "]+" the pattern would only match runs of ignored characters which
# happen to be followed by a double quote.
default_ignored_chars = r"[?!.'\(){}\[\]<>~#*$%^_" + '"' + r"]+"

# These are used to detect word boundaries which, in turn, is
# used to normalize word boundaries in the input fragment.
default_word_separators = '[- \t=+&:@]+'
32 #============================================================
34 """Base class for match providing objects.
35
36 Match sources might be:
37 - database tables
38 - flat files
39 - previous input
40 - config files
41 - in-memory list created on the fly
42 """
43 print_queries = False
44 #--------------------------------------------------------
46 self.setThresholds()
47
48 self._context_vals = {}
49 self.__ignored_chars = regex.compile(default_ignored_chars)
50 # used to normalize word boundaries:
51 self.__word_separators = regex.compile(default_word_separators)
52 #--------------------------------------------------------
53 # actions
54 #--------------------------------------------------------
56 """Return matches according to aFragment and matching thresholds.
57
58 FIXME: design decision: we dont worry about data source changes
59 during the lifetime of a MatchProvider
60 FIXME: append _("*get all items*") on truncation
61 """
62 # sanity check
63 if aFragment is None:
64 raise ValueError, 'Cannot find matches without a fragment.'
65
66 # user explicitly wants all matches
67 if aFragment == u'*':
68 return self.getAllMatches()
69
70 # case insensitivity
71 tmpFragment = aFragment.lower()
72 # remove ignored chars
73 if self.__ignored_chars is not None:
74 tmpFragment = self.__ignored_chars.sub('', tmpFragment)
75 # normalize word separators
76 if self.__word_separators is not None:
77 tmpFragment = u' '.join(self.__word_separators.split(tmpFragment))
78 # length in number of significant characters only
79 lngFragment = len(tmpFragment)
80
81 # order is important !
82 if lngFragment >= self.__threshold_substring:
83 return self.getMatchesBySubstr(tmpFragment)
84 elif lngFragment >= self.__threshold_word:
85 return self.getMatchesByWord(tmpFragment)
86 elif lngFragment >= self.__threshold_phrase:
87 return self.getMatchesByPhrase(tmpFragment)
88 else:
89 return (False, [])
90 #--------------------------------------------------------
93 #--------------------------------------------------------
96 #--------------------------------------------------------
99 #--------------------------------------------------------
102 #--------------------------------------------------------
103 # configuration
104 #--------------------------------------------------------
106 """Set match location thresholds.
107
108 - the fragment passed to getMatches() must contain at least this many
109 characters before it triggers a match search at:
110 1) phrase_start - start of phrase (first word)
111 2) word_start - start of any word within phrase
112 3) in_word - _inside_ any word within phrase
113 """
114 # sanity checks
115 if aSubstring < aWord:
116 _log.error('Setting substring threshold (%s) lower than word-start threshold (%s) does not make sense. Retaining original thresholds (%s:%s, respectively).' % (aSubstring, aWord, self.__threshold_substring, self.__threshold_word))
117 return False
118 if aWord < aPhrase:
119 _log.error('Setting word-start threshold (%s) lower than phrase-start threshold (%s) does not make sense. Retaining original thresholds (%s:%s, respectively).' % (aSubstring, aWord, self.__threshold_word, self.__threshold_phrase))
120 return False
121
122 # now actually reassign thresholds
123 self.__threshold_phrase = aPhrase
124 self.__threshold_word = aWord
125 self.__threshold_substring = aSubstring
126
127 return True
128 #--------------------------------------------------------
130 if word_separators is None:
131 self.__word_separators = None
132 else:
133 self.__word_separators = regex.compile(word_separators)
134
139
140 word_separators = property(_get_word_separators, _set_word_separators)
141 #--------------------------------------------------------
143 if ignored_chars is None:
144 self.__ignored_chars = None
145 else:
146 self.__ignored_chars = regex.compile(ignored_chars)
147
152
153 ignored_chars = property(_get_ignored_chars, _set_ignored_chars)
154 #--------------------------------------------------------
156 """Set value to provide context information for matches.
157
158 The matching code may ignore it depending on its exact
159 implementation. Names and values of the context depend
160 on what is being matched.
161
162 <context> -- the *placeholder* key *inside* the context
163 definition, not the context *definition* key
164 """
165 if context is None:
166 return False
167 self._context_vals[context] = val
168 return True
169 #--------------------------------------------------------
175 #------------------------------------------------------------
176 # usable instances
177 #------------------------------------------------------------
179 """Match provider where all possible options can be held
180 in a reasonably sized, pre-allocated list.
181 """
"""Set up the provider with a pre-allocated sequence of items.

aSeq must be a list or tuple of dicts, or None. Each dict must have
the keys (data, list_label, weight).

Raises TypeError if aSeq is of any other type.
"""
if not (aSeq is None or isinstance(aSeq, (list, tuple))):
    _log.error('fixed list match provider argument must be a list/tuple of dicts/None')
    raise TypeError('fixed list match provider argument must be a list/tuple of dicts/None')

self.__items = aSeq
cMatchProvider.__init__(self)
191 #--------------------------------------------------------
192 # internal matching algorithms
193 #
194 # if we end up here:
195 # - aFragment will not be "None"
196 # - aFragment will be lower case
197 # - we _do_ deliver matches (whether we find any is a different story)
198 #--------------------------------------------------------
200 """Return matches for aFragment at start of phrases."""
201 matches = []
202 # look for matches
203 for item in self.__items:
204 # at start of phrase, that is
205 if item['list_label'].lower().startswith(aFragment.lower()):
206 matches.append(item)
207 # no matches found
208 if len(matches) == 0:
209 return (False, [])
210
211 matches.sort(self.__cmp_items)
212 return (True, matches)
213 #--------------------------------------------------------
215 """Return matches for aFragment at start of words inside phrases."""
216 matches = []
217 # look for matches
218 for item in self.__items:
219 item_label = item['list_label'].lower()
220 fragment_pos = item_label.find(aFragment.lower())
221 # found at start of phrase
222 if fragment_pos == 0:
223 matches.append(item)
224 # found as a true substring
225 elif fragment_pos > 0:
226 # but use only if substring is at start of a word
227 if item_label[fragment_pos-1] == u' ':
228 matches.append(item)
229 # no matches found
230 if len(matches) == 0:
231 return (False, [])
232
233 matches.sort(self.__cmp_items)
234 return (True, matches)
235 #--------------------------------------------------------
237 """Return matches for aFragment as a true substring."""
238 matches = []
239 # look for matches
240 for item in self.__items:
241 if item['list_label'].lower().find(aFragment.lower()) != -1:
242 matches.append(item)
243 # no matches found
244 if len(matches) == 0:
245 return (False, [])
246
247 matches.sort(self.__cmp_items)
248 return (True, matches)
249 #--------------------------------------------------------
251 """Return all items."""
252 matches = self.__items
253 # no matches found
254 if len(matches) == 0:
255 return (False, [])
256
257 matches.sort(self.__cmp_items)
258 return (True, matches)
259 #--------------------------------------------------------
261 """items must be a list of dicts. Each dict must have the keys (data, list_label, weight)"""
262 self.__items = items
263 #--------------------------------------------------------
274 # ===========================================================
276 """Match provider which searches matches
277 in the results of a function call.
278 """
280 """get_candidates() must return a list of dicts, each of which must have the key 'list_label'."""
281 if get_candidates is None:
282 _log.error('must define function to retrieve match candidates list')
283 raise ValueError('must define function to retrieve match candidates list')
284
285 self._get_candidates = get_candidates
286 cMatchProvider.__init__(self)
287 #--------------------------------------------------------
288 # internal matching algorithms
289 #
290 # if we end up here:
291 # - aFragment will not be "None"
292 # - aFragment will be lower case
293 # - we _do_ deliver matches (whether we find any is a different story)
294 #--------------------------------------------------------
296 """Return matches for aFragment at start of phrases."""
297 matches = []
298 candidates = self._get_candidates()
299 # look for matches
300 for candidate in candidates:
301 # at start of phrase, that is
302 if aFragment.startswith(candidate['list_label'].lower()):
303 matches.append(candidate)
304 # no matches found
305 if len(matches) == 0:
306 return (False, [])
307
308 matches.sort(self.__cmp_candidates)
309 return (True, matches)
310 #--------------------------------------------------------
312 """Return matches for aFragment at start of words inside phrases."""
313 matches = []
314 candidates = self._get_candidates()
315 # look for matches
316 for candidate in candidates:
317 pos = candidate['list_label'].lower().find(aFragment)
318 # pos = string.find(string.lower(candidate['list_label']), aFragment)
319 # found as a true substring
320 # but use only if substring is at start of a word
321 # FIXME: use word seps
322 if (pos == 0) or (candidate['list_label'][pos-1] == u' '):
323 matches.append(candidate)
324 # no matches found
325 if len(matches) == 0:
326 return (False, [])
327
328 matches.sort(self.__cmp_candidates)
329 return (True, matches)
330 #--------------------------------------------------------
332 """Return matches for aFragment as a true substring."""
333 matches = []
334 candidates = self._get_candidates()
335 # look for matches
336 for candidate in candidates:
337 if candidate['list_label'].lower().find(aFragment) != -1:
338 # if string.find(string.lower(candidate['list_label']), aFragment) != -1:
339 matches.append(candidate)
340 # no matches found
341 if len(matches) == 0:
342 return (False, [])
343
344 matches.sort(self.__cmp_candidates)
345 return (True, matches)
346 #--------------------------------------------------------
350 #--------------------------------------------------------
354 # FIXME: do ordering
355 # if candidate1 < candidate2:
356 # return -1
357 # if candidate1 == candidate2:
358 # return 0
359 # return 1
360
361 # ===========================================================
363 """Match provider which searches matches
364 in possibly several database tables.
365
366 queries:
367 - a list of unicode strings
368 - each string is a query
369 - each string must contain: "... where <column> %(fragment_condition)s ..."
370 - each string can contain in the where clause: "... %(<context_key>)s ..."
371 - each query must return (data, list_label) and may return field_label as a third column
372
373 context definitions to be used in the queries
374 example: {'ctxt_country': {'where_part': 'and country = %(country)s', 'placeholder': 'country'}}
375 """
# Accept a single query as well as a sequence of queries. A tuple is
# converted into a list because failing queries get deleted from
# self._queries later on, which a tuple would not support.
if isinstance(queries, tuple):
    queries = list(queries)
elif not isinstance(queries, list):
    queries = [queries]

self._queries = queries

# context definitions, keyed by the name used in the queries
if context is None:
    self._context = {}
else:
    self._context = context

# arguments passed along with the queries when they are run
self._args = {}
cMatchProvider.__init__(self)
389 #--------------------------------------------------------
390 # internal matching algorithms
391 #
392 # if we end up here:
393 # - aFragment will not be "None"
394 # - aFragment will be lower case
395 # - we _do_ deliver matches (whether we find any is a different story)
396 #--------------------------------------------------------
398 """Return matches for aFragment at start of phrases."""
399
400 fragment_condition = u"ILIKE %(fragment)s"
401 self._args['fragment'] = u"%s%%" % aFragment
402
403 return self._find_matches(fragment_condition)
404 #--------------------------------------------------------
406 """Return matches for aFragment at start of words inside phrases."""
407
408 fragment_condition = u"~* %(fragment)s"
409 aFragment = gmPG2.sanitize_pg_regex(expression = aFragment, escape_all = False)
410 self._args['fragment'] = u"( %s)|(^%s)" % (aFragment, aFragment)
411
412 return self._find_matches(fragment_condition)
413 #--------------------------------------------------------
415 """Return matches for aFragment as a true substring."""
416
417 fragment_condition = u"ILIKE %(fragment)s"
418 self._args['fragment'] = u"%%%s%%" % aFragment
419
420 return self._find_matches(fragment_condition)
421 #--------------------------------------------------------
425 #--------------------------------------------------------
427 if self.print_queries:
428 print "----------------------"
429 matches = []
430 for query in self._queries:
431 where_fragments = {'fragment_condition': fragment_condition}
432
433 for context_key, context_def in self._context.items():
434 try:
435 placeholder = context_def['placeholder']
436 where_part = context_def['where_part']
437 self._args[placeholder] = self._context_vals[placeholder]
438 # we do have a context value for this key, so add the where condition
439 where_fragments[context_key] = where_part
440 if self.print_queries:
441 print "ctxt ph:", placeholder
442 print "ctxt where:", where_part
443 print "ctxt val:", self._context_vals[placeholder]
444 except KeyError:
445 # we don't have a context value for this key, so skip the where condition
446 where_fragments[context_key] = u''
447
448 cmd = query % where_fragments
449
450 if self.print_queries:
451 print "class:", self.__class__.__name__
452 print "ctxt:", self._context_vals
453 print "args:", self._args
454 print "query:", cmd
455
456 try:
457 rows, idx = gmPG2.run_ro_queries(queries = [{'cmd': cmd, 'args': self._args}])
458 except:
459 _log.exception('[%s]: error running match provider SQL, dropping query', self.__class__.__name__)
460 idx = self._queries.index(query)
461 del self._queries[idx]
462 break
463
464 # no matches found: try next query
465 if len(rows) == 0:
466 continue
467
468 for row in rows:
469 match = {'weight': 0}
470
471 try:
472 match['data'] = row['data']
473 except KeyError:
474 match['data'] = row[0]
475
476 try:
477 match['list_label'] = row['list_label']
478 except KeyError:
479 match['list_label'] = row[1]
480
481 # explicit "field_label" in result ?
482 try:
483 match['field_label'] = row['field_label']
484 # no
485 except KeyError:
486 # but does row[2] exist ?
487 try:
488 match['field_label'] = row[2]
489 # no: reuse "list_label"
490 except IndexError:
491 match['field_label'] = match['list_label']
492
493 # try:
494 # match['label'] = row['label']
495 # except KeyError:
496 # match['label'] = match['list_label']
497
498 matches.append(match)
499
500 return (True, matches)
501
502 # none found whatsoever
503 return (False, [])
504 #================================================================
505 if __name__ == '__main__':
506 pass
507
508 #================================================================
509
| Home | Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0.1 on Thu Jul 28 03:57:31 2011 | http://epydoc.sourceforge.net |