| Trees | Indices | Help |
|
|---|
|
|
1 """GNUmed German XDT parsing objects.
2
3 This encapsulates some of the XDT data into
4 objects for easy access.
5 """
6 #==============================================================
7 # $Source: /cvsroot/gnumed/gnumed/gnumed/client/business/gmXdtObjects.py,v $
8 # $Id: gmXdtObjects.py,v 1.33 2009/05/04 11:39:47 ncq Exp $
9 __version__ = "$Revision: 1.33 $"
10 __author__ = "K.Hilbert, S.Hilbert"
11 __license__ = "GPL"
12
13 import os.path, sys, linecache, codecs, re as regex, time, datetime as pyDT, logging
14
15
16 import mx.DateTime as mxDT
17
18
19 if __name__ == '__main__':
20 sys.path.insert(0, '../../')
21 from Gnumed.pycommon import gmDateTime, gmTools
22 from Gnumed.business import gmXdtMappings, gmPerson
23
24
25 _log = logging.getLogger('gm.xdt')
26 _log.info(__version__)
27
28 #==============================================================
33 #==============================================================
35
36 f = codecs.open(filename=filename, mode='rU', encoding='utf8', errors='ignore')
37
38 file_encoding = None
39 for line in f:
40 field = line[3:7]
41 if field in gmXdtMappings._charset_fields:
42 _log.debug('found charset field [%s] in <%s>', field, filename)
43 val = line[7:8]
44 file_encoding = gmXdtMappings._map_field2charset[field][val]
45 _log.debug('encoding in file is "%s" (%s)', file_encoding, val)
46 break
47 f.close()
48
49 if file_encoding is None:
50 _log.debug('no encoding found in <%s>, assuming [%s]', filename, default_encoding)
51 return default_encoding
52
53 return file_encoding
54 #==============================================================
56
57 _map_id2name = {
58 '3101': 'lastnames',
59 '3102': 'firstnames',
60 '3103': 'dob',
61 '3110': 'gender',
62 '3106': 'zipurb',
63 '3107': 'street',
64 '3112': 'zip',
65 '3113': 'urb',
66 '8316': 'source'
67 }
68
69 needed_fields = (
70 '3101',
71 '3102'
72 )
73
74 interesting_fields = _map_id2name.keys()
75
76 data = {}
77
78 # try to find encoding if not given
79 if encoding is None:
80 encoding = determine_xdt_encoding(filename=filename)
81
82 xdt_file = codecs.open(filename=filename, mode='rU', encoding=encoding)
83
84 for line in xdt_file:
85
86 # # can't use more than what's interesting ... ;-)
87 # if len(data) == len(interesting_fields):
88 # break
89
90 line = line.replace('\015','')
91 line = line.replace('\012','')
92
93 # xDT line format: aaabbbbcccccccccccCRLF where aaa = length, bbbb = record type, cccc... = content
94 field = line[3:7]
95 # do we care about this line ?
96 if field in interesting_fields:
97 try:
98 already_seen = data[_map_id2name[field]]
99 break
100 except KeyError:
101 data[_map_id2name[field]] = line[7:]
102
103 xdt_file.close()
104
105 # found enough data ?
106 if len(data) < len(needed_fields):
107 raise ValueError('insufficient patient data in XDT file [%s], found only: %s' % (filename, data))
108
109 from Gnumed.business import gmPerson
110 dto = gmPerson.cDTO_person()
111
112 dto.firstnames = data['firstnames']
113 dto.lastnames = data['lastnames']
114
115 # CAVE: different data orders are possible, so configuration may be needed
116 # FIXME: detect xDT version and use default from the standard when dob_format is None
117 try:
118 dob = time.strptime(data['dob'], gmTools.coalesce(dob_format, '%d%m%Y'))
119 dto.dob = pyDT.datetime(dob.tm_year, dob.tm_mon, dob.tm_mday, tzinfo = gmDateTime.gmCurrentLocalTimezone)
120 except KeyError:
121 dto.dob = None
122
123 try:
124 dto.gender = gmXdtMappings.map_gender_xdt2gm[data['gender'].lower()]
125 except KeyError:
126 dto.gender = None
127
128 dto.zip = None
129 try:
130 dto.zip = regex.match('\d{5}', data['zipurb']).group()
131 except KeyError: pass
132 try:
133 dto.zip = data['zip']
134 except KeyError: pass
135
136 dto.urb = None
137 try:
138 dto.urb = regex.sub('\d{5} ', '', data['zipurb'])
139 except KeyError: pass
140 try:
141 dto.urb = data['urb']
142 except KeyError: pass
143
144 try:
145 dto.street = data['street']
146 except KeyError:
147 dto.street = None
148
149 try:
150 dto.source = data['source']
151 except KeyError:
152 dto.source = None
153
154 return dto
155 #==============================================================
157
159
"""Set up access to one LDT file and settle on the encoding to read it with.

The encoding declared inside the file (as reported by
determine_xdt_encoding()) is used unless it is absent or
`override_encoding` is true, in which case `encoding` is used.

Raises ValueError when no usable encoding is available.
"""
file_encoding = determine_xdt_encoding(filename=filename)
if file_encoding is None:
    _log.warning('LDT file <%s> does not specify encoding', filename)
    if encoding is None:
        raise ValueError('no encoding specified in file <%s> or method call' % filename)

# overriding only makes sense when the caller actually supplied an encoding
if override_encoding and encoding is None:
    raise ValueError('no encoding specified in method call for overriding encoding in file <%s>' % filename)

if override_encoding or file_encoding is None:
    chosen_encoding = encoding
else:
    chosen_encoding = file_encoding

self.encoding = chosen_encoding
self.filename = filename

# header and tail are read from the file on first access and then cached
self.__header = None
self.__tail = None
180 #----------------------------------------------------------
182
183 if self.__header is not None:
184 return self.__header
185
186 ldt_file = codecs.open(filename = self.filename, mode = 'rU', encoding = self.encoding)
187 self.__header = []
188 for line in ldt_file:
189 length, field, content = line[:3], line[3:7], line[7:].replace('\015','').replace('\012','')
190 # loop until found first LG-Bericht
191 if field == u'8000':
192 if content in [u'8202']:
193 break
194 self.__header.append(line)
195
196 ldt_file.close()
197 return self.__header
198
199 header = property(_get_header, lambda x:x)
200 #----------------------------------------------------------
202
203 if self.__tail is not None:
204 return self.__tail
205
206 ldt_file = codecs.open(filename = self.filename, mode = 'rU', encoding = self.encoding)
207 self.__tail = []
208 in_tail = False
209 for line in ldt_file:
210 if in_tail:
211 self.__tail.append(line)
212 continue
213
214 length, field, content = line[:3], line[3:7], line[7:].replace('\015','').replace('\012','')
215
216 # loop until found tail
217 if field == u'8000':
218 if content not in [u'8221']:
219 continue
220 in_tail = True
221 self.__tail.append(line)
222
223 ldt_file.close()
224 return self.__tail
225
226 tail = property(_get_tail, lambda x:x)
227 #----------------------------------------------------------
229
# Walks the LDT file line by line and copies lines to out_file, writing
# self.header / self.tail around records that start with field 8000 and
# content 8202 (LG-Bericht).
# NOTE(review): the listing has lost its indentation, so the nesting of the
# statements below is an assumption - confirm against the real file.
# NOTE(review): out_file starts as None and the only statement that would
# open it is commented out, so the out_file.write() calls look like they
# fail as soon as the first 8202 record is reached - confirm.
230 ldt_file = codecs.open(filename = self.filename, mode = 'rU', encoding = self.encoding)
231 out_file = None
232
233 in_patient = False
234 for line in ldt_file:
235
# NOTE(review): once in_patient is set, every following line appears to be
# written and skipped via "continue", so record starts would no longer be
# detected - confirm.
236 if in_patient:
237 out_file.write(line)
238 continue
239
# split the line into length, field ID and content (content without CR/LF)
240 length, field, content = line[:3], line[3:7], line[7:].replace('\015','').replace('\012','')
241
242 # start of record
243 if field == u'8000':
244 # start of LG-Bericht
245 if content == u'8202':
246 in_patient = True
247 if out_file is not None:
248 out_file.write(u''.join(self.tail))
249 out_file.close()
250 #out_file = codecs.open(filename=filename_xxxx, mode=xxxx_'rU', encoding=self.encoding)
251 out_file.write(u''.join(self.header))
252 else:
253 in_patient = False
254 if out_file is not None:
255 out_file.write(u''.join(self.tail))
256 out_file.close()
257
# finish the last output file if it is still open
258 if out_file is not None:
259 if not out_file.closed:
260 out_file.write(u''.join(self.tail))
261 out_file.close()
262
263 ldt_file.close()
264 #==============================================================
265 # FIXME: the following *should* get wrapped in class XdtFile ...
266 #--------------------------------------------------------------
268 pat_ids = []
269 pat_names = []
270 pats = {}
271 # xDT line format: aaabbbbcccccccccccCRLF where aaa = length, bbbb = record type, cccc... = content
272 # read patient dat
273 for line in fileinput.input(aFile):
274 # remove trailing CR and/or LF
275 line = line.replace('\015','')
276 line = line.replace('\012','')
277 # do we care about this line ?
278 field = line[3:7]
279 # yes, if type = patient id
280 if field == '3000':
281 pat_id = line[7:]
282 if pat_id not in pat_ids:
283 pat_ids.append(pat_id)
284 continue
285 # yes, if type = patient name
286 if field == '3101':
287 pat_name = line [7:]
288 if pat_name not in pat_names:
289 pat_names.append(pat_name)
290 pats[pat_id] = pat_name
291 continue
292 fileinput.close()
293
294 _log.debug("patients found: %s" % len(pat_ids))
295 return pats
296 #==============================================================
298 _log.debug("getting files for patient [%s:%s]" % (ID, name))
299 files = patlst.get(aGroup = "%s:%s" % (ID, name), anOption = "files")
300 _log.debug("%s => %s" % (patdir, files))
301 return [patdir, files]
302 #==============================================================
# Splits an xDT file into records (a record starts at a line with field
# 8000), hashes each record's lines and hands records whose hash is not yet
# known to write_xdt_pat_data() / add_file_to_patlst().
# NOTE(review): the listing has lost its indentation, so the nesting of the
# statements below cannot be read off reliably - confirm against the real file.
# NOTE(review): neither "fileinput" nor "md5" appears in the import line
# visible in this listing - confirm they are imported, otherwise this fails
# with NameError.
304 content=[]
305 lineno = []
306
307 # xDT line format: aaabbbbcccccccccccCRLF where aaa = length, bbbb = record type, cccc... = content
308
309 content = []
310 record_start_lines = []
311
312 # find record starts
313 for line in fileinput.input(aFile):
314 strippedline = line.replace('\015','')
315 strippedline = strippedline.replace('\012','')
316 # do we care about this line ? (records start with 8000)
317 if strippedline[3:7] == '8000':
318 record_start_lines.append(fileinput.filelineno())
319
320 # loop over patient records
321 for aline in record_start_lines:
322 # WHY +2 ?!?
323 line = linecache.getline(aFile,aline+2)
324 # remove trailing CR and/or LF
325 strippedline = line.replace('\015','')
326 strippedline = strippedline.replace('\012','')
327 # do we care about this line ?
328 field = strippedline[3:7]
329 # extract patient id
330 if field == '3000':
331 ID = strippedline[7:]
332 line = linecache.getline(aFile,aline+3)
333 # remove trailing CR and/or LF
334 strippedline = line.replace('\015','')
335 strippedline = strippedline.replace('\012','')
336 # do we care about this line ?
337 field = strippedline[3:7]
338 if field == '3101':
339 name = strippedline [7:]
340 startline=aline
# NOTE(review): for the last entry of record_start_lines there is no
# following element, so index(aline)+1 looks like it raises IndexError - confirm.
341 endline=record_start_lines[record_start_lines.index(aline)+1]
342 _log.debug("reading from%s" %str(startline)+' '+str(endline) )
343 for tmp in range(startline,endline):
344 content.append(linecache.getline(aFile,tmp))
345 _log.debug("reading %s"%tmp )
346 hashes = check_for_previous_records(ID,name,patlst)
347 # is this new content ?
348 data_hash = md5.new() # FIXME: use hashlib
349 map(data_hash.update, content)
350 digest = data_hash.hexdigest()
351 if digest not in hashes:
352 pat_dir = cfg.get("xdt-viewer", "export-dir")
353 file = write_xdt_pat_data(content, pat_dir)
# NOTE(review): "ahash" is not assigned anywhere in the visible body; the
# hash computed above is named "digest" - presumably that was meant, confirm.
354 add_file_to_patlst(ID, name, patlst, file, ahash)
355 content = []
356 else:
357 continue
358 # cleanup
359 fileinput.close()
360 patlst.store()
361 return 1
362 #==============================================================
364 tmpname = gmTools.get_unique_filename(prefix='', suffix = time.strftime(".%Y%m%d-%H%M%S", time.localtime()), tmp_dir=aDir)
365 path, fname = os.path.split(tmpname)
366 return fname
367 #==============================================================
369 """write record for this patient to new file"""
370 pat_file = open(os.path.join(aDir, get_rand_fname(aDir)), "w")
371 map(pat_file.write, data)
372 pat_file.close()
373 return fname
374 #==============================================================
376 anIdentity = "%s:%s" % (ID, name)
377 hashes = []
378 # patient not listed yet
379 if anIdentity not in patlst.getGroups():
380 _log.debug("identity not yet in list" )
381 patlst.set(aGroup = anIdentity, anOption = 'files', aValue = [], aComment = '')
382 # file already listed ?
383 file_defs = patlst.get(aGroup = anIdentity, anOption = "files")
384 for line in file_defs:
385 file, ahash = line.split(':')
386 hashes.append(ahash)
387
388 return hashes
389 #==============================================================
397 #==============================================================
398 # main
399 #--------------------------------------------------------------
if __name__ == "__main__":
    # manual test: print header and tail of the LDT file named on the command line
    from Gnumed.pycommon import gmI18N, gmLog2

    root_log = logging.getLogger()
    root_log.setLevel(logging.DEBUG)
    _log = logging.getLogger('gm.xdt')

    gmI18N.activate_locale()
    gmI18N.install_domain()
    gmDateTime.init()

    ldt = cLDTFile(filename = sys.argv[1])
    # each section is read from the file only when its property is accessed
    for section in ('header', 'tail'):
        print("%s:" % section)
        for line in getattr(ldt, section):
            print(line.encode('utf8', 'replace'))
419
420 # # test framework if run by itself
421 # patfile = sys.argv[1]
422 # dobformat = sys.argv[2]
423 # encoding = sys.argv[3]
424 # print "reading patient data from xDT file [%s]" % patfile
425
426 # dto = read_person_from_xdt(patfile, dob_format=dobformat, encoding=encoding)
427 # print "DTO:", dto
428 # print "dto.dob:", dto.dob, type(dto.dob)
429 # print "dto.dob.tz:", dto.dob.tzinfo
430 # print "dto.zip: %s dto.urb: %s" % (dto.zip, dto.urb)
431 # print "dto.street", dto.street
432 # searcher = gmPerson.cPatientSearcher_SQL()
433 # ident = searcher.get_identities(dto=dto)[0]
434 # print ident
435 ## print ident.get_medical_age()
436
437 #==============================================================
438 # $Log: gmXdtObjects.py,v $
439 # Revision 1.33 2009/05/04 11:39:47 ncq
440 # - md5 is gone
441 #
442 # Revision 1.32 2009/02/18 13:43:38 ncq
443 # - get_unique_filename API change
444 #
445 # Revision 1.31 2009/02/05 21:16:59 ncq
446 # - start supporting importing LDT
447 #
448 # Revision 1.30 2008/01/30 13:34:50 ncq
449 # - switch to std lib logging
450 #
451 # Revision 1.29 2007/07/11 21:05:10 ncq
452 # - use gmTools.get_unique_filename()
453 #
454 # Revision 1.28 2007/06/28 12:34:35 ncq
455 # - handle GDT source field, too
456 # - safer detection of subsequent records
457 # - improved date parsing logic
458 #
459 # Revision 1.27 2007/05/21 13:04:29 ncq
460 # - start class cDTO_xdt_person
461 #
462 # Revision 1.26 2007/02/22 17:28:45 ncq
463 # - improve test suite
464 #
465 # Revision 1.25 2007/01/21 12:20:45 ncq
466 # - add determine_xdt_encoding()
467 #
468 # Revision 1.24 2007/01/16 17:57:54 ncq
469 # - improve test suite
470 #
471 # Revision 1.23 2007/01/16 13:43:10 ncq
472 # - use gmDateTime.gmCurrentLocalTimezone for dto.dob
473 #
474 # Revision 1.22 2007/01/16 12:13:30 ncq
475 # - dto.dob now requires datetime.datetime
476 # - improve test suite
477 #
478 # Revision 1.21 2007/01/16 10:26:29 ncq
479 # - open xdt file in utf8 even for encoding detection since
480 # it can still contain umlauts et al
481 # - fix zipurb vs zip + urb handling
482 #
483 # Revision 1.20 2007/01/04 23:09:38 ncq
484 # - support explicit DOB format in xDT files
485 #
486 # Revision 1.19 2006/12/11 18:53:43 ncq
487 # - make read_person_from_xdt() recognize address data
488 #
489 # Revision 1.18 2006/10/30 16:42:27 ncq
490 # - use more gmXdtMappings
491 #
492 # Revision 1.17 2006/10/08 10:48:28 ncq
493 # - teach xdt reader to derive encoding from gdt 6301 record
494 #
495 # Revision 1.16 2006/09/13 07:54:32 ncq
496 # - clean up imports
497 # - handle source encoding in read_person_from_xdt()
498 #
499 # Revision 1.15 2006/09/12 17:19:53 ncq
500 # - xDT files have the gender in upper or lower case, so normalize to lower
501 #
502 # Revision 1.14 2006/07/22 11:01:00 ncq
503 # - make gender optional
504 #
505 # Revision 1.13 2006/07/19 20:43:59 ncq
506 # - remove cXDTPatient
507 #
508 # Revision 1.12 2006/07/17 18:02:50 ncq
509 # - cleanup, improve testing
510 # - add read_person_from_xdt() and use gmPerson.cDTO_person()
511 #
512 # Revision 1.11 2006/07/13 21:00:32 ncq
513 # - cleanup gender mappings
514 # - streamline cXdtPatient and improve test harness
515 #
516 # Revision 1.10 2006/05/12 12:05:04 ncq
517 # - cleanup
518 #
519 # Revision 1.9 2004/03/20 19:45:49 ncq
520 # - rename gender map
521 #
522 # Revision 1.8 2004/03/18 11:05:00 shilbert
523 # - fixed xDT-parsing in standalone mode
524 #
525 # Revision 1.7 2004/02/25 09:46:20 ncq
526 # - import from pycommon now, not python-common
527 #
528 # Revision 1.6 2003/11/17 10:56:35 sjtan
529 #
530 # synced and commiting.
531 #
532 # Revision 1.1 2003/10/23 06:02:38 sjtan
533 #
534 # manual edit areas modelled after r.terry's specs.
535 #
536 # Revision 1.5 2003/08/28 18:54:32 shilbert
537 # - corrected some minor glitches
538 #
539 # Revision 1.4 2003/08/27 14:58:58 ncq
540 # - added helpers written by shilbert for XdtViewer
541 #
542 # Revision 1.3 2003/04/19 22:56:03 ncq
543 # - speed up __load_data(), better encapsulate xdt file maps
544 #
545 # Revision 1.2 2003/02/18 02:43:16 ncq
546 # - rearranged __getitem__ to check self.__data last
547 #
548 # Revision 1.1 2003/02/17 23:33:14 ncq
549 # - first version
550 #
551
| Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0.1 on Tue Feb 9 04:01:41 2010 | http://epydoc.sourceforge.net |