Spaces:
Runtime error
Runtime error
| #!/usr/bin/python | |
| #-------------------------------- | |
| # Written by Marzyeh Ghassemi, CSAIL, MIT | |
| # Sept 21, 2012 | |
| # Updated for Python 3, added Notebook, db connection | |
| # by Tom J. Pollard 13 Nov, 2017 | |
| # Please contact the author with errors found. | |
| # mghassem {AT} mit {DOT} edu | |
| #-------------------------------- | |
| from __future__ import with_statement | |
| import nltk | |
| import os | |
| import os.path | |
| import re | |
| import string | |
| import sys | |
| import time | |
def addToDrugs(line, drugs, listing, genList):
    """
    ###### function addToDrugs
    #   line:    line of text to search
    #   drugs:   list of 0/1 flags, modified in place; position i
    #            corresponds to genList[i]
    #   listing: dict of search terms in (generic: search pattern) form,
    #            where the pattern is a regular expression such as
    #            "generic|brand1|brand2"
    #   genList: sequence of all generic keys being searched for
    #
    #   Searches the provided line (case-insensitively) for the drugs that
    #   are listed. Inserts a 1 in the drugs list at the position which
    #   maps the found key to the generics list, and returns that same list.
    #
    #   Raises KeyError if a key of listing is not present in genList.
    """
    # Map each generic name to its position in genList. If a name occurs
    # more than once, the last position wins.
    position = {generic: idx for idx, generic in enumerate(genList)}

    for generic, names in listing.items():
        # An empty pattern matches every possible line, which would flag
        # the drug as present everywhere; treat it as "nothing to search".
        if not names:
            continue
        if re.search(names, line, re.I):
            drugs[position[generic]] = 1
    return drugs
def readDrugs(f, genList):
    """
    ###### function readDrugs
    #   f:       open, readable text file with one drug per line
    #   genList: list that collects the generic names; the list of generics
    #            read from this file is appended to it as ONE new element
    #
    #   Converts lines of the form "generic|brand1|brand2" to a
    #   dictionary keyed by "generic" with value "generic|brand1|brand2".
    #   All text is lower-cased. Blank lines are ignored, and a line without
    #   a '|' is treated as a generic that has no brand names.
    """
    entries = []
    for raw in f.read().split("\n"):
        raw = raw.rstrip("\r")
        # Skip blank lines: they carry no drug, and keeping them would
        # shift every later generic onto the wrong search pattern.
        if not raw.strip():
            continue
        entries.append(raw.lower())

    # The generic name is everything before the first vertical bar; taking
    # it from the same line as the pattern keeps the two aligned.
    generics = [entry.split("|", 1)[0] for entry in entries]

    genList.append(generics)
    return dict(zip(generics, entries))
def search(NOTES,
           SSRI_FILE = os.path.join(os.getcwd(), "SSRI_list.txt"),
           MISC_FILE = os.path.join(os.getcwd(), "MISC_list.txt"),
           SUMMARY_FILE = "output.csv",
           VERBOSE = False):
    """
    ###### Search the notes
    #   NOTES:        dataframe loaded from the noteevents table. Rows are
    #                 read with itertuples() and must expose Index, row_id,
    #                 subject_id, hadm_id and text.
    #   SSRI_FILE:    list of SSRI drugs to search for (required)
    #   MISC_FILE:    list of additional drugs to search for (optional; if
    #                 it cannot be opened, only the SSRI list is used)
    #   SUMMARY_FILE: name of the CSV output file. If it already exists,
    #                 a message is printed and nothing is processed.
    #   VERBOSE:      print extra diagnostics while parsing
    #
    #   NB: drug files should have a line for each distinct drug type,
    #       and drugs should be separated by a vertical bar '|'
    #
    #   Writes one CSV row per note and returns None.
    """
    if os.path.isfile(SUMMARY_FILE):
        print('The output file already exists.\n\nRemove the following file or save with a different filename:')
        print(os.path.join(os.getcwd(), SUMMARY_FILE))
        return

    starttime = time.time()

    # Keep a list of all generics we are looking for (one sub-list per file)
    genList = []

    # Get the drugs into a structure we can use
    with open(SSRI_FILE) as f:
        SSRI = readDrugs(f, genList)
    print("Using drugs from {}".format(SSRI_FILE))

    # The MISC list is optional: only a file that cannot be opened or read
    # is tolerated; any other error is a real problem and propagates.
    try:
        with open(MISC_FILE) as f:
            MISC = readDrugs(f, genList)
    except (IOError, OSError):
        MISC = None
        print("No additional drugs loaded from {}".format(MISC_FILE))
    else:
        print("Using additional drugs from {}".format(MISC_FILE))

    if MISC is None:
        # Keep an (empty) MISC group so every row still carries a value for
        # the "MISC" header column instead of shifting the drug columns left.
        genList.append([])

    flatList = [item for sublist in genList for item in sublist]

    # Slice bounds of each drug "type" (file) inside the flat list.
    # This allows us to understand which "types" are being used.
    bounds = []
    offset = 0
    for names in genList:
        bounds.append((offset, offset + len(names)))
        offset += len(names)

    # Patterns are compiled once, outside the per-line loop. The pattern
    # text is unchanged; raw strings only avoid invalid-escape warnings.
    header_re = re.compile(r"""^((\d|[A-Z])(\.|\)))?\s*([a-zA-Z',\.\-\*\d\[\]\(\) ]+)(:| WERE | IS | ARE |INCLUDED|INCLUDING)""", re.I)
    hist_re = re.compile(r'med(ical)?\s+hist(ory)?', re.I)
    meds_re = re.compile(r'medication|meds', re.I)
    disch_re = re.compile(r'disch(arge)?', re.I)
    admit_re = re.compile(r'admission|admitting|home|nh|nmeds|pre(\-|\s)?(hosp|op)|current|previous|outpatient|outpt|outside|^[^a-zA-Z]*med(ication)?(s)?', re.I)
    depression_re = re.compile(r'depression', re.I)
    depression_meds_re = re.compile(r'depression\s+med(ication)?(s)?', re.I)
    uncertain_re = re.compile(r'admission|discharge|transfer', re.I)

    # Limiting the analysis to discharge summaries is done in the SQL query
    # that produces NOTES, not here.

    # Write heads and notes to new doc
    with open(SUMMARY_FILE, 'a') as f_out:
        f_out.write('"ROW_ID","SUBJECT_ID","HADM_ID","HIST_FOUND","DEPRESSION","ADMIT_FOUND","DIS_FOUND","GEN_DEPRESS_MEDS_FOUND","GROUP","SSRI","MISC","' \
                    + '","'.join(flatList) + '"\n')

        # Parse each patient record
        print("Reading documents...")
        for note in NOTES.itertuples():
            if note.Index % 100 == 0:
                print("...index: {}. row_id: {}. subject_id: {}. hadm_id: {}. \n".format(note.Index, note.row_id, note.subject_id, note.hadm_id))
                sys.stdout.flush()

            # Reset some per-patient variables
            section = ""
            admitFound = 0      # admission medication section found
            dischargeFound = 0  # discharge medication section found
            histFound = 0       # medical history section found
            depressionHist = 0
            drugsAdmit = [0] * len(flatList)
            drugsDis = [0] * len(flatList)
            general_depression_drugs = 0

            # Read through lines sequentially
            # If this looks like a section header, start looking for drugs
            for line in note.text.split("\n"):

                # Searches for a section header based on heuristics
                if header_re.search(line):
                    newSection = ""

                    # Past Medical History Section
                    if hist_re.search(line):
                        newSection = "hist"
                        histFound = 1

                    # Discharge Medication Section
                    elif meds_re.search(line) and disch_re.search(line):
                        newSection = "discharge"
                        dischargeFound = 1

                    # Admitting Medication Section
                    elif admit_re.search(line) \
                            and (section == "admit" or meds_re.search(line)):
                        newSection = "admit"
                        admitFound = 1

                    # Any other header ends the current section
                    # (newSection stays "" -> now in a non-meds section)
                    section = newSection

                # If in history section, search for depression
                if section == "hist":
                    if depression_re.search(line):
                        depressionHist = 1

                # If in admission meds section, look at each line for specific drugs
                elif section == "admit":
                    drugsAdmit = addToDrugs(line, drugsAdmit, SSRI, flatList)
                    if MISC:
                        drugsAdmit = addToDrugs(line, drugsAdmit, MISC, flatList)

                    # Section just has something like 'Depression meds'
                    if depression_meds_re.search(line):
                        general_depression_drugs = 1

                # If in discharge meds section, look at each line for specific drugs
                elif section == "discharge":
                    drugsDis = addToDrugs(line, drugsDis, SSRI, flatList)
                    if MISC:
                        drugsDis = addToDrugs(line, drugsDis, MISC, flatList)

                # A line with information which we are uncertain about...
                elif meds_re.search(line) and uncertain_re.search(line):
                    if VERBOSE:
                        print('?? {}'.format(line))

            admitHit = 1 in drugsAdmit
            disHit = 1 in drugsDis

            # Default when no rule below applies (reported under VERBOSE)
            group = 0

            # Group 0: Patient has no medications on admission section (or no targeted meds)
            #          and medications on discharge from the list
            if dischargeFound == 1 and disHit and (admitFound == 0 or not admitHit):
                group = 0

            # Group 1: Patient has a medications on admission section with no targeted meds
            #          and no medications on discharge section
            elif admitFound == 1 and not admitHit and dischargeFound == 0 and general_depression_drugs == 0:
                group = 1

            # Group 2: Patient has medications on admission section, but none from the list
            #          and no medications on discharge from the list
            elif admitFound == 1 and not admitHit and dischargeFound == 1 and not disHit and general_depression_drugs == 0:
                group = 2

            # Group 3: Patient has medications on admission (at least one from the list)
            elif admitHit:
                group = 3

            elif VERBOSE:
                print('Uncertain about group type for row_id = {}'.format(note.row_id))

            if VERBOSE:
                print('group is {}'.format(group))

            # Flag, per drug type, whether any of its drugs was seen on admission.
            # NOTE(review): only admission drugs are written out; discharge
            # drugs influence the group only. The previous code built a
            # combined admit/discharge list but never used it -- confirm
            # that admission-only output is the intent.
            member = [int(1 in drugsAdmit[start:stop]) for start, stop in bounds]

            # save items to csv
            leading = [note.row_id, note.subject_id, note.hadm_id, histFound,
                       depressionHist, admitFound, dischargeFound,
                       general_depression_drugs, group]
            f_out.write(",".join(map(str, leading)) + ","
                        + ",".join(map(str, member)) + ","
                        + ",".join(map(str, drugsAdmit)) + "\n")

    # Print summary of analysis
    stoptime = time.time()
    elapsed = stoptime - starttime
    # Guard against a zero interval on clocks with coarse resolution
    rate = round(len(NOTES) / elapsed, 2) if elapsed > 0 else float("inf")
    print("Done analyzing {} documents in {} seconds ({} docs/sec)".format(len(NOTES),
          round(elapsed, 2), rate))
    print("Summary file is in {}".format(os.getcwd()))