Learn practical skills, build real-world projects, and advance your career

PoC: influence of user feedback on the original model

Create new entity

Improve existing entity

from __future__ import unicode_literals, print_function
import spacy
import en_core_web_sm
import en_core_web_md
# import en_core_web_lg
import random
from pathlib import Path
from spacy.util import minibatch, compounding
import tarfile
import xml.etree.ElementTree as ET
import os
from pandas import ExcelWriter
import pandas as pd
from sklearn.model_selection import train_test_split
import timeit
from spacy.gold import GoldParse
from spacy.scorer import Scorer

Preprocess the data

# unzip files
# fname = './2014 i2c2/track1 deid/training-PHI-Gold-Set2.tar.gz'
# tar = tarfile.open(fname, "r:gz")
# tar.extractall(path='./2014 i2c2/track1 deid')
# tar.close()
# fname = './2014 i2c2/track1 deid/2014_training-PHI-Gold-Set1.tar.gz'
# tar = tarfile.open(fname, "r:gz")
# tar.extractall(path='./2014 i2c2/track1 deid')
# tar.close()
# fname = './2014 i2c2/track1 deid/Track1-de-indentification.tar.gz'
# tar = tarfile.open(fname, "r:gz")
# tar.extractall(path='./2014 i2c2/track1 deid')
# tar.close()


# Function to combine all the available XML records and save them to a new Excel file
def xml2Excel(inDirectory=None, outFileName=None, outDirectory=None):
    """Combine every ``.xml`` record in a directory into one Excel sheet.

    For each file ending in ``.xml`` the text of the root's first child is
    taken as the raw document text, and the attribute dict of every element
    under the root's second child is collected as that document's tag list.
    The result is written to sheet ``Sheet1`` with two columns, ``text`` and
    ``tags``, one row per input file and no index column.

    Args:
        inDirectory: Directory scanned (non-recursively) for ``.xml`` files.
        outFileName: File name of the workbook to create.
        outDirectory: Directory the workbook is written to. When empty or
            ``None`` the file name is used as given (i.e. relative to the
            current working directory).

    Raises:
        xml.etree.ElementTree.ParseError: If a record is not well-formed XML.
        IndexError: If a record's root has fewer than two children.
    """
    rawTexts = []
    tags = []
    for filename in os.listdir(inDirectory):
        if not filename.endswith('.xml'):
            continue
        root = ET.parse(os.path.join(inDirectory, filename)).getroot()
        # NOTE(review): assumes the first child holds the text and the second
        # holds the annotation elements -- confirm against the data set schema.
        rawTexts.append(root[0].text)
        tags.append([child.attrib for child in root[1]])

    # Honour outDirectory (previously computed but ignored) and let
    # os.path.join supply the separator instead of raw string concatenation.
    if outDirectory:
        fullPath = os.path.join(outDirectory, outFileName)
    else:
        fullPath = outFileName

    df = pd.DataFrame({'text': rawTexts, 'tags': tags})
    # The context manager saves and closes the workbook even if to_excel
    # raises; ExcelWriter.save() no longer exists in current pandas.
    with ExcelWriter(fullPath) as writer:
        df.to_excel(writer, sheet_name='Sheet1', index=False)
# Export the combined records to a workbook in the current directory,
# then load that workbook back into a DataFrame for the steps that follow.
xml2Excel(
    inDirectory='./2014 i2c2/track1 deid/combined records/',
    outFileName='PythonExport.xlsx',
    outDirectory='./',
)
df = pd.read_excel('./PythonExport.xlsx')