Learn practical skills, build real-world projects, and advance your career
Created 3 years ago
Proof of concept (PoC): influence of user feedback on the original model
Create new entity
Improve existing entity
from __future__ import unicode_literals, print_function
import spacy
import en_core_web_sm
import en_core_web_md
# import en_core_web_lg
import random
from pathlib import Path
from spacy.util import minibatch, compounding
import tarfile
import xml.etree.ElementTree as ET
import os
from pandas import ExcelWriter
import pandas as pd
from sklearn.model_selection import train_test_split
import timeit
from spacy.gold import GoldParse
from spacy.scorer import Scorer
Preprocess the data
# unzip files
# fname = './2014 i2c2/track1 deid/training-PHI-Gold-Set2.tar.gz'
# tar = tarfile.open(fname, "r:gz")
# tar.extractall(path='./2014 i2c2/track1 deid')
# tar.close()
# fname = './2014 i2c2/track1 deid/2014_training-PHI-Gold-Set1.tar.gz'
# tar = tarfile.open(fname, "r:gz")
# tar.extractall(path='./2014 i2c2/track1 deid')
# tar.close()
# fname = './2014 i2c2/track1 deid/Track1-de-indentification.tar.gz'
# tar = tarfile.open(fname, "r:gz")
# tar.extractall(path='./2014 i2c2/track1 deid')
# tar.close()
# function to combine all the available XML records and save them to a new Excel file
def xml2Excel(inDirectory=None, outFileName=None, outDirectory=None):
    """Collect the text and tag attributes of every XML record into one Excel file.

    Each ``.xml`` file found directly in ``inDirectory`` becomes one row of
    the sheet ``Sheet1``: column ``text`` holds the text of the root's first
    child element, and column ``tags`` holds the list of attribute dicts of
    the elements nested under the root's second child element.

    Args:
        inDirectory: Directory that contains the ``.xml`` records.
        outFileName: File name of the Excel workbook to create.
        outDirectory: Directory the workbook is written to. When it is empty
            or ``None`` the workbook path is ``outFileName`` as given.

    Raises:
        TypeError: If ``outDirectory`` is given but ``outFileName`` is not a
            path-like value (raised by ``os.path.join``).
    """
    # Resolve the destination first so a bad argument fails before any file
    # is parsed. Previously this path was built but never used, so the
    # workbook was written to ``outFileName`` and ``outDirectory`` was ignored.
    if outDirectory:
        out_path = os.path.join(outDirectory, outFileName)
    else:
        out_path = outFileName

    raw_texts = []
    tags = []
    # Sorted so the row order does not depend on the directory listing order.
    for filename in sorted(os.listdir(inDirectory)):
        if not filename.endswith('.xml'):
            continue
        root = ET.parse(os.path.join(inDirectory, filename)).getroot()
        # NOTE(review): assumes root[0] is the raw-text element and root[1]
        # is the container of annotation elements -- confirm against the
        # record schema.
        raw_texts.append(root[0].text)
        tags.append([child.attrib for child in root[1]])

    df = pd.DataFrame({'text': raw_texts, 'tags': tags})
    # NOTE(review): each ``tags`` cell is a list of dicts; presumably the
    # Excel writer stores it in its string form -- verify when reading back.
    # The context manager saves and closes the workbook even if writing
    # fails; an explicit ``writer.save()`` is not available in recent pandas.
    with ExcelWriter(out_path) as writer:
        df.to_excel(writer, sheet_name='Sheet1', index=False)
# Export every combined record into a single workbook in the current
# directory, then load that workbook back into a DataFrame.
xml2Excel(
    inDirectory='./2014 i2c2/track1 deid/combined records/',
    outFileName='PythonExport.xlsx',
    outDirectory='./',
)
df = pd.read_excel('./PythonExport.xlsx')