* Work on prepare_treebank script, adding NER to it

Matthew Honnibal 2015-05-26 19:28:29 +02:00
parent 15bbbf4901
commit 61885aee76


@@ -4,18 +4,20 @@ doc: {
     id: string,
     paragraphs: [{
         raw: string,
-        segmented: string,
         sents: [int],
         tokens: [{
             start: int,
             tag: string,
             head: int,
             dep: string}],
+        ner: [{
+            start: int,
+            end: int,
+            label: string}],
         brackets: [{
             start: int,
             end: int,
-            label: string,
-            flabel: int}]}]}
+            label: string}]}]}
 
 Consumes output of spacy/munge/align_raw.py
 """
@@ -26,6 +28,7 @@ import re
 
 from spacy.munge import read_ptb
 from spacy.munge import read_conll
+from spacy.munge import read_ner
 
 
 def _iter_raw_files(raw_loc):
@@ -34,24 +37,30 @@ def _iter_raw_files(raw_loc):
         yield f
 
 
-def format_doc(section, filename, raw_paras, ptb_loc, dep_loc):
-    ptb_sents = read_ptb.split(open(ptb_loc).read())
-    dep_sents = read_conll.split(open(dep_loc).read())
+def format_doc(file_id, raw_paras, ptb_text, dep_text, ner_text):
+    ptb_sents = read_ptb.split(ptb_text)
+    dep_sents = read_conll.split(dep_text)
+    ner_sents = read_ner.split(ner_text) if ner_text is not None else None
 
     assert len(ptb_sents) == len(dep_sents)
 
     i = 0
-    doc = {'id': filename, 'paragraphs': []}
+    doc = {'id': file_id, 'paragraphs': []}
     for raw_sents in raw_paras:
         para = {
             'raw': ' '.join(sent.replace('<SEP>', '') for sent in raw_sents),
             'sents': [],
             'tokens': [],
-            'brackets': []}
+            'brackets': [],
+            'entities': []}
         offset = 0
         for raw_sent in raw_sents:
             _, brackets = read_ptb.parse(ptb_sents[i], strip_bad_periods=True)
             _, annot = read_conll.parse(dep_sents[i], strip_bad_periods=True)
+            if ner_sents is not None:
+                _, ner = read_ner.parse(ner_sents[i], strip_bad_periods=True)
+            else:
+                ner = None
             for token_id, token in enumerate(annot):
                 try:
                     head = (token['head'] + offset) if token['head'] != -1 else -1
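
The new code path leans on spacy.munge.read_ner, which this commit doesn't show. Judging only from the call sites above, split() breaks the .name file into per-sentence strings and parse() returns a (tokens, spans) pair whose spans are end-exclusive (label, start, end) tuples. A hypothetical stand-in, assuming OntoNotes-style ENAMEX markup, might look like this; the real module may differ:

    # Hypothetical sketch of spacy.munge.read_ner; the interface is inferred
    # from the call sites, the internals are assumed. OntoNotes .name files
    # keep one sentence per line, entities wrapped in <ENAMEX TYPE="..."> tags.
    import re

    _open_tag = re.compile(r'<ENAMEX TYPE="([^"]+)">')

    def split(text):
        # One annotated sentence per non-empty line.
        return [line for line in text.strip().split('\n') if line.strip()]

    def parse(sent, strip_bad_periods=False):
        # strip_bad_periods is accepted for signature parity with read_ptb
        # and read_conll; this sketch does not implement it.
        tokens = []
        spans = []          # (label, start, end), end exclusive like brackets
        label = start = None
        for piece in sent.split():
            match = _open_tag.match(piece)
            if match:
                label, start = match.group(1), len(tokens)
                piece = piece[match.end():]
            if piece.endswith('</ENAMEX>'):
                piece = piece[:-len('</ENAMEX>')]
                if piece:
                    tokens.append(piece)
                spans.append((label, start, len(tokens)))
                label = start = None
                continue
            if piece:
                tokens.append(piece)
        return tokens, spans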
@@ -63,11 +72,19 @@ def format_doc(section, filename, raw_paras, ptb_loc, dep_loc):
                         'dep': token['dep']})
                 except:
                     raise
+            if ner is not None:
+                for label, start, end in ner:
+                    if start != end:
+                        para['entities'].append({
+                            'label': label,
+                            'first': start + offset,
+                            'last': (end-1) + offset})
             for label, start, end in brackets:
                 if start != end:
-                    para['brackets'].append({'label': label,
-                        'start': start + offset,
-                        'end': (end-1) + offset})
+                    para['brackets'].append({
+                        'label': label,
+                        'first': start + offset,
+                        'last': (end-1) + offset})
             i += 1
             offset += len(annot)
         para['sents'].append(offset)
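
The offset bookkeeping is the subtle part: the parsed spans are token-indexed within one sentence and end-exclusive, while the JSON records document-level, end-inclusive first/last indices. A toy check of that conversion (numbers invented):

    # Toy check of the span conversion above; all numbers are made up.
    offset = 12                            # tokens emitted by earlier sentences
    label, start, end = 'PERSON', 2, 4     # sentence-local, end-exclusive span
    entity = {'label': label,
              'first': start + offset,     # document-level index of first token
              'last': (end - 1) + offset}  # inclusive index of last token
    assert entity == {'label': 'PERSON', 'first': 14, 'last': 15}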
@@ -87,9 +104,15 @@ def main(onto_dir, raw_dir, out_dir):
                 continue
             ptb_loc = path.join(onto_dir, section, '%s.parse' % filename)
             dep_loc = ptb_loc + '.dep'
-            if path.exists(ptb_loc) and path.exists(dep_loc):
-                doc = format_doc(section, filename, raw_paras, ptb_loc, dep_loc)
-                docs.append(doc)
+            ner_loc = path.join(onto_dir, section, '%s.name' % filename)
+            if path.exists(ptb_loc) and path.exists(dep_loc) and path.exists(ner_loc):
+                docs.append(
+                    format_doc(
+                        filename,
+                        raw_paras,
+                        open(ptb_loc).read().strip(),
+                        open(dep_loc).read().strip(),
+                        open(ner_loc).read().strip() if path.exists(ner_loc) else None))
 
         with open(path.join(out_dir, '%s.json' % section), 'w') as file_:
             json.dump(docs, file_, indent=4)
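
With paths replaced by text arguments, format_doc can be exercised directly without going through main. A hypothetical call is sketched below (file names and paragraph structure are invented; note also that the path.exists(ner_loc) guard inside the call is redundant in this commit, since the enclosing if already requires the .name file to exist):

    # Hypothetical smoke test; file names and the raw paragraph are invented.
    doc = format_doc(
        'wsj_0001',
        [['Pierre Vinken will join the board .']],   # one paragraph, one sentence
        open('wsj_0001.parse').read().strip(),       # PTB constituency parses
        open('wsj_0001.parse.dep').read().strip(),   # CoNLL dependency rows
        open('wsj_0001.name').read().strip())        # OntoNotes NER, or None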