Mirror of https://github.com/explosion/spaCy.git (synced 2025-04-26 11:53:40 +03:00)
Work on prepare_treebank script, adding NER to it

commit 61885aee76 (parent 15bbbf4901)
@@ -4,18 +4,20 @@ doc: {
     id: string,
     paragraphs: [{
         raw: string,
-        segmented: string,
         sents: [int],
         tokens: [{
             start: int,
             tag: string,
             head: int,
             dep: string}],
+        ner: [{
+            start: int,
+            end: int,
+            label: string}],
         brackets: [{
             start: int,
             end: int,
-            label: string,
-            flabel: int}]}]}
+            label: string}]}]}
 
 Consumes output of spacy/munge/align_raw.py
 """
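For orientation, a document following the updated schema might look like this (values are illustrative, not from the corpus; note also that the code further down writes the entity key as 'entities' with 'first'/'last' fields, while this docstring sketches 'ner' with 'start'/'end'):

    {"id": "wsj_0001",
     "paragraphs": [{
         "raw": "Pierre Vinken, 61 years old, will join the board.",
         "sents": [11],
         "tokens": [{"start": 0, "tag": "NNP", "head": 1, "dep": "nn"},
                    {"start": 7, "tag": "NNP", "head": 8, "dep": "nsubj"}],
         "ner": [{"start": 0, "end": 2, "label": "PERSON"}],
         "brackets": [{"start": 0, "end": 1, "label": "NP"}]}]}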
@@ -26,6 +28,7 @@ import re
 
 from spacy.munge import read_ptb
 from spacy.munge import read_conll
+from spacy.munge import read_ner
 
 
 def _iter_raw_files(raw_loc):
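The read_ner module itself isn't part of this commit; all we see here are its call sites. From those, split() takes the raw text of an OntoNotes .name file and returns one string per sentence, and parse() returns a (words, entities) pair where entities are (label, start, end) token-offset tuples with an exclusive end. A minimal sketch of that contract, assuming ENAMEX-style markup in the .name files (the real module will differ in detail):

    import re

    def split(text):
        # One sentence per line, inside a <DOC> wrapper in the .name file.
        return [line for line in text.split('\n')
                if line and not line.startswith('<DOC')
                and not line.startswith('</DOC')]

    def parse(sent_text, strip_bad_periods=False):
        # strip_bad_periods is accepted for interface parity; the real module
        # also re-aligns tokens dropped by the PTB period fix-up.
        # Normalise '<ENAMEX TYPE="PERSON">Pierre Vinken</ENAMEX>' so the open
        # and close tags become standalone tokens, then walk the words.
        text = re.sub(r'<ENAMEX TYPE="([^"]+)"[^>]*>', r' <\1> ', sent_text)
        text = text.replace('</ENAMEX>', ' </> ')
        words, entities, label, start = [], [], None, 0
        for token in text.split():
            if token.startswith('<') and token.endswith('>'):
                if token == '</>':
                    entities.append((label, start, len(words)))  # end exclusive
                    label = None
                else:
                    label = token[1:-1]
                    start = len(words)
            else:
                words.append(token)
        return words, entities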
@@ -34,24 +37,30 @@ def _iter_raw_files(raw_loc):
         yield f
 
 
-def format_doc(section, filename, raw_paras, ptb_loc, dep_loc):
-    ptb_sents = read_ptb.split(open(ptb_loc).read())
-    dep_sents = read_conll.split(open(dep_loc).read())
+def format_doc(file_id, raw_paras, ptb_text, dep_text, ner_text):
+    ptb_sents = read_ptb.split(ptb_text)
+    dep_sents = read_conll.split(dep_text)
+    ner_sents = read_ner.split(ner_text) if ner_text is not None else None
 
     assert len(ptb_sents) == len(dep_sents)
 
     i = 0
-    doc = {'id': filename, 'paragraphs': []}
+    doc = {'id': file_id, 'paragraphs': []}
     for raw_sents in raw_paras:
         para = {
             'raw': ' '.join(sent.replace('<SEP>', '') for sent in raw_sents),
             'sents': [],
             'tokens': [],
-            'brackets': []}
+            'brackets': [],
+            'entities': []}
         offset = 0
         for raw_sent in raw_sents:
             _, brackets = read_ptb.parse(ptb_sents[i], strip_bad_periods=True)
             _, annot = read_conll.parse(dep_sents[i], strip_bad_periods=True)
+            if ner_sents is not None:
+                _, ner = read_ner.parse(ner_sents[i], strip_bad_periods=True)
+            else:
+                ner = None
             for token_id, token in enumerate(annot):
                 try:
                     head = (token['head'] + offset) if token['head'] != -1 else -1
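The head arithmetic is the subtle part: CoNLL head indices are sentence-local, so adding offset (the number of tokens already emitted for this paragraph) rebases them to paragraph-level token ids, while -1 (the sentence root) is left alone. A quick worked example with hypothetical values:

    # Second sentence of a paragraph whose first sentence had 5 tokens.
    offset = 5
    annot = [{'head': 1, 'dep': 'nsubj'},   # points at token 1 of this sentence
             {'head': -1, 'dep': 'ROOT'}]   # sentence root, stays -1

    heads = [(t['head'] + offset) if t['head'] != -1 else -1 for t in annot]
    assert heads == [6, -1]   # local token 1 -> paragraph-wide token 6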
@@ -63,11 +72,19 @@ def format_doc(section, filename, raw_paras, ptb_loc, dep_loc):
                     'dep': token['dep']})
                 except:
                     raise
+            if ner is not None:
+                for label, start, end in ner:
+                    if start != end:
+                        para['entities'].append({
+                            'label': label,
+                            'first': start + offset,
+                            'last': (end-1) + offset})
             for label, start, end in brackets:
                 if start != end:
-                    para['brackets'].append({'label': label,
-                        'start': start + offset,
-                        'end': (end-1) + offset})
+                    para['brackets'].append({
+                        'label': label,
+                        'first': start + offset,
+                        'last': (end-1) + offset})
             i += 1
             offset += len(annot)
         para['sents'].append(offset)
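The (label, start, end) tuples from read_ner use an exclusive end, so the emitted records convert to inclusive 'first'/'last' token ids and rebase by offset; zero-width spans (start == end) are dropped, as with brackets. For instance, with hypothetical values:

    offset = 10
    ner = [('PERSON', 0, 2), ('ORG', 4, 4)]   # second span is empty

    entities = [{'label': label, 'first': start + offset,
                 'last': (end - 1) + offset}
                for label, start, end in ner if start != end]
    assert entities == [{'label': 'PERSON', 'first': 10, 'last': 11}]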
@@ -87,9 +104,15 @@ def main(onto_dir, raw_dir, out_dir):
                 continue
             ptb_loc = path.join(onto_dir, section, '%s.parse' % filename)
             dep_loc = ptb_loc + '.dep'
-            if path.exists(ptb_loc) and path.exists(dep_loc):
-                doc = format_doc(section, filename, raw_paras, ptb_loc, dep_loc)
-                docs.append(doc)
+            ner_loc = path.join(onto_dir, section, '%s.name' % filename)
+            if path.exists(ptb_loc) and path.exists(dep_loc) and path.exists(ner_loc):
+                docs.append(
+                    format_doc(
+                        filename,
+                        raw_paras,
+                        open(ptb_loc).read().strip(),
+                        open(dep_loc).read().strip(),
+                        open(ner_loc).read().strip() if path.exists(ner_loc) else None))
         with open(path.join(out_dir, '%s.json' % section), 'w') as file_:
             json.dump(docs, file_, indent=4)
 
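Two things worth noting in this last hunk: the new guard requires path.exists(ner_loc), so the trailing 'if path.exists(ner_loc) else None' can never actually yield None here, and documents without a .name file are now skipped outright rather than formatted without NER. A quick sanity check of the emitted JSON might look like this (output path hypothetical):

    import json

    # Hypothetical output location for section '00'.
    with open('/tmp/treebank_out/00.json') as file_:
        docs = json.load(file_)

    # Every paragraph should now carry an 'entities' list alongside 'brackets'.
    n_ents = sum(len(para['entities'])
                 for doc in docs for para in doc['paragraphs'])
    print('%d docs, %d entities' % (len(docs), n_ents))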