mirror of https://github.com/explosion/spaCy.git, synced 2024-12-27 18:36:36 +03:00
* Read in OntoNotes. Doesn't support train/test/dev split yet
parent 732fa7709a
commit e140e03516
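The rework replaces the old per-section WSJ converter with a single pass over the whole corpus: main(onto_dir, raw_dir, out_loc) first reads every non-WSJ genre straight from the OntoNotes annotation tree, then handles WSJ with alignment against the raw source docs, and dumps everything into one JSON file. Since the script hands main to plac.call, the three parameters map to positional command-line arguments; a hypothetical invocation (the script name and paths are placeholders, not taken from the commit) would look like:

    python <script>.py /corpora/ontonotes /corpora/wsj_raw ontonotes.json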
@@ -21,10 +21,13 @@ doc: {
 
 Consumes output of spacy/munge/align_raw.py
 """
+from __future__ import unicode_literals
 import plac
 import json
 from os import path
+import os
 import re
+import codecs
 
 from spacy.munge import read_ptb
 from spacy.munge import read_conll
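Two of the new imports feed helpers introduced further down: os for the directory listings in get_file_names, and codecs for read_file, which decodes annotation files as UTF-8 instead of reading raw bytes the way the old open(...).read() calls did. A minimal sketch of the difference under Python 2, using a hypothetical 'sample.name' file:

    import codecs
    text = codecs.open('sample.name', 'r', 'utf8').read()  # unicode object
    data = open('sample.name').read()                      # byte string (str)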
@@ -40,78 +43,150 @@ def _iter_raw_files(raw_loc):
 def format_doc(file_id, raw_paras, ptb_text, dep_text, ner_text):
     ptb_sents = read_ptb.split(ptb_text)
     dep_sents = read_conll.split(dep_text)
-    ner_sents = read_ner.split(ner_text) if ner_text is not None else None
-    assert len(ptb_sents) == len(dep_sents)
+    if len(ptb_sents) != len(dep_sents):
+        return None
+    if ner_text is not None:
+        ner_sents = read_ner.split(ner_text)
+    else:
+        ner_sents = [None] * len(ptb_sents)
 
     i = 0
-    doc = {'id': file_id, 'paragraphs': []}
-    for raw_sents in raw_paras:
-        para = {
-            'raw': ' '.join(sent.replace('<SEP>', '') for sent in raw_sents),
-            'sents': [],
-            'tokens': [],
-            'brackets': [],
-            'entities': []}
-        offset = 0
-        for raw_sent in raw_sents:
-            _, brackets = read_ptb.parse(ptb_sents[i], strip_bad_periods=True)
-            _, annot = read_conll.parse(dep_sents[i], strip_bad_periods=True)
-            if ner_sents is not None:
-                _, ner = read_ner.parse(ner_sents[i], strip_bad_periods=True)
-                assert len(ner) == len(annot)
-            else:
-                ner = ['-' for _ in annot]
-            for token_id, token in enumerate(annot):
-                try:
-                    head = (token['head'] + offset) if token['head'] != -1 else -1
-                    para['tokens'].append({
-                        'id': offset + token_id,
-                        'orth': token['word'],
-                        'tag': token['tag'],
-                        'head': head,
-                        'dep': token['dep'],
-                        'ner': ner[token_id]})
-                except:
-                    raise
-            for label, start, end in brackets:
-                if start != end:
-                    para['brackets'].append({
-                        'label': label,
-                        'first': start + offset,
-                        'last': (end-1) + offset})
-            i += 1
-            offset += len(annot)
-            para['sents'].append(offset)
-        doc['paragraphs'].append(para)
+    doc = {'id': file_id}
+    if raw_paras is None:
+        doc['paragraphs'] = [format_para(None, ptb_sents, dep_sents, ner_sents)]
+    else:
+        doc['paragraphs'] = []
+        for raw_sents in raw_paras:
+            doc['paragraphs'].append(
+                format_para(
+                    ' '.join(raw_sents).replace('<SEP>', ''),
+                    ptb_sents[i:i+len(raw_sents)],
+                    dep_sents[i:i+len(raw_sents)],
+                    ner_sents[i:i+len(raw_sents)]))
+            i += len(raw_sents)
     return doc
 
 
-def main(onto_dir, raw_dir, out_dir):
+def format_para(raw_text, ptb_sents, dep_sents, ner_sents):
+    para = {
+        'raw': raw_text,
+        'sents': [],
+        'tokens': [],
+        'brackets': []}
+    offset = 0
+    assert len(ptb_sents) == len(dep_sents) == len(ner_sents)
+    for ptb_text, dep_text, ner_text in zip(ptb_sents, dep_sents, ner_sents):
+        _, annot = read_conll.parse(dep_text, strip_bad_periods=True)
+        if ner_text is not None:
+            _, ner = read_ner.parse(ner_text, strip_bad_periods=True)
+        else:
+            ner = ['-' for _ in annot]
+        for token_id, (token, token_ent) in enumerate(zip(annot, ner)):
+            para['tokens'].append(format_token(offset, token_id, token, token_ent))
+
+        _, brackets = read_ptb.parse(ptb_text, strip_bad_periods=True)
+        for label, start, end in brackets:
+            if start != end:
+                para['brackets'].append({
+                    'label': label,
+                    'first': start + offset,
+                    'last': (end-1) + offset})
+        offset += len(annot)
+        para['sents'].append(offset)
+    return para
+
+
+def format_token(offset, token_id, token, ner):
+    head = (token['head'] + offset) if token['head'] != -1 else -1
+    return {
+        'id': offset + token_id,
+        'orth': token['word'],
+        'tag': token['tag'],
+        'head': head,
+        'dep': token['dep'],
+        'ner': ner}
+
+
+def read_file(*pieces):
+    loc = path.join(*pieces)
+    if not path.exists(loc):
+        return None
+    else:
+        return codecs.open(loc, 'r', 'utf8').read().strip()
+
+
+def get_file_names(section_dir, subsection):
+    filenames = []
+    for fn in os.listdir(path.join(section_dir, subsection)):
+        filenames.append(fn.rsplit('.', 1)[0])
+    return list(sorted(set(filenames)))
+
+
+def main(onto_dir, raw_dir, out_loc):
+    # All but WSJ --- we do that separately, as we have the source docs
+    sections = [
+        'bc/cctv',
+        'bc/cnn',
+        'bc/msnbc',
+        'bc/p2.5_a2e',
+        'bc/p2.5_c2e',
+        'bc/phoenix',
+        'bn/abc',
+        'bn/cnn',
+        'bn/mnb',
+        'bn/nbc',
+        'bn/p2.5_a2e',
+        'bn/p2.5_c2e',
+        'bn/pri',
+        'bn/voa',
+        'mz/sinorama',
+        'nw/dev_09_c2e',
+        'nw/p2.5_a2e',
+        'nw/p2.5_c2e',
+        'nw/xinhua',
+        'pt/ot',
+        'tc/ch',
+        'wb/a2e',
+        'wb/c2e',
+        'wb/eng',
+        'wb/dev_09_c2e',
+        'wb/p2.5_a2e',
+        'wb/p2.5_c2e',
+        'wb/sel'
+    ]
+    docs = []
+    for section in sections:
+        section_dir = path.join(onto_dir, 'data', 'english', 'annotations', section)
+        print section, len(docs)
+        for subsection in os.listdir(section_dir):
+            for fn in get_file_names(section_dir, subsection):
+                ptb = read_file(section_dir, subsection, '%s.parse' % fn)
+                dep = read_file(section_dir, subsection, '%s.parse.dep' % fn)
+                ner = read_file(section_dir, subsection, '%s.name' % fn)
+                if ptb is not None:
+                    doc = format_doc(fn, None, ptb, dep, ner)
+                    if doc is not None:
+                        docs.append(doc)
+    # Now do WSJ, with source alignment
+    onto_dir = path.join(onto_dir, 'data', 'english', 'annotations', 'nw', 'wsj')
     for i in range(25):
         section = str(i) if i >= 10 else ('0' + str(i))
         raw_loc = path.join(raw_dir, 'wsj%s.json' % section)
-        docs = []
         for j, (filename, raw_paras) in enumerate(_iter_raw_files(raw_loc)):
             if section == '00':
                 j += 1
             if section == '04' and filename == '55':
                 continue
-            ptb_loc = path.join(onto_dir, section, '%s.parse' % filename)
-            dep_loc = ptb_loc + '.dep'
-            ner_loc = path.join(onto_dir, section, '%s.name' % filename)
-            if path.exists(ptb_loc) and path.exists(dep_loc) and path.exists(ner_loc):
-                docs.append(
-                    format_doc(
-                        filename,
-                        raw_paras,
-                        open(ptb_loc).read().strip(),
-                        open(dep_loc).read().strip(),
-                        open(ner_loc).read().strip() if path.exists(ner_loc) else None))
-        with open(path.join(out_dir, '%s.json' % section), 'w') as file_:
-            json.dump(docs, file_, indent=4)
+            ptb = read_file(onto_dir, section, '%s.parse' % filename)
+            dep = read_file(onto_dir, section, '%s.parse.dep' % filename)
+            ner = read_file(onto_dir, section, '%s.name' % filename)
+            if ptb is not None and dep is not None:
+                docs.append(format_doc(filename, raw_paras, ptb, dep, ner))
+    print 'nw/wsj', len(docs)
+    with open(out_loc, 'w') as file_:
+        json.dump(docs, file_, indent=4)
 
 
 if __name__ == '__main__':
     plac.call(main)
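To make the new token records concrete: format_token shifts both the token id and the head index by the running paragraph offset, except that a head of -1 is kept as -1. A small worked example, assuming a hypothetical annotation dict in the shape read_conll.parse produces and an illustrative entity tag (key order in the printed dict is illustrative):

    # 10 tokens already emitted in this paragraph (offset=10); this is token 1.
    token = {'word': 'Vinken', 'tag': 'NNP', 'head': 2, 'dep': 'nsubj'}
    print format_token(10, 1, token, 'PERSON')
    # {'id': 11, 'orth': 'Vinken', 'tag': 'NNP', 'head': 12,
    #  'dep': 'nsubj', 'ner': 'PERSON'}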
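On the commit title's caveat: nothing here partitions the documents, so all genres land in a single list. A purely hypothetical follow-up, not part of this commit, assuming the conventional WSJ section split (02-21 train, 22 dev, 23 test) were wanted for the WSJ portion:

    # Hypothetical, not in this commit: map a WSJ section to a split name.
    def wsj_split(section):
        if section == '22':
            return 'dev'
        elif section == '23':
            return 'test'
        else:
            return 'train'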