mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
* Have prepare_treebank read train/dev/test IDs.
This commit is contained in:
parent
e140e03516
commit
ef1333cf89
|
@ -122,53 +122,10 @@ def get_file_names(section_dir, subsection):
|
|||
return list(sorted(set(filenames)))
|
||||
|
||||
|
||||
def main(onto_dir, raw_dir, out_loc):
|
||||
# All but WSJ --- we do that separately, as we have the source docs
|
||||
sections = [
|
||||
'bc/cctv',
|
||||
'bc/cnn',
|
||||
'bc/msnbc',
|
||||
'bc/p2.5_a2e',
|
||||
'bc/p2.5_c2e',
|
||||
'bc/phoenix',
|
||||
'bn/abc',
|
||||
'bn/cnn',
|
||||
'bn/mnb',
|
||||
'bn/nbc',
|
||||
'bn/p2.5_a2e',
|
||||
'bn/p2.5_c2e',
|
||||
'bn/pri',
|
||||
'bn/voa',
|
||||
'mz/sinorama',
|
||||
'nw/dev_09_c2e',
|
||||
'nw/p2.5_a2e',
|
||||
'nw/p2.5_c2e',
|
||||
'nw/xinhua',
|
||||
'pt/ot',
|
||||
'tc/ch',
|
||||
'wb/a2e',
|
||||
'wb/c2e',
|
||||
'wb/eng',
|
||||
'wb/dev_09_c2e',
|
||||
'wb/p2.5_a2e',
|
||||
'wb/p2.5_c2e',
|
||||
'wb/sel'
|
||||
]
|
||||
docs = []
|
||||
for section in sections:
|
||||
section_dir = path.join(onto_dir, 'data', 'english', 'annotations', section)
|
||||
print section, len(docs)
|
||||
for subsection in os.listdir(section_dir):
|
||||
for fn in get_file_names(section_dir, subsection):
|
||||
ptb = read_file(section_dir, subsection, '%s.parse' % fn)
|
||||
dep = read_file(section_dir, subsection, '%s.parse.dep' % fn)
|
||||
ner = read_file(section_dir, subsection, '%s.name' % fn)
|
||||
if ptb is not None:
|
||||
doc = format_doc(fn, None, ptb, dep, ner)
|
||||
if doc is not None:
|
||||
docs.append(doc)
|
||||
def read_wsj_with_source(onto_dir, raw_dir):
|
||||
# Now do WSJ, with source alignment
|
||||
onto_dir = path.join(onto_dir, 'data', 'english', 'annotations', 'nw', 'wsj')
|
||||
docs = {}
|
||||
for i in range(25):
|
||||
section = str(i) if i >= 10 else ('0' + str(i))
|
||||
raw_loc = path.join(raw_dir, 'wsj%s.json' % section)
|
||||
|
@ -181,12 +138,40 @@ def main(onto_dir, raw_dir, out_loc):
|
|||
dep = read_file(onto_dir, section, '%s.parse.dep' % filename)
|
||||
ner = read_file(onto_dir, section, '%s.name' % filename)
|
||||
if ptb is not None and dep is not None:
|
||||
docs.append(format_doc(filename, raw_paras, ptb, dep, ner))
|
||||
print 'nw/wsj', len(docs)
|
||||
with open(out_loc, 'w') as file_:
|
||||
json.dump(docs, file_, indent=4)
|
||||
docs[filename] = format_doc(filename, raw_paras, ptb, dep, ner)
|
||||
return docs
|
||||
|
||||
|
||||
def get_doc(onto_dir, file_path, wsj_docs):
    """Return the formatted document for one document ID, or None.

    Args:
        onto_dir: Passed through to read_file as its first argument.
        file_path: Document ID without a file extension; its components are
            separated by '/'. The last component is used as the file name.
        wsj_docs: Mapping from file name to an already formatted document.

    Returns:
        The entry from wsj_docs when the file name is present there.
        Otherwise the result of format_doc built from the '.parse',
        '.parse.dep' and '.name' files, or None when read_file returns None
        for either the parse or the dependency file.
    """
    # Index with [-1] rather than [1] so an ID that contains no '/' is used
    # as the file name itself instead of raising IndexError.
    filename = file_path.rsplit('/', 1)[-1]
    if filename in wsj_docs:
        return wsj_docs[filename]

    ptb = read_file(onto_dir, file_path + '.parse')
    dep = read_file(onto_dir, file_path + '.parse.dep')
    ner = read_file(onto_dir, file_path + '.name')
    # The '.name' result is passed along as-is; only the parse and the
    # dependency data are required to build a document.
    if ptb is None or dep is None:
        return None
    return format_doc(filename, None, ptb, dep, ner)
|
||||
|
||||
def read_ids(loc):
    """Read the document IDs stored one per line in the file at loc.

    Args:
        loc: Path of the ID file.

    Returns:
        A list of ID strings in file order. Surrounding whitespace is
        stripped from each line and blank lines are skipped, so an empty
        file yields an empty list.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(loc) as file_:
        lines = file_.read().splitlines()
    stripped = (line.strip() for line in lines)
    return [line for line in stripped if line]
|
||||
|
||||
def main(onto_dir, raw_dir, out_dir):
    """Write one JSON file of formatted documents per data partition.

    For each of 'train', 'test' and 'development', the IDs are read from
    '<partition>.id' inside onto_dir, each ID is resolved with get_doc, and
    the documents that are not None are dumped to '<partition>.json' inside
    out_dir.

    Args:
        onto_dir: Directory holding the '.id' files; also passed to
            read_wsj_with_source and get_doc.
        raw_dir: Passed to read_wsj_with_source.
        out_dir: Directory the '.json' files are written to.
    """
    wsj_docs = read_wsj_with_source(onto_dir, raw_dir)

    for partition in ('train', 'test', 'development'):
        id_loc = path.join(onto_dir, '%s.id' % partition)
        json_loc = path.join(out_dir, '%s.json' % partition)
        # Resolve every listed ID, then keep only the documents that exist.
        candidates = (get_doc(onto_dir, doc_id, wsj_docs)
                      for doc_id in read_ids(id_loc))
        kept = [doc for doc in candidates if doc is not None]
        with open(json_loc, 'w') as file_:
            json.dump(kept, file_, indent=4)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
plac.call(main)
|
||||
|
|
Loading…
Reference in New Issue
Block a user