mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
* Tmp commit, while switch to new format that assumes alignment happens during training
This commit is contained in:
parent
f35503018e
commit
983d954ef4
|
@ -52,7 +52,7 @@ def format_doc(section, filename, raw_paras, ptb_loc, dep_loc):
|
||||||
|
|
||||||
word_idx = 0
|
word_idx = 0
|
||||||
i = 0
|
i = 0
|
||||||
doc = {'id': 'wsj_%s%s' % (section, filename), 'paragraphs': []}
|
doc = {'id': filename, 'paragraphs': []}
|
||||||
for raw_sents in raw_paras:
|
for raw_sents in raw_paras:
|
||||||
para = {'raw': ' '.join(sent.replace('<SEP>', '') for sent in raw_sents),
|
para = {'raw': ' '.join(sent.replace('<SEP>', '') for sent in raw_sents),
|
||||||
'segmented': '<SENT>'.join(raw_sents),
|
'segmented': '<SENT>'.join(raw_sents),
|
||||||
|
@ -67,8 +67,8 @@ def format_doc(section, filename, raw_paras, ptb_loc, dep_loc):
|
||||||
_, annot = read_conll.parse(dep_sents[i], strip_bad_periods=True)
|
_, annot = read_conll.parse(dep_sents[i], strip_bad_periods=True)
|
||||||
indices, word_idx, offset = _get_word_indices(raw_sent, 0, offset)
|
indices, word_idx, offset = _get_word_indices(raw_sent, 0, offset)
|
||||||
for j, token in enumerate(annot):
|
for j, token in enumerate(annot):
|
||||||
head = indices[token['head']] if token['head'] != -1 else -1
|
|
||||||
try:
|
try:
|
||||||
|
head = indices[token['head']] if token['head'] != -1 else -1
|
||||||
para['tokens'].append({
|
para['tokens'].append({
|
||||||
'start': indices[token['id']],
|
'start': indices[token['id']],
|
||||||
'orth': words[j],
|
'orth': words[j],
|
||||||
|
@ -76,9 +76,6 @@ def format_doc(section, filename, raw_paras, ptb_loc, dep_loc):
|
||||||
'head': head,
|
'head': head,
|
||||||
'dep': token['dep']})
|
'dep': token['dep']})
|
||||||
except:
|
except:
|
||||||
print sorted(indices.items())
|
|
||||||
print token
|
|
||||||
print raw_sent
|
|
||||||
raise
|
raise
|
||||||
for label, start, end in brackets:
|
for label, start, end in brackets:
|
||||||
if start != end:
|
if start != end:
|
||||||
|
@ -95,20 +92,18 @@ def main(onto_dir, raw_dir, out_dir):
|
||||||
section = str(i) if i >= 10 else ('0' + str(i))
|
section = str(i) if i >= 10 else ('0' + str(i))
|
||||||
raw_loc = path.join(raw_dir, 'wsj%s.json' % section)
|
raw_loc = path.join(raw_dir, 'wsj%s.json' % section)
|
||||||
docs = []
|
docs = []
|
||||||
for j, raw_paras in enumerate(_iter_raw_files(raw_loc)):
|
for j, (filename, raw_paras) in enumerate(_iter_raw_files(raw_loc)):
|
||||||
if section == '00':
|
if section == '00':
|
||||||
j += 1
|
j += 1
|
||||||
filename = str(j) if j >= 9 else ('0' + str(j))
|
|
||||||
if section == '04' and filename == '55':
|
if section == '04' and filename == '55':
|
||||||
continue
|
continue
|
||||||
ptb_loc = path.join(onto_dir, section, 'wsj_%s%s.mrg' % (section, filename))
|
ptb_loc = path.join(onto_dir, section, '%s.parse' % filename)
|
||||||
dep_loc = ptb_loc + '.3.pa.gs.tab'
|
dep_loc = ptb_loc + '.dep'
|
||||||
if path.exists(ptb_loc) and path.exists(dep_loc):
|
if path.exists(ptb_loc) and path.exists(dep_loc):
|
||||||
print ptb_loc
|
|
||||||
doc = format_doc(section, filename, raw_paras, ptb_loc, dep_loc)
|
doc = format_doc(section, filename, raw_paras, ptb_loc, dep_loc)
|
||||||
docs.append(doc)
|
docs.append(doc)
|
||||||
with open(path.join(out_dir, '%s.json' % section), 'w') as file_:
|
with open(path.join(out_dir, '%s.json' % section), 'w') as file_:
|
||||||
json.dump(docs, file_)
|
json.dump(docs, file_, indent=4)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
Loading…
Reference in New Issue
Block a user