mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-27 02:16:32 +03:00
* Fix alignment in prepare_treebank
This commit is contained in:
parent
0605af6838
commit
e0ef6b6992
|
@ -16,6 +16,8 @@ doc: {
|
||||||
end: int,
|
end: int,
|
||||||
label: string,
|
label: string,
|
||||||
flabel: int}]}]}
|
flabel: int}]}]}
|
||||||
|
|
||||||
|
Consumes output of spacy/munge/align_raw.py
|
||||||
"""
|
"""
|
||||||
import plac
|
import plac
|
||||||
import json
|
import json
|
||||||
|
@ -39,7 +41,7 @@ def _get_word_indices(raw_sent, word_idx, offset):
|
||||||
indices[word_idx] = offset + match.start()
|
indices[word_idx] = offset + match.start()
|
||||||
word_idx += 1
|
word_idx += 1
|
||||||
offset += len(piece)
|
offset += len(piece)
|
||||||
return indices, word_idx, offset
|
return indices, word_idx, offset + 1
|
||||||
|
|
||||||
|
|
||||||
def format_doc(section, filename, raw_paras, ptb_loc, dep_loc):
|
def format_doc(section, filename, raw_paras, ptb_loc, dep_loc):
|
||||||
|
@ -49,25 +51,27 @@ def format_doc(section, filename, raw_paras, ptb_loc, dep_loc):
|
||||||
assert len(ptb_sents) == len(dep_sents)
|
assert len(ptb_sents) == len(dep_sents)
|
||||||
|
|
||||||
word_idx = 0
|
word_idx = 0
|
||||||
offset = 0
|
|
||||||
i = 0
|
i = 0
|
||||||
doc = {'id': 'wsj_%s%s' % (section, filename), 'paragraphs': []}
|
doc = {'id': 'wsj_%s%s' % (section, filename), 'paragraphs': []}
|
||||||
for raw_sents in raw_paras:
|
for raw_sents in raw_paras:
|
||||||
para = {'raw': ' '.join(sent.replace('<SEP>', '') for sent in raw_sents),
|
para = {'raw': ' '.join(sent.replace('<SEP>', '') for sent in raw_sents),
|
||||||
'segmented': '<PARA>'.join(raw_sents),
|
'segmented': '<SENT>'.join(raw_sents),
|
||||||
'sents': [],
|
'sents': [],
|
||||||
'tokens': [],
|
'tokens': [],
|
||||||
'brackets': []}
|
'brackets': []}
|
||||||
|
offset = 0
|
||||||
for raw_sent in raw_sents:
|
for raw_sent in raw_sents:
|
||||||
|
words = raw_sent.replace('<SEP>', ' ').split()
|
||||||
para['sents'].append(offset)
|
para['sents'].append(offset)
|
||||||
_, brackets = read_ptb.parse(ptb_sents[i], strip_bad_periods=True)
|
_, brackets = read_ptb.parse(ptb_sents[i], strip_bad_periods=True)
|
||||||
_, annot = read_conll.parse(dep_sents[i], strip_bad_periods=True)
|
_, annot = read_conll.parse(dep_sents[i], strip_bad_periods=True)
|
||||||
indices, word_idx, offset = _get_word_indices(raw_sent, 0, offset)
|
indices, word_idx, offset = _get_word_indices(raw_sent, 0, offset)
|
||||||
|
for j, token in enumerate(annot):
|
||||||
for token in annot:
|
head = indices[token['head']] if token['head'] != -1 else -1
|
||||||
head = indices[token['head']]
|
|
||||||
try:
|
try:
|
||||||
para['tokens'].append({'start': indices[token['id']],
|
para['tokens'].append({
|
||||||
|
'start': indices[token['id']],
|
||||||
|
'orth': words[j],
|
||||||
'tag': token['tag'],
|
'tag': token['tag'],
|
||||||
'head': head,
|
'head': head,
|
||||||
'dep': token['dep']})
|
'dep': token['dep']})
|
||||||
|
|
Loading…
Reference in New Issue
Block a user