mirror of
https://github.com/explosion/spaCy.git
synced 2025-06-05 13:43:24 +03:00
* Change data format of JSON corpus, putting sentences into lists with the paragraph
This commit is contained in:
parent
784e577f45
commit
2d11739f28
|
@ -71,44 +71,44 @@ def format_doc(file_id, raw_paras, ptb_text, dep_text, ner_text):
|
||||||
|
|
||||||
|
|
||||||
def format_para(raw_text, ptb_sents, dep_sents, ner_sents):
    """Build the JSON-corpus dict for one paragraph.

    raw_text: the paragraph's raw text, stored under 'raw'.
    ptb_sents, dep_sents, ner_sents: parallel per-sentence annotation
    strings (PTB bracketing, CoNLL dependency, NER); all three must have
    the same length.

    Returns {'raw': raw_text, 'sentences': [...]}, one entry per kept
    sentence, each produced by format_sentence().
    """
    para = {'raw': raw_text, 'sentences': []}
    assert len(ptb_sents) == len(dep_sents) == len(ner_sents)
    for ptb_text, dep_text, ner_text in zip(ptb_sents, dep_sents, ner_sents):
        _, deps = read_conll.parse(dep_text, strip_bad_periods=True)
        # NOTE(review): this drops every sentence whose tag list contains
        # 'VERB' — confirm the condition is not meant to be inverted
        # (i.e. keep only sentences *with* a verb).
        if deps and 'VERB' in [t['tag'] for t in deps]:
            continue
        if ner_text is not None:
            _, ner = read_ner.parse(ner_text, strip_bad_periods=True)
        else:
            # No NER layer for this sentence: pad with '-' placeholders.
            ner = ['-' for _ in deps]
        _, brackets = read_ptb.parse(ptb_text, strip_bad_periods=True)
        # Necessary because the ClearNLP converter deletes EDITED words.
        if len(ner) != len(deps):
            ner = ['-' for _ in deps]
        para['sentences'].append(format_sentence(deps, ner, brackets))
    # Fix: removed the dead `offset = 0` left over from the pre-refactor
    # format; token ids are now per-sentence inside format_sentence().
    return para
||||||
def format_sentence(deps, ner, brackets):
    """Assemble one sentence dict with 'tokens' and 'brackets' lists.

    deps and ner are walked in lockstep; each (token, entity) pair is
    turned into a token dict via format_token(). Zero-width brackets
    (start == end) are discarded; 'last' is stored inclusive, hence
    end - 1.
    """
    tokens = [format_token(idx, tok, ent)
              for idx, (tok, ent) in enumerate(zip(deps, ner))]
    kept_brackets = [{'label': label, 'first': first, 'last': final - 1}
                     for label, first, final in brackets
                     if first != final]
    return {'tokens': tokens, 'brackets': kept_brackets}
|
|
||||||
|
|
||||||
|
def format_token(token_id, token, ner):
|
||||||
assert token_id == token['id']
|
assert token_id == token['id']
|
||||||
head = (token['head'] + offset) if token['head'] != -1 else -1
|
head = (token['head'] - token_id) if token['head'] != -1 else 0
|
||||||
return {
|
return {
|
||||||
'id': offset + token_id,
|
'id': token_id,
|
||||||
'orth': token['word'],
|
'orth': token['word'],
|
||||||
'tag': token['tag'],
|
'tag': token['tag'],
|
||||||
'head': head,
|
'head': head,
|
||||||
|
|
Loading…
Reference in New Issue
Block a user