Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthw Honnibal 2020-05-21 19:11:08 +02:00
commit 1729165e90
4 changed files with 66 additions and 37 deletions

View File

@ -26,7 +26,7 @@ def conllu2json(
Extract NER tags if available and convert them so that they follow Extract NER tags if available and convert them so that they follow
BILUO and the Wikipedia scheme BILUO and the Wikipedia scheme
""" """
MISC_NER_PATTERN = "\|?(?:name=)?(([A-Z_]+)-([A-Z_]+)|O)\|?" MISC_NER_PATTERN = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$"
msg = Printer(no_print=no_print) msg = Printer(no_print=no_print)
n_sents_info(msg, n_sents) n_sents_info(msg, n_sents)
docs = [] docs = []
@ -39,7 +39,7 @@ def conllu2json(
ner_map=ner_map, ner_map=ner_map,
merge_subtokens=merge_subtokens, merge_subtokens=merge_subtokens,
) )
has_ner_tags = has_ner(input_data, ner_tag_pattern=MISC_NER_PATTERN) has_ner_tags = has_ner(input_data, MISC_NER_PATTERN)
for i, example in enumerate(conll_data): for i, example in enumerate(conll_data):
raw += example.text raw += example.text
sentences.append( sentences.append(
@ -65,20 +65,19 @@ def conllu2json(
def has_ner(input_data, ner_tag_pattern): def has_ner(input_data, ner_tag_pattern):
""" """
Check the 10th column of the first token to determine if the file contains Check the MISC column for NER tags.
NER tags
""" """
for sent in input_data.strip().split("\n\n"): for sent in input_data.strip().split("\n\n"):
lines = sent.strip().split("\n") lines = sent.strip().split("\n")
if lines: if lines:
while lines[0].startswith("#"): while lines[0].startswith("#"):
lines.pop(0) lines.pop(0)
if lines: for line in lines:
parts = lines[0].split("\t") parts = line.split("\t")
id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
if re.search(ner_tag_pattern, misc): for misc_part in misc.split("|"):
if re.match(ner_tag_pattern, misc_part):
return True return True
else:
return False return False
@ -127,8 +126,9 @@ def get_entities(lines, tag_pattern, ner_map=None):
iob = [] iob = []
for misc in miscs: for misc in miscs:
tag_match = re.search(tag_pattern, misc)
iob_tag = "O" iob_tag = "O"
for misc_part in misc.split("|"):
tag_match = re.match(tag_pattern, misc_part)
if tag_match: if tag_match:
prefix = tag_match.group(2) prefix = tag_match.group(2)
suffix = tag_match.group(3) suffix = tag_match.group(3)
@ -140,6 +140,7 @@ def get_entities(lines, tag_pattern, ner_map=None):
iob_tag = "O" iob_tag = "O"
else: else:
iob_tag = prefix + "-" + suffix iob_tag = prefix + "-" + suffix
break
iob.append(iob_tag) iob.append(iob_tag)
return iob_to_biluo(iob) return iob_to_biluo(iob)

View File

@ -53,7 +53,7 @@ cdef class TokenAnnotation:
cdef public list deps cdef public list deps
cdef public list entities cdef public list entities
cdef public list sent_starts cdef public list sent_starts
cdef public list brackets cdef public dict brackets_by_start
cdef class DocAnnotation: cdef class DocAnnotation:

View File

@ -658,7 +658,18 @@ cdef class TokenAnnotation:
self.deps = deps if deps else [] self.deps = deps if deps else []
self.entities = entities if entities else [] self.entities = entities if entities else []
self.sent_starts = sent_starts if sent_starts else [] self.sent_starts = sent_starts if sent_starts else []
self.brackets = brackets if brackets else [] self.brackets_by_start = {}
if brackets:
for b_start, b_end, b_label in brackets:
self.brackets_by_start.setdefault(b_start, []).append((b_end, b_label))
@property
def brackets(self):
brackets = []
for start, ends_labels in self.brackets_by_start.items():
for end, label in ends_labels:
brackets.append((start, end, label))
return brackets
@classmethod @classmethod
def from_dict(cls, token_dict): def from_dict(cls, token_dict):
@ -811,8 +822,10 @@ cdef class Example:
s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], [] s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], []
s_brackets = [] s_brackets = []
sent_start_i = 0 sent_start_i = 0
t = self.token_annotation cdef TokenAnnotation t = self.token_annotation
split_examples = [] split_examples = []
cdef int b_start, b_end
cdef unicode b_label
for i in range(len(t.words)): for i in range(len(t.words)):
if i > 0 and t.sent_starts[i] == 1: if i > 0 and t.sent_starts[i] == 1:
s_example.set_token_annotation(ids=s_ids, s_example.set_token_annotation(ids=s_ids,
@ -836,9 +849,10 @@ cdef class Example:
s_deps.append(t.get_dep(i)) s_deps.append(t.get_dep(i))
s_ents.append(t.get_entity(i)) s_ents.append(t.get_entity(i))
s_sent_starts.append(t.get_sent_start(i)) s_sent_starts.append(t.get_sent_start(i))
s_brackets.extend((b[0] - sent_start_i, for b_end, b_label in t.brackets_by_start.get(i, []):
b[1] - sent_start_i, b[2]) s_brackets.append(
for b in t.brackets if b[0] == i) (i - sent_start_i, b_end - sent_start_i, b_label)
)
i += 1 i += 1
s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags, s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags,
pos=s_pos, morphs=s_morphs, lemmas=s_lemmas, heads=s_heads, pos=s_pos, morphs=s_morphs, lemmas=s_lemmas, heads=s_heads,
@ -904,8 +918,10 @@ cdef class Example:
examples = [examples] examples = [examples]
converted_examples = [] converted_examples = []
for ex in examples: for ex in examples:
if isinstance(ex, Example):
converted_examples.append(ex)
# convert string to Doc to Example # convert string to Doc to Example
if isinstance(ex, str): elif isinstance(ex, str):
if keep_raw_text: if keep_raw_text:
converted_examples.append(Example(doc=ex)) converted_examples.append(Example(doc=ex))
else: else:

View File

@ -29,14 +29,26 @@ def test_cli_converters_conllu2json():
assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"] assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]
def test_cli_converters_conllu2json_name_ner_map(): @pytest.mark.parametrize(
lines = [ "lines",
[
(
"1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O", "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",
"2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER", "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER",
"3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER", "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER",
"4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O", "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O",
"5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=B-BAD", "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=B-BAD",
] ),
(
"1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\t_",
"2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|NE=B-PER",
"3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tNE=L-PER",
"4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No",
"5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tNE=B-BAD",
),
],
)
def test_cli_converters_conllu2json_name_ner_map(lines):
input_data = "\n".join(lines) input_data = "\n".join(lines)
converted = conllu2json(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""}) converted = conllu2json(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""})
assert len(converted) == 1 assert len(converted) == 1