diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py index ecdc2ae66..0b2920802 100644 --- a/spacy/cli/converters/conllu2json.py +++ b/spacy/cli/converters/conllu2json.py @@ -26,7 +26,7 @@ def conllu2json( Extract NER tags if available and convert them so that they follow BILUO and the Wikipedia scheme """ - MISC_NER_PATTERN = "\|?(?:name=)?(([A-Z_]+)-([A-Z_]+)|O)\|?" + MISC_NER_PATTERN = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$" msg = Printer(no_print=no_print) n_sents_info(msg, n_sents) docs = [] @@ -39,7 +39,7 @@ def conllu2json( ner_map=ner_map, merge_subtokens=merge_subtokens, ) - has_ner_tags = has_ner(input_data, ner_tag_pattern=MISC_NER_PATTERN) + has_ner_tags = has_ner(input_data, MISC_NER_PATTERN) for i, example in enumerate(conll_data): raw += example.text sentences.append( @@ -65,21 +65,20 @@ def conllu2json( def has_ner(input_data, ner_tag_pattern): """ - Check the 10th column of the first token to determine if the file contains - NER tags + Check the MISC column for NER tags. """ for sent in input_data.strip().split("\n\n"): lines = sent.strip().split("\n") if lines: while lines[0].startswith("#"): lines.pop(0) - if lines: - parts = lines[0].split("\t") + for line in lines: + parts = line.split("\t") id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts - if re.search(ner_tag_pattern, misc): - return True - else: - return False + for misc_part in misc.split("|"): + if re.match(ner_tag_pattern, misc_part): + return True + return False def read_conllx( @@ -127,19 +126,21 @@ def get_entities(lines, tag_pattern, ner_map=None): iob = [] for misc in miscs: - tag_match = re.search(tag_pattern, misc) iob_tag = "O" - if tag_match: - prefix = tag_match.group(2) - suffix = tag_match.group(3) - if prefix and suffix: - iob_tag = prefix + "-" + suffix - if ner_map: - suffix = ner_map.get(suffix, suffix) - if suffix == "": - iob_tag = "O" - else: - iob_tag = prefix + "-" + suffix + for misc_part in misc.split("|"): + tag_match = re.match(tag_pattern, misc_part) + if tag_match: + prefix = tag_match.group(2) + suffix = tag_match.group(3) + if prefix and suffix: + iob_tag = prefix + "-" + suffix + if ner_map: + suffix = ner_map.get(suffix, suffix) + if suffix == "": + iob_tag = "O" + else: + iob_tag = prefix + "-" + suffix + break iob.append(iob_tag) return iob_to_biluo(iob) diff --git a/spacy/gold.pxd b/spacy/gold.pxd index c5ab6ebbe..bf724868f 100644 --- a/spacy/gold.pxd +++ b/spacy/gold.pxd @@ -53,7 +53,7 @@ cdef class TokenAnnotation: cdef public list deps cdef public list entities cdef public list sent_starts - cdef public list brackets + cdef public dict brackets_by_start cdef class DocAnnotation: diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 46a6ae583..1864b7a04 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -658,7 +658,18 @@ cdef class TokenAnnotation: self.deps = deps if deps else [] self.entities = entities if entities else [] self.sent_starts = sent_starts if sent_starts else [] - self.brackets = brackets if brackets else [] + self.brackets_by_start = {} + if brackets: + for b_start, b_end, b_label in brackets: + self.brackets_by_start.setdefault(b_start, []).append((b_end, b_label)) + + @property + def brackets(self): + brackets = [] + for start, ends_labels in self.brackets_by_start.items(): + for end, label in ends_labels: + brackets.append((start, end, label)) + return brackets @classmethod def from_dict(cls, token_dict): @@ -811,8 +822,10 @@ cdef class Example: s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], [] s_brackets = [] sent_start_i = 0 - t = self.token_annotation + cdef TokenAnnotation t = self.token_annotation split_examples = [] + cdef int b_start, b_end + cdef unicode b_label for i in range(len(t.words)): if i > 0 and t.sent_starts[i] == 1: s_example.set_token_annotation(ids=s_ids, @@ -836,9 +849,10 @@ cdef class Example: s_deps.append(t.get_dep(i)) s_ents.append(t.get_entity(i)) s_sent_starts.append(t.get_sent_start(i)) - s_brackets.extend((b[0] - sent_start_i, - b[1] - sent_start_i, b[2]) - for b in t.brackets if b[0] == i) + for b_end, b_label in t.brackets_by_start.get(i, []): + s_brackets.append( + (i - sent_start_i, b_end - sent_start_i, b_label) + ) i += 1 s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags, pos=s_pos, morphs=s_morphs, lemmas=s_lemmas, heads=s_heads, @@ -904,8 +918,10 @@ cdef class Example: examples = [examples] converted_examples = [] for ex in examples: + if isinstance(ex, Example): + converted_examples.append(ex) # convert string to Doc to Example - if isinstance(ex, str): + elif isinstance(ex, str): if keep_raw_text: converted_examples.append(Example(doc=ex)) else: diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 306adc881..132f7ac9f 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -29,14 +29,26 @@ def test_cli_converters_conllu2json(): assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"] -def test_cli_converters_conllu2json_name_ner_map(): - lines = [ - "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O", - "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER", - "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER", - "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O", - "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=B-BAD", - ] +@pytest.mark.parametrize( + "lines", + [ + ( + "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O", + "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER", + "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER", + "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O", + "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=B-BAD", + ), + ( + "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\t_", + "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|NE=B-PER", + "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tNE=L-PER", + "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No", + "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tNE=B-BAD", + ), + ], +) +def test_cli_converters_conllu2json_name_ner_map(lines): input_data = "\n".join(lines) converted = conllu2json(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""}) assert len(converted) == 1