mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-01 00:17:44 +03:00 
			
		
		
		
	Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
		
						commit
						1729165e90
					
				|  | @ -26,7 +26,7 @@ def conllu2json( | |||
|     Extract NER tags if available and convert them so that they follow | ||||
|     BILUO and the Wikipedia scheme | ||||
|     """ | ||||
|     MISC_NER_PATTERN = "\|?(?:name=)?(([A-Z_]+)-([A-Z_]+)|O)\|?" | ||||
|     MISC_NER_PATTERN = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$" | ||||
|     msg = Printer(no_print=no_print) | ||||
|     n_sents_info(msg, n_sents) | ||||
|     docs = [] | ||||
|  | @ -39,7 +39,7 @@ def conllu2json( | |||
|         ner_map=ner_map, | ||||
|         merge_subtokens=merge_subtokens, | ||||
|     ) | ||||
|     has_ner_tags = has_ner(input_data, ner_tag_pattern=MISC_NER_PATTERN) | ||||
|     has_ner_tags = has_ner(input_data, MISC_NER_PATTERN) | ||||
|     for i, example in enumerate(conll_data): | ||||
|         raw += example.text | ||||
|         sentences.append( | ||||
|  | @ -65,21 +65,20 @@ def conllu2json( | |||
| 
 | ||||
| def has_ner(input_data, ner_tag_pattern): | ||||
|     """ | ||||
|     Check the 10th column of the first token to determine if the file contains | ||||
|     NER tags | ||||
|     Check the MISC column for NER tags. | ||||
|     """ | ||||
|     for sent in input_data.strip().split("\n\n"): | ||||
|         lines = sent.strip().split("\n") | ||||
|         if lines: | ||||
|             while lines[0].startswith("#"): | ||||
|                 lines.pop(0) | ||||
|             if lines: | ||||
|                 parts = lines[0].split("\t") | ||||
|             for line in lines: | ||||
|                 parts = line.split("\t") | ||||
|                 id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts | ||||
|                 if re.search(ner_tag_pattern, misc): | ||||
|                     return True | ||||
|                 else: | ||||
|                     return False | ||||
|                 for misc_part in misc.split("|"): | ||||
|                     if re.match(ner_tag_pattern, misc_part): | ||||
|                         return True | ||||
|     return False | ||||
| 
 | ||||
| 
 | ||||
| def read_conllx( | ||||
|  | @ -127,19 +126,21 @@ def get_entities(lines, tag_pattern, ner_map=None): | |||
| 
 | ||||
|     iob = [] | ||||
|     for misc in miscs: | ||||
|         tag_match = re.search(tag_pattern, misc) | ||||
|         iob_tag = "O" | ||||
|         if tag_match: | ||||
|             prefix = tag_match.group(2) | ||||
|             suffix = tag_match.group(3) | ||||
|             if prefix and suffix: | ||||
|                 iob_tag = prefix + "-" + suffix | ||||
|                 if ner_map: | ||||
|                     suffix = ner_map.get(suffix, suffix) | ||||
|                     if suffix == "": | ||||
|                         iob_tag = "O" | ||||
|                     else: | ||||
|                         iob_tag = prefix + "-" + suffix | ||||
|         for misc_part in misc.split("|"): | ||||
|             tag_match = re.match(tag_pattern, misc_part) | ||||
|             if tag_match: | ||||
|                 prefix = tag_match.group(2) | ||||
|                 suffix = tag_match.group(3) | ||||
|                 if prefix and suffix: | ||||
|                     iob_tag = prefix + "-" + suffix | ||||
|                     if ner_map: | ||||
|                         suffix = ner_map.get(suffix, suffix) | ||||
|                         if suffix == "": | ||||
|                             iob_tag = "O" | ||||
|                         else: | ||||
|                             iob_tag = prefix + "-" + suffix | ||||
|                 break | ||||
|         iob.append(iob_tag) | ||||
|     return iob_to_biluo(iob) | ||||
| 
 | ||||
|  |  | |||
|  | @ -53,7 +53,7 @@ cdef class TokenAnnotation: | |||
|     cdef public list deps | ||||
|     cdef public list entities | ||||
|     cdef public list sent_starts | ||||
|     cdef public list brackets | ||||
|     cdef public dict brackets_by_start | ||||
| 
 | ||||
| 
 | ||||
| cdef class DocAnnotation: | ||||
|  |  | |||
|  | @ -658,7 +658,18 @@ cdef class TokenAnnotation: | |||
|         self.deps = deps if deps else [] | ||||
|         self.entities = entities if entities else [] | ||||
|         self.sent_starts = sent_starts if sent_starts else [] | ||||
|         self.brackets = brackets if brackets else [] | ||||
|         self.brackets_by_start = {} | ||||
|         if brackets: | ||||
|             for b_start, b_end, b_label in brackets: | ||||
|                 self.brackets_by_start.setdefault(b_start, []).append((b_end, b_label)) | ||||
| 
 | ||||
|     @property | ||||
|     def brackets(self): | ||||
|         brackets = [] | ||||
|         for start, ends_labels in self.brackets_by_start.items(): | ||||
|             for end, label in ends_labels: | ||||
|                 brackets.append((start, end, label)) | ||||
|         return brackets | ||||
| 
 | ||||
|     @classmethod | ||||
|     def from_dict(cls, token_dict): | ||||
|  | @ -811,8 +822,10 @@ cdef class Example: | |||
|         s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], [] | ||||
|         s_brackets = [] | ||||
|         sent_start_i = 0 | ||||
|         t = self.token_annotation | ||||
|         cdef TokenAnnotation t = self.token_annotation | ||||
|         split_examples = [] | ||||
|         cdef int b_start, b_end | ||||
|         cdef unicode b_label | ||||
|         for i in range(len(t.words)): | ||||
|             if i > 0 and t.sent_starts[i] == 1: | ||||
|                 s_example.set_token_annotation(ids=s_ids, | ||||
|  | @ -836,9 +849,10 @@ cdef class Example: | |||
|             s_deps.append(t.get_dep(i)) | ||||
|             s_ents.append(t.get_entity(i)) | ||||
|             s_sent_starts.append(t.get_sent_start(i)) | ||||
|             s_brackets.extend((b[0] - sent_start_i, | ||||
|                                b[1] - sent_start_i, b[2]) | ||||
|                                for b in t.brackets if b[0] == i) | ||||
|             for b_end, b_label in t.brackets_by_start.get(i, []): | ||||
|                 s_brackets.append( | ||||
|                     (i - sent_start_i, b_end - sent_start_i, b_label) | ||||
|                 ) | ||||
|             i += 1 | ||||
|         s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags, | ||||
|                 pos=s_pos, morphs=s_morphs, lemmas=s_lemmas, heads=s_heads, | ||||
|  | @ -904,8 +918,10 @@ cdef class Example: | |||
|             examples = [examples] | ||||
|         converted_examples = [] | ||||
|         for ex in examples: | ||||
|             if isinstance(ex, Example): | ||||
|                 converted_examples.append(ex) | ||||
|             # convert string to Doc to Example | ||||
|             if isinstance(ex, str): | ||||
|             elif isinstance(ex, str): | ||||
|                 if keep_raw_text: | ||||
|                     converted_examples.append(Example(doc=ex)) | ||||
|                 else: | ||||
|  |  | |||
|  | @ -29,14 +29,26 @@ def test_cli_converters_conllu2json(): | |||
|     assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"] | ||||
| 
 | ||||
| 
 | ||||
| def test_cli_converters_conllu2json_name_ner_map(): | ||||
|     lines = [ | ||||
|         "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O", | ||||
|         "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER", | ||||
|         "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER", | ||||
|         "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O", | ||||
|         "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=B-BAD", | ||||
|     ] | ||||
| @pytest.mark.parametrize( | ||||
|     "lines", | ||||
|     [ | ||||
|         ( | ||||
|             "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O", | ||||
|             "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER", | ||||
|             "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER", | ||||
|             "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O", | ||||
|             "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=B-BAD", | ||||
|         ), | ||||
|         ( | ||||
|             "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\t_", | ||||
|             "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|NE=B-PER", | ||||
|             "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tNE=L-PER", | ||||
|             "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No", | ||||
|             "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tNE=B-BAD", | ||||
|         ), | ||||
|     ], | ||||
| ) | ||||
| def test_cli_converters_conllu2json_name_ner_map(lines): | ||||
|     input_data = "\n".join(lines) | ||||
|     converted = conllu2json(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""}) | ||||
|     assert len(converted) == 1 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user