Mirror of https://github.com/explosion/spaCy.git
Update conllu2json MISC column handling (#4715)

Update converter to handle various things in MISC column:

* `SpaceAfter=No` and set raw text accordingly
* plain NER tag
* name=NER (for NorNE)
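For reference, the MISC column is the tenth tab-separated field of a CoNLL-U token line. The NorNE-style data used in the new test below combines `SpaceAfter=No` with `name=` NER annotation in that field, e.g.:

```
2	Finn	Finn	PROPN	_	Gender=Masc	4	nsubj	_	SpaceAfter=No|name=B-PER
```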
parent 9aab0a55e1
commit 9efd3ccbef
spacy/cli/converters/conllu2json.py:

```diff
@@ -18,21 +18,28 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
     """
     # by @dvsrepo, via #11 explosion/spacy-dev-resources
     # by @katarkor
+    # name=NER is to handle NorNE
+    MISC_NER_PATTERN = "\|?(?:name=)?(([A-Z_]+)-([A-Z_]+)|O)\|?"
     docs = []
+    raw = ""
     sentences = []
     conll_data = read_conllx(input_data, use_morphology=use_morphology)
     checked_for_ner = False
     has_ner_tags = False
     for i, example in enumerate(conll_data):
         if not checked_for_ner:
-            has_ner_tags = is_ner(example.token_annotation.entities[0])
+            has_ner_tags = is_ner(example.token_annotation.entities[0],
+                    MISC_NER_PATTERN)
             checked_for_ner = True
-        sentences.append(generate_sentence(example.token_annotation, has_ner_tags))
+        raw += example.text
+        sentences.append(generate_sentence(example.token_annotation,
+                has_ner_tags, MISC_NER_PATTERN))
         # Real-sized documents could be extracted using the comments on the
         # conllu document
         if len(sentences) % n_sents == 0:
-            doc = create_doc(sentences, i)
+            doc = create_doc(raw, sentences, i)
             docs.append(doc)
+            raw = ""
             sentences = []
     if sentences:
-        doc = create_doc(sentences, i)
+        doc = create_doc(raw, sentences, i)
```
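A minimal standalone sketch (not part of the commit) of what `MISC_NER_PATTERN` extracts for the three MISC shapes the commit message lists; the sample values come from the new test data:

```python
import re

# Pattern from the diff above, written as a raw string here.
# Group 1 is the full NER tag; groups 2 and 3 are the IOB prefix
# and the entity type (both None for the plain "O" alternative).
MISC_NER_PATTERN = r"\|?(?:name=)?(([A-Z_]+)-([A-Z_]+)|O)\|?"

for misc in ["B-PER", "name=B-PER", "SpaceAfter=No|name=B-PER", "name=O"]:
    m = re.search(MISC_NER_PATTERN, misc)
    print(misc, "->", m.group(1), m.group(2), m.group(3))
# B-PER -> B-PER B PER
# name=B-PER -> B-PER B PER
# SpaceAfter=No|name=B-PER -> B-PER B PER
# name=O -> O None None
```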
```diff
@@ -40,12 +47,12 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
     return docs
 
 
-def is_ner(tag):
+def is_ner(tag, tag_pattern):
     """
     Check the 10th column of the first token to determine if the file contains
     NER tags
     """
-    tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
+    tag_match = re.search(tag_pattern, tag)
     if tag_match:
         return True
     elif tag == "O":
```
```diff
@@ -63,9 +70,10 @@ def read_conllx(input_data, use_morphology=False, n=0):
             while lines[0].startswith("#"):
                 lines.pop(0)
             ids, words, tags, heads, deps, ents = [], [], [], [], [], []
+            spaces = []
             for line in lines:
                 parts = line.split("\t")
-                id_, word, lemma, pos, tag, morph, head, dep, _1, iob = parts
+                id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
                 if "-" in id_ or "." in id_:
                     continue
                 try:
```
```diff
@@ -74,18 +82,27 @@ def read_conllx(input_data, use_morphology=False, n=0):
                     dep = "ROOT" if dep == "root" else dep
                     tag = pos if tag == "_" else tag
                     tag = tag + "__" + morph if use_morphology else tag
-                    iob = iob if iob else "O"
+                    ent = misc if misc else "O"
 
                     ids.append(id_)
                     words.append(word)
                     tags.append(tag)
                     heads.append(head)
                     deps.append(dep)
-                    ents.append(iob)
+                    ents.append(ent)
+                    if "SpaceAfter=No" in misc:
+                        spaces.append(False)
+                    else:
+                        spaces.append(True)
                 except:  # noqa: E722
                     print(line)
                     raise
-            example = Example(doc=None)
+            raw = ""
+            for word, space in zip(words, spaces):
+                raw += word
+                if space:
+                    raw += " "
+            example = Example(doc=raw)
             example.set_token_annotation(ids=ids, words=words, tags=tags,
                                          heads=heads, deps=deps, entities=ents)
             yield example
```
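The `SpaceAfter=No` handling above is what makes raw-text reconstruction possible: each word is emitted followed by a space unless its MISC field says otherwise. A standalone sketch of that loop, using the (word, MISC) pairs from the new test sentence:

```python
# (word, misc) pairs taken from the new test data below
tokens = [
    ("Dommer", "name=O"),
    ("Finn", "SpaceAfter=No|name=B-PER"),
    ("Eilertsen", "name=I-PER"),
    ("avstår", "SpaceAfter=No|name=O"),
    (".", "name=O"),
]

raw = ""
for word, misc in tokens:
    raw += word
    if "SpaceAfter=No" not in misc:
        raw += " "  # default: a space follows the token

print(raw.strip())  # Dommer FinnEilertsen avstår.
```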
```diff
@@ -94,7 +111,7 @@ def read_conllx(input_data, use_morphology=False, n=0):
                 break
 
 
-def simplify_tags(iob):
+def simplify_tags(iob, tag_pattern):
     """
     Simplify tags obtained from the dataset in order to follow Wikipedia
     scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while
```
```diff
@@ -103,26 +120,28 @@ def simplify_tags(iob):
     """
     new_iob = []
     for tag in iob:
-        tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
+        tag_match = re.search(tag_pattern, tag)
+        new_tag = "O"
         if tag_match:
-            prefix = tag_match.group(1)
-            suffix = tag_match.group(2)
-            if suffix == "GPE_LOC":
-                suffix = "LOC"
-            elif suffix == "GPE_ORG":
-                suffix = "ORG"
-            elif suffix != "PER" and suffix != "LOC" and suffix != "ORG":
-                suffix = "MISC"
-            tag = prefix + "-" + suffix
-        new_iob.append(tag)
+            prefix = tag_match.group(2)
+            suffix = tag_match.group(3)
+            if prefix and suffix:
+                if suffix == "GPE_LOC":
+                    suffix = "LOC"
+                elif suffix == "GPE_ORG":
+                    suffix = "ORG"
+                elif suffix != "PER" and suffix != "LOC" and suffix != "ORG":
+                    suffix = "MISC"
+                new_tag = prefix + "-" + suffix
+        new_iob.append(new_tag)
     return new_iob
 
 
-def generate_sentence(token_annotation, has_ner_tags):
+def generate_sentence(token_annotation, has_ner_tags, tag_pattern):
     sentence = {}
     tokens = []
     if has_ner_tags:
-        iob = simplify_tags(token_annotation.entities)
+        iob = simplify_tags(token_annotation.entities, tag_pattern)
         biluo = iob_to_biluo(iob)
     for i, id in enumerate(token_annotation.ids):
         token = {}
```
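A standalone sketch of the revised `simplify_tags` logic (same behavior, without the converter plumbing; `simplify` is a hypothetical helper name): pattern-matched tags are mapped onto the Wikipedia scheme, `GPE_LOC` and `GPE_ORG` collapse to `LOC` and `ORG`, other non-PER/LOC/ORG types become `MISC`, and anything unmatched falls back to `O`:

```python
import re

TAG_PATTERN = r"\|?(?:name=)?(([A-Z_]+)-([A-Z_]+)|O)\|?"

def simplify(tags):
    # mirrors simplify_tags() above
    simplified = []
    for tag in tags:
        match = re.search(TAG_PATTERN, tag)
        new_tag = "O"
        if match and match.group(2) and match.group(3):
            prefix, suffix = match.group(2), match.group(3)
            if suffix == "GPE_LOC":
                suffix = "LOC"
            elif suffix == "GPE_ORG":
                suffix = "ORG"
            elif suffix not in ("PER", "LOC", "ORG"):
                suffix = "MISC"
            new_tag = prefix + "-" + suffix
        simplified.append(new_tag)
    return simplified

print(simplify(["name=B-PER", "name=I-GPE_LOC", "name=B-GPE_ORG", "name=O", "_"]))
# ['B-PER', 'I-LOC', 'B-ORG', 'O', 'O']
```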
```diff
@@ -138,11 +157,12 @@ def generate_sentence(token_annotation, has_ner_tags):
     return sentence
 
 
-def create_doc(sentences, id):
+def create_doc(raw, sentences, id):
     doc = {}
     paragraph = {}
     doc["id"] = id
     doc["paragraphs"] = []
+    paragraph["raw"] = raw.strip()
     paragraph["sentences"] = sentences
     doc["paragraphs"].append(paragraph)
     return doc
```
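With `raw` threaded through `create_doc`, each output doc now carries the paragraph-level raw text. Roughly the shape the new test asserts (illustrative only, values abbreviated from the test expectations):

```python
doc = {
    "id": 0,
    "paragraphs": [
        {
            "raw": "Dommer FinnEilertsen avstår.",
            "sentences": [{"tokens": [...]}],  # five tokens in the test sentence
        }
    ],
}
```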
spacy/tests/test_cli.py:

```diff
@@ -32,6 +32,32 @@ def test_cli_converters_conllu2json():
     assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]
 
 
+def test_cli_converters_conllu2json():
+    # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
+    lines = [
+        "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",
+        "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER",
+        "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER",
+        "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O",
+        "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
+    ]
+    input_data = "\n".join(lines)
+    converted = conllu2json(input_data, n_sents=1)
+    assert len(converted) == 1
+    assert converted[0]["id"] == 0
+    assert len(converted[0]["paragraphs"]) == 1
+    assert converted[0]["paragraphs"][0]["raw"] == "Dommer FinnEilertsen avstår."
+    assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
+    sent = converted[0]["paragraphs"][0]["sentences"][0]
+    assert len(sent["tokens"]) == 5
+    tokens = sent["tokens"]
+    assert [t["orth"] for t in tokens] == ["Dommer", "Finn", "Eilertsen", "avstår", "."]
+    assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"]
+    assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1]
+    assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"]
+    assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O", "O"]
+
+
 def test_cli_converters_iob2json():
     lines = [
         "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
```
```diff
@@ -106,7 +132,6 @@ def test_cli_converters_conll_ner2json():
     ]
     input_data = "\n".join(lines)
     converted = conll_ner2json(input_data, n_sents=10)
-    print(converted)
     assert len(converted) == 1
     assert converted[0]["id"] == 0
     assert len(converted[0]["paragraphs"]) == 1
```