Mirror of https://github.com/explosion/spaCy.git, synced 2025-10-30 23:47:31 +03:00

	Tidy up and auto-format [ci skip]
commit cd90752193
parent bcd1b12f43
@@ -88,12 +88,21 @@ def convert(
             msg.info("Auto-detected sentence-per-line NER format")
             converter = converter_autodetect
         else:
-            msg.warn("Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert")
+            msg.warn(
+                "Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert"
+            )
     if converter not in CONVERTERS:
         msg.fail("Can't find converter for {}".format(converter), exits=1)
     # Use converter function to convert data
     func = CONVERTERS[converter]
-    data = func(input_data, n_sents=n_sents, seg_sents=seg_sents, use_morphology=morphology, lang=lang, model=model)
+    data = func(
+        input_data,
+        n_sents=n_sents,
+        seg_sents=seg_sents,
+        use_morphology=morphology,
+        lang=lang,
+        model=model,
+    )
     if output_dir != "-":
         # Export data to a file
         suffix = ".{}".format(file_type)
@@ -104,7 +113,9 @@ def convert(
             srsly.write_jsonl(output_file, data)
         elif file_type == "msg":
             srsly.write_msgpack(output_file, data)
-        msg.good("Generated output file ({} documents): {}".format(len(data), output_file))
+        msg.good(
+            "Generated output file ({} documents): {}".format(len(data), output_file)
+        )
     else:
         # Print to stdout
         if file_type == "json":

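The hunks above only rewrap the dispatch in `spacy convert`, but they show the pattern: the converter name is looked up in the `CONVERTERS` dict, the matching function is called on the raw input, and the resulting list of docs is handed to `srsly` for export. A minimal standalone sketch of that registry pattern, with a made-up converter and sample input rather than spaCy's real ones:

# Sketch of the CONVERTERS-style dispatch; the registry entry and the input
# below are invented for illustration, not spaCy's actual converters.
def line_converter(input_data, n_sents=10, **kwargs):
    # Pretend conversion: one "document" dict per non-empty input line.
    return [{"text": line} for line in input_data.splitlines() if line.strip()]

CONVERTERS = {"lines": line_converter}

converter = "lines"
if converter not in CONVERTERS:
    raise ValueError("Can't find converter for {}".format(converter))
func = CONVERTERS[converter]
data = func("I like London\nNew York is big", n_sents=10)
print(data)  # the real CLI passes this list to srsly.write_jsonl / write_msgpack
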
@@ -38,32 +38,50 @@ def conll_ner2json(input_data, n_sents=10, seg_sents=False, model=None, **kwargs
     doc_delimiter = "-DOCSTART- -X- O O"
     # check for existing delimiters, which should be preserved
     if "\n\n" in input_data and seg_sents:
-        msg.warn("Sentence boundaries found, automatic sentence segmentation with `-s` disabled.")
+        msg.warn(
+            "Sentence boundaries found, automatic sentence segmentation with "
+            "`-s` disabled."
+        )
         seg_sents = False
     if doc_delimiter in input_data and n_sents:
-        msg.warn("Document delimiters found, automatic document segmentation with `-n` disabled.")
+        msg.warn(
+            "Document delimiters found, automatic document segmentation with "
+            "`-n` disabled."
+        )
         n_sents = 0
     # do document segmentation with existing sentences
-    if "\n\n" in input_data and not doc_delimiter in input_data and n_sents:
+    if "\n\n" in input_data and doc_delimiter not in input_data and n_sents:
         n_sents_info(msg, n_sents)
         input_data = segment_docs(input_data, n_sents, doc_delimiter)
     # do sentence segmentation with existing documents
-    if not "\n\n" in input_data and doc_delimiter in input_data and seg_sents:
+    if "\n\n" not in input_data and doc_delimiter in input_data and seg_sents:
         input_data = segment_sents_and_docs(input_data, 0, "", model=model, msg=msg)
     # do both sentence segmentation and document segmentation according
     # to options
-    if not "\n\n" in input_data and not doc_delimiter in input_data:
+    if "\n\n" not in input_data and doc_delimiter not in input_data:
         # sentence segmentation required for document segmentation
         if n_sents > 0 and not seg_sents:
-            msg.warn("No sentence boundaries found to use with option `-n {}`. Use `-s` to automatically segment sentences or `-n 0` to disable.".format(n_sents))
+            msg.warn(
+                "No sentence boundaries found to use with option `-n {}`. "
+                "Use `-s` to automatically segment sentences or `-n 0` "
+                "to disable.".format(n_sents)
+            )
         else:
             n_sents_info(msg, n_sents)
-            input_data = segment_sents_and_docs(input_data, n_sents, doc_delimiter, model=model, msg=msg)
+            input_data = segment_sents_and_docs(
+                input_data, n_sents, doc_delimiter, model=model, msg=msg
+            )
     # provide warnings for problematic data
-    if not "\n\n" in input_data:
-        msg.warn("No sentence boundaries found. Use `-s` to automatically segment sentences.")
-    if not doc_delimiter in input_data:
-        msg.warn("No document delimiters found. Use `-n` to automatically group sentences into documents.")
+    if "\n\n" not in input_data:
+        msg.warn(
+            "No sentence boundaries found. Use `-s` to automatically segment "
+            "sentences."
+        )
+    if doc_delimiter not in input_data:
+        msg.warn(
+            "No document delimiters found. Use `-n` to automatically group "
+            "sentences into documents."
+        )
     output_docs = []
     for doc in input_data.strip().split(doc_delimiter):
         doc = doc.strip()
@@ -78,8 +96,10 @@ def conll_ner2json(input_data, n_sents=10, seg_sents=False, model=None, **kwargs
             cols = list(zip(*[line.split() for line in lines]))
             if len(cols) < 2:
                 raise ValueError(
-                    "The token-per-line NER file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert"
-            )
+                    "The token-per-line NER file is not formatted correctly. "
+                    "Try checking whitespace and delimiters. See "
+                    "https://spacy.io/api/cli#convert"
+                )
             words = cols[0]
             iob_ents = cols[-1]
             if len(cols) > 2:
@@ -110,7 +130,10 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
             msg.info("Segmenting sentences with parser from model '{}'.".format(model))
             sentencizer = nlp.get_pipe("parser")
     if not sentencizer:
-        msg.info("Segmenting sentences with sentencizer. (Use `-b model` for improved parser-based sentence segmentation.)")
+        msg.info(
+            "Segmenting sentences with sentencizer. (Use `-b model` for "
+            "improved parser-based sentence segmentation.)"
+        )
         nlp = MultiLanguage()
         sentencizer = nlp.create_pipe("sentencizer")
     lines = doc.strip().split("\n")
@@ -132,7 +155,7 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
 def segment_docs(input_data, n_sents, doc_delimiter):
     sent_delimiter = "\n\n"
     sents = input_data.split(sent_delimiter)
-    docs = [sents[i:i+n_sents] for i in range(0, len(sents), n_sents)]
+    docs = [sents[i : i + n_sents] for i in range(0, len(sents), n_sents)]
     input_data = ""
     for doc in docs:
         input_data += sent_delimiter + doc_delimiter
@@ -143,4 +166,7 @@ def segment_docs(input_data, n_sents, doc_delimiter):
 def n_sents_info(msg, n_sents):
     msg.info("Grouping every {} sentences into a document.".format(n_sents))
     if n_sents == 1:
-        msg.warn("To generate better training data, you may want to group sentences into documents with `-n 10`.")
+        msg.warn(
+            "To generate better training data, you may want to group "
+            "sentences into documents with `-n 10`."
+        )

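The `segment_docs` line touched above is only a spacing change, but the slice it reformats is the whole grouping step: sentences are split on blank lines and re-emitted in chunks of `n_sents`, each chunk headed by the `-DOCSTART-` delimiter. A self-contained sketch under that reading (the sample CoNLL-style input and the final join are illustrative, since the rest of the function falls outside this hunk):

# Group blank-line-separated sentences into documents of n_sents each,
# mirroring the slice in segment_docs. The input text is invented.
doc_delimiter = "-DOCSTART- -X- O O"
sent_delimiter = "\n\n"
n_sents = 2

input_data = "I O\nlike O\n\nLondon B-GPE\n\nNew B-GPE\nYork I-GPE\n\nnice O"
sents = input_data.split(sent_delimiter)
docs = [sents[i : i + n_sents] for i in range(0, len(sents), n_sents)]

output = ""
for doc in docs:
    output += sent_delimiter + doc_delimiter
    output += sent_delimiter + sent_delimiter.join(doc)
print(output.strip())  # two -DOCSTART- blocks, each with up to two sentences
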
@@ -34,7 +34,7 @@ def read_iob(raw_sents):
     for line in raw_sents:
         if not line.strip():
             continue
-        tokens = [t.split('|') for t in line.split()]
+        tokens = [t.split("|") for t in line.split()]
         if len(tokens[0]) == 3:
             words, pos, iob = zip(*tokens)
         elif len(tokens[0]) == 2:

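For context on the quote-style fix above: `read_iob` expects one sentence per line with `word|pos|iob` (or `word|iob`) tokens, splits each token on `|`, and transposes the fields with `zip(*tokens)`. A quick illustration with an invented line:

# Parse a word|pos|iob line the way read_iob does (the sample line is made up).
line = "I|PRP|O like|VBP|O London|NNP|B-GPE .|.|O"
tokens = [t.split("|") for t in line.split()]
words, pos, iob = zip(*tokens)  # transpose the three fields, as in the hunk
print(words)  # ('I', 'like', 'London', '.')
print(iob)    # ('O', 'O', 'B-GPE', 'O')
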
@@ -38,8 +38,8 @@ from . import about
 class BaseDefaults(object):
     @classmethod
     def create_lemmatizer(cls, nlp=None, lookups=None):
-        lemma_rules, lemma_index, lemma_exc, lemma_lookup = util.get_lemma_tables(lookups)
-        return Lemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup)
+        rules, index, exc, lookup = util.get_lemma_tables(lookups)
+        return Lemmatizer(index, exc, rules, lookup)
 
     @classmethod
     def create_lookups(cls, nlp=None):

@@ -89,10 +89,7 @@ TOKEN_PATTERN_SCHEMA = {
                 "title": "Fine-grained part-of-speech tag",
                 "$ref": "#/definitions/string_value",
             },
-            "DEP": {
-                "title": "Dependency label",
-                "$ref": "#/definitions/string_value"
-            },
+            "DEP": {"title": "Dependency label", "$ref": "#/definitions/string_value"},
             "LEMMA": {
                 "title": "Lemma (base form)",
                 "$ref": "#/definitions/string_value",

@@ -6,8 +6,13 @@ import pytest
 
 @pytest.mark.parametrize(
     "text,norms,lemmas",
-    [("о.г.", ["ове године"], ["ова година"]), ("чет.", ["четвртак"], ["четвртак"]),
-     ("гђа", ["госпођа"], ["госпођа"]), ("ил'", ["или"], ["или"])])
+    [
+        ("о.г.", ["ове године"], ["ова година"]),
+        ("чет.", ["четвртак"], ["четвртак"]),
+        ("гђа", ["госпођа"], ["госпођа"]),
+        ("ил'", ["или"], ["или"]),
+    ],
+)
 def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms, lemmas):
     tokens = sr_tokenizer(text)
     assert len(tokens) == 1

@@ -394,7 +394,7 @@ def test_attr_pipeline_checks(en_vocab):
         ([{"IS_PUNCT": True}], "."),
         ([{"IS_SPACE": True}], "\n"),
         ([{"IS_BRACKET": True}], "["),
-        ([{"IS_QUOTE": True}], "\""),
+        ([{"IS_QUOTE": True}], '"'),
         ([{"IS_LEFT_PUNCT": True}], "``"),
         ([{"IS_RIGHT_PUNCT": True}], "''"),
         ([{"IS_STOP": True}], "the"),
@@ -405,7 +405,7 @@ def test_attr_pipeline_checks(en_vocab):
 )
 def test_matcher_schema_token_attributes(en_vocab, pattern, text):
     matcher = Matcher(en_vocab)
-    doc = Doc(en_vocab, words=text.split(' '))
+    doc = Doc(en_vocab, words=text.split(" "))
     matcher.add("Rule", None, pattern)
     assert len(matcher) == 1
     matches = matcher(doc)

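The reformatted test above exercises the spaCy v2 `Matcher` API: `matcher.add(name, callback, pattern)` plus a call on a `Doc` that yields `(match_id, start, end)` tuples. A minimal usage sketch along the same lines (assumes a spaCy 2.x install; the pattern and text are arbitrary, and v3 later changed `add()` to take a list of patterns instead):

# Match quote tokens with the v2-style Matcher, mirroring the test above.
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("QUOTE", None, [{"IS_QUOTE": True}])  # v2 signature: name, callback, pattern
doc = nlp('She said " hi " to me')
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)
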
@@ -49,8 +49,10 @@ def test_cli_converters_iob2json():
         sent = converted[0]["paragraphs"][0]["sentences"][i]
         assert len(sent["tokens"]) == 8
         tokens = sent["tokens"]
+        # fmt: off
         assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
         assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]
+        # fmt: on
 
 
 def test_cli_converters_conll_ner2json():
@@ -113,8 +115,10 @@ def test_cli_converters_conll_ner2json():
         sent = converted[0]["paragraphs"][0]["sentences"][i]
         assert len(sent["tokens"]) == 8
         tokens = sent["tokens"]
+        # fmt: off
         assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
         assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]
+        # fmt: on
 
 
 def test_pretrain_make_docs():

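The only non-mechanical change in the two test hunks above is the `# fmt: off` / `# fmt: on` pair, which fences the long one-line asserts off from Black (the auto-formatter these markers belong to) so they are not rewrapped. For example, in a placeholder function of the same shape:

# Black leaves everything between the two markers exactly as written.
def check_tokens(tokens):  # placeholder, not part of spaCy's tests
    # fmt: off
    assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
    # fmt: on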