mirror of https://github.com/explosion/spaCy.git
synced 2025-11-04 01:48:04 +03:00

	Tidy up and auto-format [ci skip]
parent bcd1b12f43
commit cd90752193

@@ -88,12 +88,21 @@ def convert(
             msg.info("Auto-detected sentence-per-line NER format")
             converter = converter_autodetect
         else:
-            msg.warn("Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert")
+            msg.warn(
+                "Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert"
+            )
     if converter not in CONVERTERS:
         msg.fail("Can't find converter for {}".format(converter), exits=1)
     # Use converter function to convert data
     func = CONVERTERS[converter]
-    data = func(input_data, n_sents=n_sents, seg_sents=seg_sents, use_morphology=morphology, lang=lang, model=model)
+    data = func(
+        input_data,
+        n_sents=n_sents,
+        seg_sents=seg_sents,
+        use_morphology=morphology,
+        lang=lang,
+        model=model,
+    )
     if output_dir != "-":
         # Export data to a file
         suffix = ".{}".format(file_type)

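Note: the reformatted `data = func(...)` call above is semantically identical to the original one-liner; the formatter only expands the argument list one per line and adds a trailing comma. A minimal sketch (illustrative stand-in names, not spaCy code) showing the two spellings produce the same call:

    def convert_stub(input_data, n_sents=1, seg_sents=False, use_morphology=False, lang=None, model=None):
        # Dummy stand-in for a CONVERTERS entry; just echoes its arguments.
        return (input_data, n_sents, seg_sents, use_morphology, lang, model)

    one_liner = convert_stub("raw text", n_sents=10, seg_sents=True, use_morphology=False, lang="en", model=None)
    wrapped = convert_stub(
        "raw text",
        n_sents=10,
        seg_sents=True,
        use_morphology=False,
        lang="en",
        model=None,
    )
    assert one_liner == wrapped  # line wrapping and the trailing comma change nothing
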
@@ -104,7 +113,9 @@ def convert(
             srsly.write_jsonl(output_file, data)
         elif file_type == "msg":
             srsly.write_msgpack(output_file, data)
-        msg.good("Generated output file ({} documents): {}".format(len(data), output_file))
+        msg.good(
+            "Generated output file ({} documents): {}".format(len(data), output_file)
+        )
     else:
         # Print to stdout
         if file_type == "json":

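Note: the `msg` helper used throughout these hunks (`msg.info`, `msg.warn`, `msg.good`, `msg.fail(..., exits=1)`) matches the API of wasabi's Printer, the console-output helper spaCy's CLI relies on. A minimal sketch, assuming the wasabi package is installed:

    from wasabi import Printer

    msg = Printer()
    msg.info("Grouping every 10 sentences into a document.")
    msg.warn("No sentence boundaries found.")        # warning, does not exit
    msg.good("Generated output file (3 documents)")  # success message
    # msg.fail("Can't find converter", exits=1)      # would print an error and exit with status 1
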
@@ -38,32 +38,50 @@ def conll_ner2json(input_data, n_sents=10, seg_sents=False, model=None, **kwargs
     doc_delimiter = "-DOCSTART- -X- O O"
     # check for existing delimiters, which should be preserved
     if "\n\n" in input_data and seg_sents:
-        msg.warn("Sentence boundaries found, automatic sentence segmentation with `-s` disabled.")
+        msg.warn(
+            "Sentence boundaries found, automatic sentence segmentation with "
+            "`-s` disabled."
+        )
         seg_sents = False
     if doc_delimiter in input_data and n_sents:
-        msg.warn("Document delimiters found, automatic document segmentation with `-n` disabled.")
+        msg.warn(
+            "Document delimiters found, automatic document segmentation with "
+            "`-n` disabled."
+        )
         n_sents = 0
     # do document segmentation with existing sentences
-    if "\n\n" in input_data and not doc_delimiter in input_data and n_sents:
+    if "\n\n" in input_data and doc_delimiter not in input_data and n_sents:
         n_sents_info(msg, n_sents)
         input_data = segment_docs(input_data, n_sents, doc_delimiter)
     # do sentence segmentation with existing documents
-    if not "\n\n" in input_data and doc_delimiter in input_data and seg_sents:
+    if "\n\n" not in input_data and doc_delimiter in input_data and seg_sents:
         input_data = segment_sents_and_docs(input_data, 0, "", model=model, msg=msg)
     # do both sentence segmentation and document segmentation according
     # to options
-    if not "\n\n" in input_data and not doc_delimiter in input_data:
+    if "\n\n" not in input_data and doc_delimiter not in input_data:
         # sentence segmentation required for document segmentation
         if n_sents > 0 and not seg_sents:
-            msg.warn("No sentence boundaries found to use with option `-n {}`. Use `-s` to automatically segment sentences or `-n 0` to disable.".format(n_sents))
+            msg.warn(
+                "No sentence boundaries found to use with option `-n {}`. "
+                "Use `-s` to automatically segment sentences or `-n 0` "
+                "to disable.".format(n_sents)
+            )
         else:
             n_sents_info(msg, n_sents)
-            input_data = segment_sents_and_docs(input_data, n_sents, doc_delimiter, model=model, msg=msg)
+            input_data = segment_sents_and_docs(
+                input_data, n_sents, doc_delimiter, model=model, msg=msg
+            )
     # provide warnings for problematic data
-    if not "\n\n" in input_data:
-        msg.warn("No sentence boundaries found. Use `-s` to automatically segment sentences.")
-    if not doc_delimiter in input_data:
-        msg.warn("No document delimiters found. Use `-n` to automatically group sentences into documents.")
+    if "\n\n" not in input_data:
+        msg.warn(
+            "No sentence boundaries found. Use `-s` to automatically segment "
+            "sentences."
+        )
+    if doc_delimiter not in input_data:
+        msg.warn(
+            "No document delimiters found. Use `-n` to automatically group "
+            "sentences into documents."
+        )
     output_docs = []
     for doc in input_data.strip().split(doc_delimiter):
         doc = doc.strip()

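Note: two patterns recur in the hunk above. Long messages are split using Python's implicit concatenation of adjacent string literals, and `not X in Y` is rewritten as the equivalent, more idiomatic `X not in Y`. A minimal sketch of both:

    # Adjacent string literals are joined at compile time, so splitting a long
    # message across lines does not change the resulting string.
    warning = (
        "Sentence boundaries found, automatic sentence segmentation with "
        "`-s` disabled."
    )
    assert warning == "Sentence boundaries found, automatic sentence segmentation with `-s` disabled."

    # `not x in y` and `x not in y` perform the same membership test.
    doc_delimiter = "-DOCSTART- -X- O O"
    input_data = "A sentence .\n\nAnother sentence ."
    assert (not doc_delimiter in input_data) == (doc_delimiter not in input_data)
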
@@ -78,8 +96,10 @@ def conll_ner2json(input_data, n_sents=10, seg_sents=False, model=None, **kwargs
             cols = list(zip(*[line.split() for line in lines]))
             if len(cols) < 2:
                 raise ValueError(
-                    "The token-per-line NER file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert"
-            )
+                    "The token-per-line NER file is not formatted correctly. "
+                    "Try checking whitespace and delimiters. See "
+                    "https://spacy.io/api/cli#convert"
+                )
             words = cols[0]
             iob_ents = cols[-1]
             if len(cols) > 2:

@@ -110,7 +130,10 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
             msg.info("Segmenting sentences with parser from model '{}'.".format(model))
             sentencizer = nlp.get_pipe("parser")
     if not sentencizer:
-        msg.info("Segmenting sentences with sentencizer. (Use `-b model` for improved parser-based sentence segmentation.)")
+        msg.info(
+            "Segmenting sentences with sentencizer. (Use `-b model` for "
+            "improved parser-based sentence segmentation.)"
+        )
         nlp = MultiLanguage()
         sentencizer = nlp.create_pipe("sentencizer")
     lines = doc.strip().split("\n")

@@ -132,7 +155,7 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
 def segment_docs(input_data, n_sents, doc_delimiter):
     sent_delimiter = "\n\n"
     sents = input_data.split(sent_delimiter)
-    docs = [sents[i:i+n_sents] for i in range(0, len(sents), n_sents)]
+    docs = [sents[i : i + n_sents] for i in range(0, len(sents), n_sents)]
     input_data = ""
     for doc in docs:
         input_data += sent_delimiter + doc_delimiter

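Note: the only change in `segment_docs` above is whitespace inside the slice (`sents[i : i + n_sents]`), the formatter's preferred spacing for slices with compound expressions; behaviour is unchanged. The comprehension itself groups sentences into fixed-size document chunks, for example:

    sents = ["s1", "s2", "s3", "s4", "s5"]
    n_sents = 2
    docs = [sents[i : i + n_sents] for i in range(0, len(sents), n_sents)]
    assert docs == [["s1", "s2"], ["s3", "s4"], ["s5"]]  # last chunk may be shorter
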
@@ -143,4 +166,7 @@ def segment_docs(input_data, n_sents, doc_delimiter):
 def n_sents_info(msg, n_sents):
     msg.info("Grouping every {} sentences into a document.".format(n_sents))
     if n_sents == 1:
-        msg.warn("To generate better training data, you may want to group sentences into documents with `-n 10`.")
+        msg.warn(
+            "To generate better training data, you may want to group "
+            "sentences into documents with `-n 10`."
+        )

@@ -34,7 +34,7 @@ def read_iob(raw_sents):
     for line in raw_sents:
         if not line.strip():
             continue
-        tokens = [t.split('|') for t in line.split()]
+        tokens = [t.split("|") for t in line.split()]
         if len(tokens[0]) == 3:
             words, pos, iob = zip(*tokens)
         elif len(tokens[0]) == 2:

@@ -38,8 +38,8 @@ from . import about
 class BaseDefaults(object):
     @classmethod
     def create_lemmatizer(cls, nlp=None, lookups=None):
-        lemma_rules, lemma_index, lemma_exc, lemma_lookup = util.get_lemma_tables(lookups)
-        return Lemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup)
+        rules, index, exc, lookup = util.get_lemma_tables(lookups)
+        return Lemmatizer(index, exc, rules, lookup)

     @classmethod
     def create_lookups(cls, nlp=None):

@@ -89,10 +89,7 @@ TOKEN_PATTERN_SCHEMA = {
                 "title": "Fine-grained part-of-speech tag",
                 "$ref": "#/definitions/string_value",
             },
-            "DEP": {
-                "title": "Dependency label",
-                "$ref": "#/definitions/string_value"
-            },
+            "DEP": {"title": "Dependency label", "$ref": "#/definitions/string_value"},
             "LEMMA": {
                 "title": "Lemma (base form)",
                 "$ref": "#/definitions/string_value",

@@ -6,8 +6,13 @@ import pytest


 @pytest.mark.parametrize(
     "text,norms,lemmas",
-    [("о.г.", ["ове године"], ["ова година"]), ("чет.", ["четвртак"], ["четвртак"]),
-     ("гђа", ["госпођа"], ["госпођа"]), ("ил'", ["или"], ["или"])])
+    [
+        ("о.г.", ["ове године"], ["ова година"]),
+        ("чет.", ["четвртак"], ["четвртак"]),
+        ("гђа", ["госпођа"], ["госпођа"]),
+        ("ил'", ["или"], ["или"]),
+    ],
+)
 def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms, lemmas):
     tokens = sr_tokenizer(text)
     assert len(tokens) == 1

@@ -394,7 +394,7 @@ def test_attr_pipeline_checks(en_vocab):
         ([{"IS_PUNCT": True}], "."),
         ([{"IS_SPACE": True}], "\n"),
         ([{"IS_BRACKET": True}], "["),
-        ([{"IS_QUOTE": True}], "\""),
+        ([{"IS_QUOTE": True}], '"'),
         ([{"IS_LEFT_PUNCT": True}], "``"),
         ([{"IS_RIGHT_PUNCT": True}], "''"),
         ([{"IS_STOP": True}], "the"),

@@ -405,7 +405,7 @@ def test_attr_pipeline_checks(en_vocab):
 )
 def test_matcher_schema_token_attributes(en_vocab, pattern, text):
     matcher = Matcher(en_vocab)
-    doc = Doc(en_vocab, words=text.split(' '))
+    doc = Doc(en_vocab, words=text.split(" "))
     matcher.add("Rule", None, pattern)
     assert len(matcher) == 1
     matches = matcher(doc)

@@ -49,8 +49,10 @@ def test_cli_converters_iob2json():
         sent = converted[0]["paragraphs"][0]["sentences"][i]
         assert len(sent["tokens"]) == 8
         tokens = sent["tokens"]
+        # fmt: off
         assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
         assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]
+        # fmt: on


 def test_cli_converters_conll_ner2json():

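Note: the `# fmt: off` / `# fmt: on` comments added above mark a region the formatter (Black honours these markers) leaves untouched, so the long token and NER lists stay on single lines for readability. A minimal, self-contained sketch of the same pattern:

    tokens = [
        {"orth": "I", "ner": "O"},
        {"orth": "like", "ner": "O"},
        {"orth": "London", "ner": "U-GPE"},
    ]
    # The formatter would normally reflow these long comparisons; the markers keep them as written.
    # fmt: off
    assert [t["orth"] for t in tokens] == ["I", "like", "London"]
    assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE"]
    # fmt: on
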
@@ -113,8 +115,10 @@ def test_cli_converters_conll_ner2json():
         sent = converted[0]["paragraphs"][0]["sentences"][i]
         assert len(sent["tokens"]) == 8
         tokens = sent["tokens"]
+        # fmt: off
         assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
         assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]
+        # fmt: on


 def test_pretrain_make_docs():