Mirror of https://github.com/explosion/spaCy.git

Commit cd90752193 (parent bcd1b12f43): Tidy up and auto-format [ci skip]

@@ -88,12 +88,21 @@ def convert(
             msg.info("Auto-detected sentence-per-line NER format")
             converter = converter_autodetect
         else:
-            msg.warn("Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert")
+            msg.warn(
+                "Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert"
+            )
     if converter not in CONVERTERS:
         msg.fail("Can't find converter for {}".format(converter), exits=1)
     # Use converter function to convert data
     func = CONVERTERS[converter]
-    data = func(input_data, n_sents=n_sents, seg_sents=seg_sents, use_morphology=morphology, lang=lang, model=model)
+    data = func(
+        input_data,
+        n_sents=n_sents,
+        seg_sents=seg_sents,
+        use_morphology=morphology,
+        lang=lang,
+        model=model,
+    )
     if output_dir != "-":
         # Export data to a file
         suffix = ".{}".format(file_type)
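
Note: the context in this hunk shows `convert` resolving the converter by name in the `CONVERTERS` dict and calling it with keyword options. A minimal, self-contained sketch of that dict-dispatch pattern follows; the registry contents and the `run_converter` helper are hypothetical, not spaCy's actual code.

    # Hypothetical converter registry: name -> callable taking the raw text plus options.
    CONVERTERS = {"conll_ner": lambda text, **opts: {"format": "ner", "options": opts}}


    def run_converter(name, text, **opts):
        # Fail early when no converter is registered under the requested name.
        if name not in CONVERTERS:
            raise ValueError("Can't find converter for {}".format(name))
        # Look up the converter function and call it with keyword options.
        return CONVERTERS[name](text, **opts)


    print(run_converter("conll_ner", "raw input text", n_sents=10, seg_sents=False))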

@@ -104,7 +113,9 @@ def convert(
             srsly.write_jsonl(output_file, data)
         elif file_type == "msg":
             srsly.write_msgpack(output_file, data)
-        msg.good("Generated output file ({} documents): {}".format(len(data), output_file))
+        msg.good(
+            "Generated output file ({} documents): {}".format(len(data), output_file)
+        )
     else:
         # Print to stdout
         if file_type == "json":

@@ -38,32 +38,50 @@ def conll_ner2json(input_data, n_sents=10, seg_sents=False, model=None, **kwargs):
     doc_delimiter = "-DOCSTART- -X- O O"
     # check for existing delimiters, which should be preserved
     if "\n\n" in input_data and seg_sents:
-        msg.warn("Sentence boundaries found, automatic sentence segmentation with `-s` disabled.")
+        msg.warn(
+            "Sentence boundaries found, automatic sentence segmentation with "
+            "`-s` disabled."
+        )
         seg_sents = False
     if doc_delimiter in input_data and n_sents:
-        msg.warn("Document delimiters found, automatic document segmentation with `-n` disabled.")
+        msg.warn(
+            "Document delimiters found, automatic document segmentation with "
+            "`-n` disabled."
+        )
         n_sents = 0
     # do document segmentation with existing sentences
-    if "\n\n" in input_data and not doc_delimiter in input_data and n_sents:
+    if "\n\n" in input_data and doc_delimiter not in input_data and n_sents:
         n_sents_info(msg, n_sents)
         input_data = segment_docs(input_data, n_sents, doc_delimiter)
     # do sentence segmentation with existing documents
-    if not "\n\n" in input_data and doc_delimiter in input_data and seg_sents:
+    if "\n\n" not in input_data and doc_delimiter in input_data and seg_sents:
         input_data = segment_sents_and_docs(input_data, 0, "", model=model, msg=msg)
     # do both sentence segmentation and document segmentation according
     # to options
-    if not "\n\n" in input_data and not doc_delimiter in input_data:
+    if "\n\n" not in input_data and doc_delimiter not in input_data:
         # sentence segmentation required for document segmentation
         if n_sents > 0 and not seg_sents:
-            msg.warn("No sentence boundaries found to use with option `-n {}`. Use `-s` to automatically segment sentences or `-n 0` to disable.".format(n_sents))
+            msg.warn(
+                "No sentence boundaries found to use with option `-n {}`. "
+                "Use `-s` to automatically segment sentences or `-n 0` "
+                "to disable.".format(n_sents)
+            )
         else:
             n_sents_info(msg, n_sents)
-            input_data = segment_sents_and_docs(input_data, n_sents, doc_delimiter, model=model, msg=msg)
+            input_data = segment_sents_and_docs(
+                input_data, n_sents, doc_delimiter, model=model, msg=msg
+            )
     # provide warnings for problematic data
-    if not "\n\n" in input_data:
-        msg.warn("No sentence boundaries found. Use `-s` to automatically segment sentences.")
-    if not doc_delimiter in input_data:
-        msg.warn("No document delimiters found. Use `-n` to automatically group sentences into documents.")
+    if "\n\n" not in input_data:
+        msg.warn(
+            "No sentence boundaries found. Use `-s` to automatically segment "
+            "sentences."
+        )
+    if doc_delimiter not in input_data:
+        msg.warn(
+            "No document delimiters found. Use `-n` to automatically group "
+            "sentences into documents."
+        )
     output_docs = []
     for doc in input_data.strip().split(doc_delimiter):
         doc = doc.strip()
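
Note: several conditions in this hunk are rewritten from `not x in y` to `x not in y` (the form linters flag as E713). The two spellings evaluate the same membership test; a quick sketch with made-up data:

    # Both spellings perform the same membership test; the second reads as a
    # single "not in" operator and satisfies the linter.
    data = "sentence one\n\nsentence two"
    doc_delimiter = "-DOCSTART- -X- O O"
    assert (not "\n\n" in data) == ("\n\n" not in data)
    assert (not doc_delimiter in data) == (doc_delimiter not in data)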

@@ -78,8 +96,10 @@ def conll_ner2json(input_data, n_sents=10, seg_sents=False, model=None, **kwargs):
             cols = list(zip(*[line.split() for line in lines]))
             if len(cols) < 2:
                 raise ValueError(
-                    "The token-per-line NER file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert"
+                    "The token-per-line NER file is not formatted correctly. "
+                    "Try checking whitespace and delimiters. See "
+                    "https://spacy.io/api/cli#convert"
                 )
             words = cols[0]
             iob_ents = cols[-1]
             if len(cols) > 2:

@@ -110,7 +130,10 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
             msg.info("Segmenting sentences with parser from model '{}'.".format(model))
             sentencizer = nlp.get_pipe("parser")
     if not sentencizer:
-        msg.info("Segmenting sentences with sentencizer. (Use `-b model` for improved parser-based sentence segmentation.)")
+        msg.info(
+            "Segmenting sentences with sentencizer. (Use `-b model` for "
+            "improved parser-based sentence segmentation.)"
+        )
         nlp = MultiLanguage()
         sentencizer = nlp.create_pipe("sentencizer")
     lines = doc.strip().split("\n")
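
Note: the fallback branch above builds a language-neutral pipeline with a rule-based sentencizer when no parser model is available. A short usage sketch against the spaCy v2 API, with made-up sample text:

    from spacy.lang.xx import MultiLanguage

    # Language-neutral pipeline with a rule-based sentencizer (spaCy v2 API).
    nlp = MultiLanguage()
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)

    doc = nlp("First sentence. Second sentence.")
    print([sent.text for sent in doc.sents])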

@@ -132,7 +155,7 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
 def segment_docs(input_data, n_sents, doc_delimiter):
     sent_delimiter = "\n\n"
     sents = input_data.split(sent_delimiter)
-    docs = [sents[i:i+n_sents] for i in range(0, len(sents), n_sents)]
+    docs = [sents[i : i + n_sents] for i in range(0, len(sents), n_sents)]
     input_data = ""
     for doc in docs:
         input_data += sent_delimiter + doc_delimiter
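
Note: apart from the slice spacing, the list comprehension in `segment_docs` is the standard idiom for chunking a list into groups of `n_sents`. For example, with illustrative data:

    # Group a list of sentences into documents of at most n_sents items each.
    sents = ["s1", "s2", "s3", "s4", "s5"]
    n_sents = 2
    docs = [sents[i : i + n_sents] for i in range(0, len(sents), n_sents)]
    assert docs == [["s1", "s2"], ["s3", "s4"], ["s5"]]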

@@ -143,4 +166,7 @@ def segment_docs(input_data, n_sents, doc_delimiter):
 def n_sents_info(msg, n_sents):
     msg.info("Grouping every {} sentences into a document.".format(n_sents))
     if n_sents == 1:
-        msg.warn("To generate better training data, you may want to group sentences into documents with `-n 10`.")
+        msg.warn(
+            "To generate better training data, you may want to group "
+            "sentences into documents with `-n 10`."
+        )

@@ -34,7 +34,7 @@ def read_iob(raw_sents):
     for line in raw_sents:
         if not line.strip():
             continue
-        tokens = [t.split('|') for t in line.split()]
+        tokens = [t.split("|") for t in line.split()]
         if len(tokens[0]) == 3:
             words, pos, iob = zip(*tokens)
         elif len(tokens[0]) == 2:
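
Note: the quote-style change aside, the surrounding `read_iob` logic splits each whitespace-separated token on `|` and unzips the columns into words, tags, and IOB labels. A sketch with a hypothetical input line:

    # Hypothetical IOB line in word|pos|iob format, parsed the same way as above.
    line = "I|PRP|O like|VBP|O London|NNP|U-GPE .|.|O"
    tokens = [t.split("|") for t in line.split()]
    words, pos, iob = zip(*tokens)
    assert words == ("I", "like", "London", ".")
    assert iob == ("O", "O", "U-GPE", "O")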

@@ -38,8 +38,8 @@ from . import about
 class BaseDefaults(object):
     @classmethod
     def create_lemmatizer(cls, nlp=None, lookups=None):
-        lemma_rules, lemma_index, lemma_exc, lemma_lookup = util.get_lemma_tables(lookups)
-        return Lemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup)
+        rules, index, exc, lookup = util.get_lemma_tables(lookups)
+        return Lemmatizer(index, exc, rules, lookup)

     @classmethod
     def create_lookups(cls, nlp=None):

@@ -89,10 +89,7 @@ TOKEN_PATTERN_SCHEMA = {
                 "title": "Fine-grained part-of-speech tag",
                 "$ref": "#/definitions/string_value",
             },
-            "DEP": {
-                "title": "Dependency label",
-                "$ref": "#/definitions/string_value"
-            },
+            "DEP": {"title": "Dependency label", "$ref": "#/definitions/string_value"},
             "LEMMA": {
                 "title": "Lemma (base form)",
                 "$ref": "#/definitions/string_value",

@@ -6,8 +6,13 @@ import pytest

 @pytest.mark.parametrize(
     "text,norms,lemmas",
-    [("о.г.", ["ове године"], ["ова година"]), ("чет.", ["четвртак"], ["четвртак"]),
-     ("гђа", ["госпођа"], ["госпођа"]), ("ил'", ["или"], ["или"])])
+    [
+        ("о.г.", ["ове године"], ["ова година"]),
+        ("чет.", ["четвртак"], ["четвртак"]),
+        ("гђа", ["госпођа"], ["госпођа"]),
+        ("ил'", ["или"], ["или"]),
+    ],
+)
 def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms, lemmas):
     tokens = sr_tokenizer(text)
     assert len(tokens) == 1

@@ -394,7 +394,7 @@ def test_attr_pipeline_checks(en_vocab):
         ([{"IS_PUNCT": True}], "."),
         ([{"IS_SPACE": True}], "\n"),
         ([{"IS_BRACKET": True}], "["),
-        ([{"IS_QUOTE": True}], "\""),
+        ([{"IS_QUOTE": True}], '"'),
         ([{"IS_LEFT_PUNCT": True}], "``"),
         ([{"IS_RIGHT_PUNCT": True}], "''"),
         ([{"IS_STOP": True}], "the"),

@@ -405,7 +405,7 @@ def test_attr_pipeline_checks(en_vocab):
 )
 def test_matcher_schema_token_attributes(en_vocab, pattern, text):
     matcher = Matcher(en_vocab)
-    doc = Doc(en_vocab, words=text.split(' '))
+    doc = Doc(en_vocab, words=text.split(" "))
     matcher.add("Rule", None, pattern)
     assert len(matcher) == 1
     matches = matcher(doc)

@@ -49,8 +49,10 @@ def test_cli_converters_iob2json():
         sent = converted[0]["paragraphs"][0]["sentences"][i]
         assert len(sent["tokens"]) == 8
         tokens = sent["tokens"]
+        # fmt: off
         assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
         assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]
+        # fmt: on


 def test_cli_converters_conll_ner2json():
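
Note: the `# fmt: off` / `# fmt: on` comments added in these test hunks are formatter pragmas (recognized by black, for example); everything between them keeps its manual layout instead of being re-wrapped. A minimal sketch with illustrative variable names:

    # fmt: off
    expected_orth = ["I", "like", "London", "and", "New", "York", "City", "."]
    expected_ner  = ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]
    # fmt: on
    # Without the markers, the formatter would normalize the aligned "=" signs
    # and re-wrap any line that exceeds its length limit.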

@@ -113,8 +115,10 @@ def test_cli_converters_conll_ner2json():
         sent = converted[0]["paragraphs"][0]["sentences"][i]
         assert len(sent["tokens"]) == 8
         tokens = sent["tokens"]
+        # fmt: off
        assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
        assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]
+        # fmt: on


 def test_pretrain_make_docs():