Tidy up and auto-format [ci skip]

This commit is contained in:
Ines Montani 2019-08-31 13:39:06 +02:00
parent bcd1b12f43
commit cd90752193
8 changed files with 73 additions and 30 deletions

View File

@ -88,12 +88,21 @@ def convert(
msg.info("Auto-detected sentence-per-line NER format") msg.info("Auto-detected sentence-per-line NER format")
converter = converter_autodetect converter = converter_autodetect
else: else:
msg.warn("Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert") msg.warn(
"Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert"
)
if converter not in CONVERTERS: if converter not in CONVERTERS:
msg.fail("Can't find converter for {}".format(converter), exits=1) msg.fail("Can't find converter for {}".format(converter), exits=1)
# Use converter function to convert data # Use converter function to convert data
func = CONVERTERS[converter] func = CONVERTERS[converter]
data = func(input_data, n_sents=n_sents, seg_sents=seg_sents, use_morphology=morphology, lang=lang, model=model) data = func(
input_data,
n_sents=n_sents,
seg_sents=seg_sents,
use_morphology=morphology,
lang=lang,
model=model,
)
if output_dir != "-": if output_dir != "-":
# Export data to a file # Export data to a file
suffix = ".{}".format(file_type) suffix = ".{}".format(file_type)
@ -104,7 +113,9 @@ def convert(
srsly.write_jsonl(output_file, data) srsly.write_jsonl(output_file, data)
elif file_type == "msg": elif file_type == "msg":
srsly.write_msgpack(output_file, data) srsly.write_msgpack(output_file, data)
msg.good("Generated output file ({} documents): {}".format(len(data), output_file)) msg.good(
"Generated output file ({} documents): {}".format(len(data), output_file)
)
else: else:
# Print to stdout # Print to stdout
if file_type == "json": if file_type == "json":

View File

@ -38,32 +38,50 @@ def conll_ner2json(input_data, n_sents=10, seg_sents=False, model=None, **kwargs
doc_delimiter = "-DOCSTART- -X- O O" doc_delimiter = "-DOCSTART- -X- O O"
# check for existing delimiters, which should be preserved # check for existing delimiters, which should be preserved
if "\n\n" in input_data and seg_sents: if "\n\n" in input_data and seg_sents:
msg.warn("Sentence boundaries found, automatic sentence segmentation with `-s` disabled.") msg.warn(
"Sentence boundaries found, automatic sentence segmentation with "
"`-s` disabled."
)
seg_sents = False seg_sents = False
if doc_delimiter in input_data and n_sents: if doc_delimiter in input_data and n_sents:
msg.warn("Document delimiters found, automatic document segmentation with `-n` disabled.") msg.warn(
"Document delimiters found, automatic document segmentation with "
"`-n` disabled."
)
n_sents = 0 n_sents = 0
# do document segmentation with existing sentences # do document segmentation with existing sentences
if "\n\n" in input_data and not doc_delimiter in input_data and n_sents: if "\n\n" in input_data and doc_delimiter not in input_data and n_sents:
n_sents_info(msg, n_sents) n_sents_info(msg, n_sents)
input_data = segment_docs(input_data, n_sents, doc_delimiter) input_data = segment_docs(input_data, n_sents, doc_delimiter)
# do sentence segmentation with existing documents # do sentence segmentation with existing documents
if not "\n\n" in input_data and doc_delimiter in input_data and seg_sents: if "\n\n" not in input_data and doc_delimiter in input_data and seg_sents:
input_data = segment_sents_and_docs(input_data, 0, "", model=model, msg=msg) input_data = segment_sents_and_docs(input_data, 0, "", model=model, msg=msg)
# do both sentence segmentation and document segmentation according # do both sentence segmentation and document segmentation according
# to options # to options
if not "\n\n" in input_data and not doc_delimiter in input_data: if "\n\n" not in input_data and doc_delimiter not in input_data:
# sentence segmentation required for document segmentation # sentence segmentation required for document segmentation
if n_sents > 0 and not seg_sents: if n_sents > 0 and not seg_sents:
msg.warn("No sentence boundaries found to use with option `-n {}`. Use `-s` to automatically segment sentences or `-n 0` to disable.".format(n_sents)) msg.warn(
"No sentence boundaries found to use with option `-n {}`. "
"Use `-s` to automatically segment sentences or `-n 0` "
"to disable.".format(n_sents)
)
else: else:
n_sents_info(msg, n_sents) n_sents_info(msg, n_sents)
input_data = segment_sents_and_docs(input_data, n_sents, doc_delimiter, model=model, msg=msg) input_data = segment_sents_and_docs(
input_data, n_sents, doc_delimiter, model=model, msg=msg
)
# provide warnings for problematic data # provide warnings for problematic data
if not "\n\n" in input_data: if "\n\n" not in input_data:
msg.warn("No sentence boundaries found. Use `-s` to automatically segment sentences.") msg.warn(
if not doc_delimiter in input_data: "No sentence boundaries found. Use `-s` to automatically segment "
msg.warn("No document delimiters found. Use `-n` to automatically group sentences into documents.") "sentences."
)
if doc_delimiter not in input_data:
msg.warn(
"No document delimiters found. Use `-n` to automatically group "
"sentences into documents."
)
output_docs = [] output_docs = []
for doc in input_data.strip().split(doc_delimiter): for doc in input_data.strip().split(doc_delimiter):
doc = doc.strip() doc = doc.strip()
@ -78,7 +96,9 @@ def conll_ner2json(input_data, n_sents=10, seg_sents=False, model=None, **kwargs
cols = list(zip(*[line.split() for line in lines])) cols = list(zip(*[line.split() for line in lines]))
if len(cols) < 2: if len(cols) < 2:
raise ValueError( raise ValueError(
"The token-per-line NER file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert" "The token-per-line NER file is not formatted correctly. "
"Try checking whitespace and delimiters. See "
"https://spacy.io/api/cli#convert"
) )
words = cols[0] words = cols[0]
iob_ents = cols[-1] iob_ents = cols[-1]
@ -110,7 +130,10 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
msg.info("Segmenting sentences with parser from model '{}'.".format(model)) msg.info("Segmenting sentences with parser from model '{}'.".format(model))
sentencizer = nlp.get_pipe("parser") sentencizer = nlp.get_pipe("parser")
if not sentencizer: if not sentencizer:
msg.info("Segmenting sentences with sentencizer. (Use `-b model` for improved parser-based sentence segmentation.)") msg.info(
"Segmenting sentences with sentencizer. (Use `-b model` for "
"improved parser-based sentence segmentation.)"
)
nlp = MultiLanguage() nlp = MultiLanguage()
sentencizer = nlp.create_pipe("sentencizer") sentencizer = nlp.create_pipe("sentencizer")
lines = doc.strip().split("\n") lines = doc.strip().split("\n")
@ -132,7 +155,7 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
def segment_docs(input_data, n_sents, doc_delimiter): def segment_docs(input_data, n_sents, doc_delimiter):
sent_delimiter = "\n\n" sent_delimiter = "\n\n"
sents = input_data.split(sent_delimiter) sents = input_data.split(sent_delimiter)
docs = [sents[i:i+n_sents] for i in range(0, len(sents), n_sents)] docs = [sents[i : i + n_sents] for i in range(0, len(sents), n_sents)]
input_data = "" input_data = ""
for doc in docs: for doc in docs:
input_data += sent_delimiter + doc_delimiter input_data += sent_delimiter + doc_delimiter
@ -143,4 +166,7 @@ def segment_docs(input_data, n_sents, doc_delimiter):
def n_sents_info(msg, n_sents): def n_sents_info(msg, n_sents):
msg.info("Grouping every {} sentences into a document.".format(n_sents)) msg.info("Grouping every {} sentences into a document.".format(n_sents))
if n_sents == 1: if n_sents == 1:
msg.warn("To generate better training data, you may want to group sentences into documents with `-n 10`.") msg.warn(
"To generate better training data, you may want to group "
"sentences into documents with `-n 10`."
)

View File

@ -34,7 +34,7 @@ def read_iob(raw_sents):
for line in raw_sents: for line in raw_sents:
if not line.strip(): if not line.strip():
continue continue
tokens = [t.split('|') for t in line.split()] tokens = [t.split("|") for t in line.split()]
if len(tokens[0]) == 3: if len(tokens[0]) == 3:
words, pos, iob = zip(*tokens) words, pos, iob = zip(*tokens)
elif len(tokens[0]) == 2: elif len(tokens[0]) == 2:

View File

@ -38,8 +38,8 @@ from . import about
class BaseDefaults(object): class BaseDefaults(object):
@classmethod @classmethod
def create_lemmatizer(cls, nlp=None, lookups=None): def create_lemmatizer(cls, nlp=None, lookups=None):
lemma_rules, lemma_index, lemma_exc, lemma_lookup = util.get_lemma_tables(lookups) rules, index, exc, lookup = util.get_lemma_tables(lookups)
return Lemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup) return Lemmatizer(index, exc, rules, lookup)
@classmethod @classmethod
def create_lookups(cls, nlp=None): def create_lookups(cls, nlp=None):

View File

@ -89,10 +89,7 @@ TOKEN_PATTERN_SCHEMA = {
"title": "Fine-grained part-of-speech tag", "title": "Fine-grained part-of-speech tag",
"$ref": "#/definitions/string_value", "$ref": "#/definitions/string_value",
}, },
"DEP": { "DEP": {"title": "Dependency label", "$ref": "#/definitions/string_value"},
"title": "Dependency label",
"$ref": "#/definitions/string_value"
},
"LEMMA": { "LEMMA": {
"title": "Lemma (base form)", "title": "Lemma (base form)",
"$ref": "#/definitions/string_value", "$ref": "#/definitions/string_value",

View File

@ -6,8 +6,13 @@ import pytest
@pytest.mark.parametrize( @pytest.mark.parametrize(
"text,norms,lemmas", "text,norms,lemmas",
[("о.г.", ["ове године"], ["ова година"]), ("чет.", ["четвртак"], ["четвртак"]), [
("гђа", ["госпођа"], ["госпођа"]), ("ил'", ["или"], ["или"])]) ("о.г.", ["ове године"], ["ова година"]),
("чет.", ["четвртак"], ["четвртак"]),
("гђа", ["госпођа"], ["госпођа"]),
("ил'", ["или"], ["или"]),
],
)
def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms, lemmas): def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms, lemmas):
tokens = sr_tokenizer(text) tokens = sr_tokenizer(text)
assert len(tokens) == 1 assert len(tokens) == 1

View File

@ -394,7 +394,7 @@ def test_attr_pipeline_checks(en_vocab):
([{"IS_PUNCT": True}], "."), ([{"IS_PUNCT": True}], "."),
([{"IS_SPACE": True}], "\n"), ([{"IS_SPACE": True}], "\n"),
([{"IS_BRACKET": True}], "["), ([{"IS_BRACKET": True}], "["),
([{"IS_QUOTE": True}], "\""), ([{"IS_QUOTE": True}], '"'),
([{"IS_LEFT_PUNCT": True}], "``"), ([{"IS_LEFT_PUNCT": True}], "``"),
([{"IS_RIGHT_PUNCT": True}], "''"), ([{"IS_RIGHT_PUNCT": True}], "''"),
([{"IS_STOP": True}], "the"), ([{"IS_STOP": True}], "the"),
@ -405,7 +405,7 @@ def test_attr_pipeline_checks(en_vocab):
) )
def test_matcher_schema_token_attributes(en_vocab, pattern, text): def test_matcher_schema_token_attributes(en_vocab, pattern, text):
matcher = Matcher(en_vocab) matcher = Matcher(en_vocab)
doc = Doc(en_vocab, words=text.split(' ')) doc = Doc(en_vocab, words=text.split(" "))
matcher.add("Rule", None, pattern) matcher.add("Rule", None, pattern)
assert len(matcher) == 1 assert len(matcher) == 1
matches = matcher(doc) matches = matcher(doc)

View File

@ -49,8 +49,10 @@ def test_cli_converters_iob2json():
sent = converted[0]["paragraphs"][0]["sentences"][i] sent = converted[0]["paragraphs"][0]["sentences"][i]
assert len(sent["tokens"]) == 8 assert len(sent["tokens"]) == 8
tokens = sent["tokens"] tokens = sent["tokens"]
# fmt: off
assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."] assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"] assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]
# fmt: on
def test_cli_converters_conll_ner2json(): def test_cli_converters_conll_ner2json():
@ -113,8 +115,10 @@ def test_cli_converters_conll_ner2json():
sent = converted[0]["paragraphs"][0]["sentences"][i] sent = converted[0]["paragraphs"][0]["sentences"][i]
assert len(sent["tokens"]) == 8 assert len(sent["tokens"]) == 8
tokens = sent["tokens"] tokens = sent["tokens"]
# fmt: off
assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."] assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"] assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]
# fmt: on
def test_pretrain_make_docs(): def test_pretrain_make_docs():