mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
Auto-format
This commit is contained in:
parent
4498dfe99d
commit
37c3bb35e2
|
@ -118,7 +118,9 @@ def debug_data(
|
||||||
|
|
||||||
# Create all gold data here to avoid iterating over the train_dataset constantly
|
# Create all gold data here to avoid iterating over the train_dataset constantly
|
||||||
gold_train_data = _compile_gold(train_dataset, pipeline, nlp, make_proj=True)
|
gold_train_data = _compile_gold(train_dataset, pipeline, nlp, make_proj=True)
|
||||||
gold_train_unpreprocessed_data = _compile_gold(train_dataset, pipeline, nlp, make_proj=False)
|
gold_train_unpreprocessed_data = _compile_gold(
|
||||||
|
train_dataset, pipeline, nlp, make_proj=False
|
||||||
|
)
|
||||||
gold_dev_data = _compile_gold(dev_dataset, pipeline, nlp, make_proj=True)
|
gold_dev_data = _compile_gold(dev_dataset, pipeline, nlp, make_proj=True)
|
||||||
|
|
||||||
train_texts = gold_train_data["texts"]
|
train_texts = gold_train_data["texts"]
|
||||||
|
|
|
@ -229,7 +229,9 @@ def add_vectors(
|
||||||
else:
|
else:
|
||||||
if vectors_loc:
|
if vectors_loc:
|
||||||
with msg.loading(f"Reading vectors from {vectors_loc}"):
|
with msg.loading(f"Reading vectors from {vectors_loc}"):
|
||||||
vectors_data, vector_keys = read_vectors(msg, vectors_loc, truncate_vectors)
|
vectors_data, vector_keys = read_vectors(
|
||||||
|
msg, vectors_loc, truncate_vectors
|
||||||
|
)
|
||||||
msg.good(f"Loaded vectors from {vectors_loc}")
|
msg.good(f"Loaded vectors from {vectors_loc}")
|
||||||
else:
|
else:
|
||||||
vectors_data, vector_keys = (None, None)
|
vectors_data, vector_keys = (None, None)
|
||||||
|
|
|
@ -406,5 +406,5 @@ def verify_cli_args(
|
||||||
if not config["nlp"]["vectors"]:
|
if not config["nlp"]["vectors"]:
|
||||||
msg.fail(
|
msg.fail(
|
||||||
"Must specify nlp.vectors if pretraining.objective.type is vectors",
|
"Must specify nlp.vectors if pretraining.objective.type is vectors",
|
||||||
exits=True
|
exits=True,
|
||||||
)
|
)
|
||||||
|
|
|
@ -202,11 +202,11 @@ def train(
|
||||||
nlp.resume_training()
|
nlp.resume_training()
|
||||||
else:
|
else:
|
||||||
msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
|
msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
|
||||||
train_examples = list(corpus.train_dataset(
|
train_examples = list(
|
||||||
nlp,
|
corpus.train_dataset(
|
||||||
shuffle=False,
|
nlp, shuffle=False, gold_preproc=training["gold_preproc"]
|
||||||
gold_preproc=training["gold_preproc"]
|
)
|
||||||
))
|
)
|
||||||
nlp.begin_training(lambda: train_examples)
|
nlp.begin_training(lambda: train_examples)
|
||||||
|
|
||||||
# Update tag map with provided mapping
|
# Update tag map with provided mapping
|
||||||
|
@ -293,12 +293,14 @@ def train(
|
||||||
|
|
||||||
def create_train_batches(nlp, corpus, cfg):
|
def create_train_batches(nlp, corpus, cfg):
|
||||||
max_epochs = cfg.get("max_epochs", 0)
|
max_epochs = cfg.get("max_epochs", 0)
|
||||||
train_examples = list(corpus.train_dataset(
|
train_examples = list(
|
||||||
nlp,
|
corpus.train_dataset(
|
||||||
shuffle=True,
|
nlp,
|
||||||
gold_preproc=cfg["gold_preproc"],
|
shuffle=True,
|
||||||
max_length=cfg["max_length"]
|
gold_preproc=cfg["gold_preproc"],
|
||||||
))
|
max_length=cfg["max_length"],
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
epoch = 0
|
epoch = 0
|
||||||
while True:
|
while True:
|
||||||
|
@ -520,7 +522,10 @@ def setup_printer(training, nlp):
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
data = (
|
data = (
|
||||||
[info["epoch"], info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))]
|
[info["epoch"], info["step"]]
|
||||||
|
+ losses
|
||||||
|
+ scores
|
||||||
|
+ ["{0:.2f}".format(float(info["score"]))]
|
||||||
)
|
)
|
||||||
msg.row(data, widths=table_widths, aligns=table_aligns)
|
msg.row(data, widths=table_widths, aligns=table_aligns)
|
||||||
|
|
||||||
|
|
|
@ -59,6 +59,6 @@ def read_iob(raw_sents, vocab, n_sents):
|
||||||
doc[i].is_sent_start = sent_start
|
doc[i].is_sent_start = sent_start
|
||||||
biluo = iob_to_biluo(iob)
|
biluo = iob_to_biluo(iob)
|
||||||
entities = tags_to_entities(biluo)
|
entities = tags_to_entities(biluo)
|
||||||
doc.ents = [Span(doc, start=s, end=e+1, label=L) for (L, s, e) in entities]
|
doc.ents = [Span(doc, start=s, end=e + 1, label=L) for (L, s, e) in entities]
|
||||||
docs.append(doc)
|
docs.append(doc)
|
||||||
return docs
|
return docs
|
||||||
|
|
|
@ -92,7 +92,7 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
|
||||||
# Handle entity cases
|
# Handle entity cases
|
||||||
for start_char, end_char, label in entities:
|
for start_char, end_char, label in entities:
|
||||||
if not label:
|
if not label:
|
||||||
for s in starts: # account for many-to-one
|
for s in starts: # account for many-to-one
|
||||||
if s >= start_char and s < end_char:
|
if s >= start_char and s < end_char:
|
||||||
biluo[starts[s]] = "O"
|
biluo[starts[s]] = "O"
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -17,11 +17,7 @@ def build_tb_parser_model(
|
||||||
nO=None,
|
nO=None,
|
||||||
):
|
):
|
||||||
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
|
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
|
||||||
tok2vec = chain(
|
tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width),)
|
||||||
tok2vec,
|
|
||||||
list2array(),
|
|
||||||
Linear(hidden_width, t2v_width),
|
|
||||||
)
|
|
||||||
tok2vec.set_dim("nO", hidden_width)
|
tok2vec.set_dim("nO", hidden_width)
|
||||||
|
|
||||||
lower = PrecomputableAffine(
|
lower = PrecomputableAffine(
|
||||||
|
|
|
@ -179,22 +179,9 @@ def test_doc_api_right_edge(en_tokenizer):
|
||||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
||||||
assert doc[6].text == "for"
|
assert doc[6].text == "for"
|
||||||
subtree = [w.text for w in doc[6].subtree]
|
subtree = [w.text for w in doc[6].subtree]
|
||||||
assert subtree == [
|
# fmt: off
|
||||||
"for",
|
assert subtree == ["for", "the", "sake", "of", "such", "as", "live", "under", "the", "government", "of", "the", "Romans", ","]
|
||||||
"the",
|
# fmt: on
|
||||||
"sake",
|
|
||||||
"of",
|
|
||||||
"such",
|
|
||||||
"as",
|
|
||||||
"live",
|
|
||||||
"under",
|
|
||||||
"the",
|
|
||||||
"government",
|
|
||||||
"of",
|
|
||||||
"the",
|
|
||||||
"Romans",
|
|
||||||
",",
|
|
||||||
]
|
|
||||||
assert doc[6].right_edge.text == ","
|
assert doc[6].right_edge.text == ","
|
||||||
|
|
||||||
|
|
||||||
|
@ -307,9 +294,14 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
|
||||||
en_texts = ["Merging the docs is fun.", "They don't think alike."]
|
en_texts = ["Merging the docs is fun.", "They don't think alike."]
|
||||||
de_text = "Wie war die Frage?"
|
de_text = "Wie war die Frage?"
|
||||||
en_docs = [en_tokenizer(text) for text in en_texts]
|
en_docs = [en_tokenizer(text) for text in en_texts]
|
||||||
docs_idx = en_texts[0].index('docs')
|
docs_idx = en_texts[0].index("docs")
|
||||||
de_doc = de_tokenizer(de_text)
|
de_doc = de_tokenizer(de_text)
|
||||||
en_docs[0].user_data[("._.", "is_ambiguous", docs_idx, None)] = (True, None, None, None)
|
en_docs[0].user_data[("._.", "is_ambiguous", docs_idx, None)] = (
|
||||||
|
True,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
|
||||||
assert Doc.from_docs([]) is None
|
assert Doc.from_docs([]) is None
|
||||||
|
|
||||||
|
@ -323,15 +315,16 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
|
||||||
assert len(en_docs) == len(list(m_doc.sents))
|
assert len(en_docs) == len(list(m_doc.sents))
|
||||||
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
|
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
|
||||||
assert str(m_doc) == " ".join(en_texts)
|
assert str(m_doc) == " ".join(en_texts)
|
||||||
p_token = m_doc[len(en_docs[0])-1]
|
p_token = m_doc[len(en_docs[0]) - 1]
|
||||||
assert p_token.text == "." and bool(p_token.whitespace_)
|
assert p_token.text == "." and bool(p_token.whitespace_)
|
||||||
en_docs_tokens = [t for doc in en_docs for t in doc]
|
en_docs_tokens = [t for doc in en_docs for t in doc]
|
||||||
assert len(m_doc) == len(en_docs_tokens)
|
assert len(m_doc) == len(en_docs_tokens)
|
||||||
think_idx = len(en_texts[0]) + 1 + en_texts[1].index('think')
|
think_idx = len(en_texts[0]) + 1 + en_texts[1].index("think")
|
||||||
assert m_doc[9].idx == think_idx
|
assert m_doc[9].idx == think_idx
|
||||||
with pytest.raises(AttributeError):
|
with pytest.raises(AttributeError):
|
||||||
not_available = m_doc[2]._.is_ambiguous # not callable, because it was not set via set_extension
|
# not callable, because it was not set via set_extension
|
||||||
assert len(m_doc.user_data) == len(en_docs[0].user_data) # but it's there
|
m_doc[2]._.is_ambiguous
|
||||||
|
assert len(m_doc.user_data) == len(en_docs[0].user_data) # but it's there
|
||||||
|
|
||||||
m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
|
m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
|
||||||
assert len(en_docs) == len(list(m_doc.sents))
|
assert len(en_docs) == len(list(m_doc.sents))
|
||||||
|
@ -341,19 +334,21 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
|
||||||
assert p_token.text == "." and not bool(p_token.whitespace_)
|
assert p_token.text == "." and not bool(p_token.whitespace_)
|
||||||
en_docs_tokens = [t for doc in en_docs for t in doc]
|
en_docs_tokens = [t for doc in en_docs for t in doc]
|
||||||
assert len(m_doc) == len(en_docs_tokens)
|
assert len(m_doc) == len(en_docs_tokens)
|
||||||
think_idx = len(en_texts[0]) + 0 + en_texts[1].index('think')
|
think_idx = len(en_texts[0]) + 0 + en_texts[1].index("think")
|
||||||
assert m_doc[9].idx == think_idx
|
assert m_doc[9].idx == think_idx
|
||||||
|
|
||||||
m_doc = Doc.from_docs(en_docs, attrs=['lemma', 'length', 'pos'])
|
m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
|
||||||
with pytest.raises(ValueError): # important attributes from sentenziser or parser are missing
|
with pytest.raises(ValueError):
|
||||||
|
# important attributes from sentenziser or parser are missing
|
||||||
assert list(m_doc.sents)
|
assert list(m_doc.sents)
|
||||||
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
|
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
|
||||||
assert str(m_doc) == " ".join(en_texts) # space delimiter considered, although spacy attribute was missing
|
# space delimiter considered, although spacy attribute was missing
|
||||||
|
assert str(m_doc) == " ".join(en_texts)
|
||||||
p_token = m_doc[len(en_docs[0]) - 1]
|
p_token = m_doc[len(en_docs[0]) - 1]
|
||||||
assert p_token.text == "." and bool(p_token.whitespace_)
|
assert p_token.text == "." and bool(p_token.whitespace_)
|
||||||
en_docs_tokens = [t for doc in en_docs for t in doc]
|
en_docs_tokens = [t for doc in en_docs for t in doc]
|
||||||
assert len(m_doc) == len(en_docs_tokens)
|
assert len(m_doc) == len(en_docs_tokens)
|
||||||
think_idx = len(en_texts[0]) + 1 + en_texts[1].index('think')
|
think_idx = len(en_texts[0]) + 1 + en_texts[1].index("think")
|
||||||
assert m_doc[9].idx == think_idx
|
assert m_doc[9].idx == think_idx
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -118,6 +118,7 @@ def test_oracle_moves_missing_B(en_vocab):
|
||||||
moves.add_action(move_types.index("U"), label)
|
moves.add_action(move_types.index("U"), label)
|
||||||
moves.get_oracle_sequence(example)
|
moves.get_oracle_sequence(example)
|
||||||
|
|
||||||
|
|
||||||
# We can't easily represent this on a Doc object. Not sure what the best solution
|
# We can't easily represent this on a Doc object. Not sure what the best solution
|
||||||
# would be, but I don't think it's an important use case?
|
# would be, but I don't think it's an important use case?
|
||||||
@pytest.mark.xfail(reason="No longer supported")
|
@pytest.mark.xfail(reason="No longer supported")
|
||||||
|
|
|
@ -91,6 +91,7 @@ def test_parser_merge_pp(en_tokenizer):
|
||||||
assert doc[2].text == "another phrase"
|
assert doc[2].text == "another phrase"
|
||||||
assert doc[3].text == "occurs"
|
assert doc[3].text == "occurs"
|
||||||
|
|
||||||
|
|
||||||
# We removed the step_through API a while ago. we should bring it back though
|
# We removed the step_through API a while ago. we should bring it back though
|
||||||
@pytest.mark.xfail(reason="Unsupported")
|
@pytest.mark.xfail(reason="Unsupported")
|
||||||
def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser):
|
def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser):
|
||||||
|
|
|
@ -8,10 +8,11 @@ from ...tokens import DocBin
|
||||||
|
|
||||||
def test_issue4402():
|
def test_issue4402():
|
||||||
nlp = English()
|
nlp = English()
|
||||||
|
attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
|
||||||
with make_tempdir() as tmpdir:
|
with make_tempdir() as tmpdir:
|
||||||
output_file = tmpdir / "test4402.spacy"
|
output_file = tmpdir / "test4402.spacy"
|
||||||
docs = json2docs([json_data])
|
docs = json2docs([json_data])
|
||||||
data = DocBin(docs=docs, attrs =["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]).to_bytes()
|
data = DocBin(docs=docs, attrs=attrs).to_bytes()
|
||||||
with output_file.open("wb") as file_:
|
with output_file.open("wb") as file_:
|
||||||
file_.write(data)
|
file_.write(data)
|
||||||
corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
|
corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
|
||||||
|
@ -25,74 +26,73 @@ def test_issue4402():
|
||||||
assert len(split_train_data) == 4
|
assert len(split_train_data) == 4
|
||||||
|
|
||||||
|
|
||||||
json_data =\
|
json_data = {
|
||||||
{
|
"id": 0,
|
||||||
"id": 0,
|
"paragraphs": [
|
||||||
"paragraphs": [
|
{
|
||||||
{
|
"raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
|
||||||
"raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
|
"sentences": [
|
||||||
"sentences": [
|
{
|
||||||
{
|
"tokens": [
|
||||||
"tokens": [
|
{"id": 0, "orth": "How", "ner": "O"},
|
||||||
{"id": 0, "orth": "How", "ner": "O"},
|
{"id": 1, "orth": "should", "ner": "O"},
|
||||||
{"id": 1, "orth": "should", "ner": "O"},
|
{"id": 2, "orth": "I", "ner": "O"},
|
||||||
{"id": 2, "orth": "I", "ner": "O"},
|
{"id": 3, "orth": "cook", "ner": "O"},
|
||||||
{"id": 3, "orth": "cook", "ner": "O"},
|
{"id": 4, "orth": "bacon", "ner": "O"},
|
||||||
{"id": 4, "orth": "bacon", "ner": "O"},
|
{"id": 5, "orth": "in", "ner": "O"},
|
||||||
{"id": 5, "orth": "in", "ner": "O"},
|
{"id": 6, "orth": "an", "ner": "O"},
|
||||||
{"id": 6, "orth": "an", "ner": "O"},
|
{"id": 7, "orth": "oven", "ner": "O"},
|
||||||
{"id": 7, "orth": "oven", "ner": "O"},
|
{"id": 8, "orth": "?", "ner": "O"},
|
||||||
{"id": 8, "orth": "?", "ner": "O"},
|
],
|
||||||
],
|
"brackets": [],
|
||||||
"brackets": [],
|
},
|
||||||
},
|
{
|
||||||
{
|
"tokens": [
|
||||||
"tokens": [
|
{"id": 9, "orth": "\n", "ner": "O"},
|
||||||
{"id": 9, "orth": "\n", "ner": "O"},
|
{"id": 10, "orth": "I", "ner": "O"},
|
||||||
{"id": 10, "orth": "I", "ner": "O"},
|
{"id": 11, "orth": "'ve", "ner": "O"},
|
||||||
{"id": 11, "orth": "'ve", "ner": "O"},
|
{"id": 12, "orth": "heard", "ner": "O"},
|
||||||
{"id": 12, "orth": "heard", "ner": "O"},
|
{"id": 13, "orth": "of", "ner": "O"},
|
||||||
{"id": 13, "orth": "of", "ner": "O"},
|
{"id": 14, "orth": "people", "ner": "O"},
|
||||||
{"id": 14, "orth": "people", "ner": "O"},
|
{"id": 15, "orth": "cooking", "ner": "O"},
|
||||||
{"id": 15, "orth": "cooking", "ner": "O"},
|
{"id": 16, "orth": "bacon", "ner": "O"},
|
||||||
{"id": 16, "orth": "bacon", "ner": "O"},
|
{"id": 17, "orth": "in", "ner": "O"},
|
||||||
{"id": 17, "orth": "in", "ner": "O"},
|
{"id": 18, "orth": "an", "ner": "O"},
|
||||||
{"id": 18, "orth": "an", "ner": "O"},
|
{"id": 19, "orth": "oven", "ner": "O"},
|
||||||
{"id": 19, "orth": "oven", "ner": "O"},
|
{"id": 20, "orth": ".", "ner": "O"},
|
||||||
{"id": 20, "orth": ".", "ner": "O"},
|
],
|
||||||
],
|
"brackets": [],
|
||||||
"brackets": [],
|
},
|
||||||
},
|
],
|
||||||
],
|
"cats": [
|
||||||
"cats": [
|
{"label": "baking", "value": 1.0},
|
||||||
{"label": "baking", "value": 1.0},
|
{"label": "not_baking", "value": 0.0},
|
||||||
{"label": "not_baking", "value": 0.0},
|
],
|
||||||
],
|
},
|
||||||
},
|
{
|
||||||
{
|
"raw": "What is the difference between white and brown eggs?\n",
|
||||||
"raw": "What is the difference between white and brown eggs?\n",
|
"sentences": [
|
||||||
"sentences": [
|
{
|
||||||
{
|
"tokens": [
|
||||||
"tokens": [
|
{"id": 0, "orth": "What", "ner": "O"},
|
||||||
{"id": 0, "orth": "What", "ner": "O"},
|
{"id": 1, "orth": "is", "ner": "O"},
|
||||||
{"id": 1, "orth": "is", "ner": "O"},
|
{"id": 2, "orth": "the", "ner": "O"},
|
||||||
{"id": 2, "orth": "the", "ner": "O"},
|
{"id": 3, "orth": "difference", "ner": "O"},
|
||||||
{"id": 3, "orth": "difference", "ner": "O"},
|
{"id": 4, "orth": "between", "ner": "O"},
|
||||||
{"id": 4, "orth": "between", "ner": "O"},
|
{"id": 5, "orth": "white", "ner": "O"},
|
||||||
{"id": 5, "orth": "white", "ner": "O"},
|
{"id": 6, "orth": "and", "ner": "O"},
|
||||||
{"id": 6, "orth": "and", "ner": "O"},
|
{"id": 7, "orth": "brown", "ner": "O"},
|
||||||
{"id": 7, "orth": "brown", "ner": "O"},
|
{"id": 8, "orth": "eggs", "ner": "O"},
|
||||||
{"id": 8, "orth": "eggs", "ner": "O"},
|
{"id": 9, "orth": "?", "ner": "O"},
|
||||||
{"id": 9, "orth": "?", "ner": "O"},
|
],
|
||||||
],
|
"brackets": [],
|
||||||
"brackets": [],
|
},
|
||||||
},
|
{"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
|
||||||
{"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
|
],
|
||||||
],
|
"cats": [
|
||||||
"cats": [
|
{"label": "baking", "value": 0.0},
|
||||||
{"label": "baking", "value": 0.0},
|
{"label": "not_baking", "value": 1.0},
|
||||||
{"label": "not_baking", "value": 1.0},
|
],
|
||||||
],
|
},
|
||||||
},
|
],
|
||||||
],
|
}
|
||||||
}
|
|
||||||
|
|
|
@ -28,7 +28,9 @@ def test_cli_converters_conllu2json():
|
||||||
assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB"]
|
assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB"]
|
||||||
assert [t["head"] for t in tokens] == [1, 2, -1, 0]
|
assert [t["head"] for t in tokens] == [1, 2, -1, 0]
|
||||||
assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT"]
|
assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT"]
|
||||||
ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]]
|
ent_offsets = [
|
||||||
|
(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]
|
||||||
|
]
|
||||||
biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
|
biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
|
||||||
assert biluo_tags == ["O", "B-PER", "L-PER", "O"]
|
assert biluo_tags == ["O", "B-PER", "L-PER", "O"]
|
||||||
|
|
||||||
|
@ -54,7 +56,9 @@ def test_cli_converters_conllu2json():
|
||||||
)
|
)
|
||||||
def test_cli_converters_conllu2json_name_ner_map(lines):
|
def test_cli_converters_conllu2json_name_ner_map(lines):
|
||||||
input_data = "\n".join(lines)
|
input_data = "\n".join(lines)
|
||||||
converted_docs = conllu2docs(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""})
|
converted_docs = conllu2docs(
|
||||||
|
input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""}
|
||||||
|
)
|
||||||
assert len(converted_docs) == 1
|
assert len(converted_docs) == 1
|
||||||
converted = [docs_to_json(converted_docs)]
|
converted = [docs_to_json(converted_docs)]
|
||||||
assert converted[0]["id"] == 0
|
assert converted[0]["id"] == 0
|
||||||
|
@ -68,7 +72,9 @@ def test_cli_converters_conllu2json_name_ner_map(lines):
|
||||||
assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"]
|
assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"]
|
||||||
assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1]
|
assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1]
|
||||||
assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"]
|
assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"]
|
||||||
ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]]
|
ent_offsets = [
|
||||||
|
(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]
|
||||||
|
]
|
||||||
biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
|
biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
|
||||||
assert biluo_tags == ["O", "B-PERSON", "L-PERSON", "O", "O"]
|
assert biluo_tags == ["O", "B-PERSON", "L-PERSON", "O", "O"]
|
||||||
|
|
||||||
|
@ -115,7 +121,9 @@ def test_cli_converters_conllu2json_subtokens():
|
||||||
assert [t["lemma"] for t in tokens] == ["dommer", "Finn Eilertsen", "avstå", "$."]
|
assert [t["lemma"] for t in tokens] == ["dommer", "Finn Eilertsen", "avstå", "$."]
|
||||||
assert [t["head"] for t in tokens] == [1, 1, 0, -1]
|
assert [t["head"] for t in tokens] == [1, 1, 0, -1]
|
||||||
assert [t["dep"] for t in tokens] == ["appos", "nsubj", "ROOT", "punct"]
|
assert [t["dep"] for t in tokens] == ["appos", "nsubj", "ROOT", "punct"]
|
||||||
ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]]
|
ent_offsets = [
|
||||||
|
(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]
|
||||||
|
]
|
||||||
biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
|
biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
|
||||||
assert biluo_tags == ["O", "U-PER", "O", "O"]
|
assert biluo_tags == ["O", "U-PER", "O", "O"]
|
||||||
|
|
||||||
|
@ -138,11 +146,11 @@ def test_cli_converters_iob2json(en_vocab):
|
||||||
sent = converted["paragraphs"][0]["sentences"][i]
|
sent = converted["paragraphs"][0]["sentences"][i]
|
||||||
assert len(sent["tokens"]) == 8
|
assert len(sent["tokens"]) == 8
|
||||||
tokens = sent["tokens"]
|
tokens = sent["tokens"]
|
||||||
# fmt: off
|
expected = ["I", "like", "London", "and", "New", "York", "City", "."]
|
||||||
assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
|
assert [t["orth"] for t in tokens] == expected
|
||||||
assert len(converted_docs[0].ents) == 8
|
assert len(converted_docs[0].ents) == 8
|
||||||
for ent in converted_docs[0].ents:
|
for ent in converted_docs[0].ents:
|
||||||
assert(ent.text in ["New York City", "London"])
|
assert ent.text in ["New York City", "London"]
|
||||||
|
|
||||||
|
|
||||||
def test_cli_converters_conll_ner2json():
|
def test_cli_converters_conll_ner2json():
|
||||||
|
@ -210,7 +218,7 @@ def test_cli_converters_conll_ner2json():
|
||||||
# fmt: on
|
# fmt: on
|
||||||
assert len(converted_docs[0].ents) == 10
|
assert len(converted_docs[0].ents) == 10
|
||||||
for ent in converted_docs[0].ents:
|
for ent in converted_docs[0].ents:
|
||||||
assert (ent.text in ["New York City", "London"])
|
assert ent.text in ["New York City", "London"]
|
||||||
|
|
||||||
|
|
||||||
def test_pretrain_make_docs():
|
def test_pretrain_make_docs():
|
||||||
|
|
|
@ -161,65 +161,54 @@ def test_example_from_dict_no_ner(en_vocab):
|
||||||
ner_tags = example.get_aligned_ner()
|
ner_tags = example.get_aligned_ner()
|
||||||
assert ner_tags == [None, None, None, None]
|
assert ner_tags == [None, None, None, None]
|
||||||
|
|
||||||
|
|
||||||
def test_example_from_dict_some_ner(en_vocab):
|
def test_example_from_dict_some_ner(en_vocab):
|
||||||
words = ["a", "b", "c", "d"]
|
words = ["a", "b", "c", "d"]
|
||||||
spaces = [True, True, False, True]
|
spaces = [True, True, False, True]
|
||||||
predicted = Doc(en_vocab, words=words, spaces=spaces)
|
predicted = Doc(en_vocab, words=words, spaces=spaces)
|
||||||
example = Example.from_dict(
|
example = Example.from_dict(
|
||||||
predicted,
|
predicted, {"words": words, "entities": ["U-LOC", None, None, None]}
|
||||||
{
|
|
||||||
"words": words,
|
|
||||||
"entities": ["U-LOC", None, None, None]
|
|
||||||
}
|
|
||||||
)
|
)
|
||||||
ner_tags = example.get_aligned_ner()
|
ner_tags = example.get_aligned_ner()
|
||||||
assert ner_tags == ["U-LOC", None, None, None]
|
assert ner_tags == ["U-LOC", None, None, None]
|
||||||
|
|
||||||
|
|
||||||
def test_json2docs_no_ner(en_vocab):
|
def test_json2docs_no_ner(en_vocab):
|
||||||
data = [{
|
data = [
|
||||||
"id":1,
|
{
|
||||||
"paragraphs":[
|
"id": 1,
|
||||||
{
|
"paragraphs": [
|
||||||
"sentences":[
|
{
|
||||||
{
|
"sentences": [
|
||||||
"tokens":[
|
{
|
||||||
{
|
"tokens": [
|
||||||
"dep":"nn",
|
{"dep": "nn", "head": 1, "tag": "NNP", "orth": "Ms."},
|
||||||
"head":1,
|
{
|
||||||
"tag":"NNP",
|
"dep": "nsubj",
|
||||||
"orth":"Ms."
|
"head": 1,
|
||||||
},
|
"tag": "NNP",
|
||||||
{
|
"orth": "Haag",
|
||||||
"dep":"nsubj",
|
},
|
||||||
"head":1,
|
{
|
||||||
"tag":"NNP",
|
"dep": "ROOT",
|
||||||
"orth":"Haag"
|
"head": 0,
|
||||||
},
|
"tag": "VBZ",
|
||||||
{
|
"orth": "plays",
|
||||||
"dep":"ROOT",
|
},
|
||||||
"head":0,
|
{
|
||||||
"tag":"VBZ",
|
"dep": "dobj",
|
||||||
"orth":"plays"
|
"head": -1,
|
||||||
},
|
"tag": "NNP",
|
||||||
{
|
"orth": "Elianti",
|
||||||
"dep":"dobj",
|
},
|
||||||
"head":-1,
|
{"dep": "punct", "head": -2, "tag": ".", "orth": "."},
|
||||||
"tag":"NNP",
|
]
|
||||||
"orth":"Elianti"
|
}
|
||||||
},
|
|
||||||
{
|
|
||||||
"dep":"punct",
|
|
||||||
"head":-2,
|
|
||||||
"tag":".",
|
|
||||||
"orth":"."
|
|
||||||
}
|
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
]
|
],
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}]
|
|
||||||
docs = json2docs(data)
|
docs = json2docs(data)
|
||||||
assert len(docs) == 1
|
assert len(docs) == 1
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
|
|
|
@ -8,8 +8,9 @@ from ..tokens import Doc
|
||||||
from ..attrs import SPACY, ORTH, intify_attr
|
from ..attrs import SPACY, ORTH, intify_attr
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
|
|
||||||
|
# fmt: off
|
||||||
ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")
|
ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")
|
||||||
|
# fmt: on
|
||||||
|
|
||||||
|
|
||||||
class DocBin(object):
|
class DocBin(object):
|
||||||
|
@ -86,9 +87,7 @@ class DocBin(object):
|
||||||
assert array.shape[0] == spaces.shape[0] # this should never happen
|
assert array.shape[0] == spaces.shape[0] # this should never happen
|
||||||
spaces = spaces.reshape((spaces.shape[0], 1))
|
spaces = spaces.reshape((spaces.shape[0], 1))
|
||||||
self.spaces.append(numpy.asarray(spaces, dtype=bool))
|
self.spaces.append(numpy.asarray(spaces, dtype=bool))
|
||||||
self.flags.append({
|
self.flags.append({"has_unknown_spaces": doc.has_unknown_spaces})
|
||||||
"has_unknown_spaces": doc.has_unknown_spaces
|
|
||||||
})
|
|
||||||
for token in doc:
|
for token in doc:
|
||||||
self.strings.add(token.text)
|
self.strings.add(token.text)
|
||||||
self.strings.add(token.tag_)
|
self.strings.add(token.tag_)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user