mirror of
https://github.com/explosion/spaCy.git
synced 2025-06-29 01:13:17 +03:00
Auto-formatting
This commit is contained in:
parent
2569339a98
commit
3d577b77c6
|
@ -1,12 +1,11 @@
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ..util import get_doc
|
|
||||||
from ...vocab import Vocab
|
|
||||||
from ...tokens import Doc
|
|
||||||
from ...tokens import Span
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
from spacy.vocab import Vocab
|
||||||
|
from spacy.tokens import Doc
|
||||||
|
|
||||||
|
from ..util import get_doc
|
||||||
|
|
||||||
|
|
||||||
def test_doc_split(en_tokenizer):
|
def test_doc_split(en_tokenizer):
|
||||||
|
@ -17,35 +16,41 @@ def test_doc_split(en_tokenizer):
|
||||||
|
|
||||||
assert len(doc) == 3
|
assert len(doc) == 3
|
||||||
assert len(str(doc)) == 19
|
assert len(str(doc)) == 19
|
||||||
assert doc[0].head.text == 'start'
|
assert doc[0].head.text == "start"
|
||||||
assert doc[1].head.text == '.'
|
assert doc[1].head.text == "."
|
||||||
|
|
||||||
with doc.retokenize() as retokenizer:
|
with doc.retokenize() as retokenizer:
|
||||||
retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], attrs={'tag':'NNP', 'lemma':'Los Angeles', 'ent_type':'GPE'})
|
retokenizer.split(
|
||||||
|
doc[0],
|
||||||
|
["Los", "Angeles"],
|
||||||
|
[1, 0],
|
||||||
|
attrs={"tag": "NNP", "lemma": "Los Angeles", "ent_type": "GPE"},
|
||||||
|
)
|
||||||
|
|
||||||
assert len(doc) == 4
|
assert len(doc) == 4
|
||||||
assert doc[0].text == 'Los'
|
assert doc[0].text == "Los"
|
||||||
assert doc[0].head.text == 'Angeles'
|
assert doc[0].head.text == "Angeles"
|
||||||
assert doc[0].idx == 0
|
assert doc[0].idx == 0
|
||||||
assert doc[1].idx == 3
|
assert doc[1].idx == 3
|
||||||
|
|
||||||
assert doc[1].text == 'Angeles'
|
assert doc[1].text == "Angeles"
|
||||||
assert doc[1].head.text == 'start'
|
assert doc[1].head.text == "start"
|
||||||
|
|
||||||
assert doc[2].text == 'start'
|
assert doc[2].text == "start"
|
||||||
assert doc[2].head.text == '.'
|
assert doc[2].head.text == "."
|
||||||
|
|
||||||
assert doc[3].text == '.'
|
assert doc[3].text == "."
|
||||||
assert doc[3].head.text == '.'
|
assert doc[3].head.text == "."
|
||||||
|
|
||||||
assert len(str(doc)) == 19
|
assert len(str(doc)) == 19
|
||||||
|
|
||||||
|
|
||||||
def test_split_dependencies(en_tokenizer):
|
def test_split_dependencies(en_tokenizer):
|
||||||
text = "LosAngeles start."
|
text = "LosAngeles start."
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
doc = get_doc(tokens.vocab, [t.text for t in tokens])
|
doc = get_doc(tokens.vocab, [t.text for t in tokens])
|
||||||
dep1 = doc.vocab.strings.add('amod')
|
dep1 = doc.vocab.strings.add("amod")
|
||||||
dep2 = doc.vocab.strings.add('subject')
|
dep2 = doc.vocab.strings.add("subject")
|
||||||
with doc.retokenize() as retokenizer:
|
with doc.retokenize() as retokenizer:
|
||||||
retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], [dep1, dep2])
|
retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], [dep1, dep2])
|
||||||
|
|
||||||
|
@ -53,27 +58,26 @@ def test_split_dependencies(en_tokenizer):
|
||||||
assert doc[1].dep == dep2
|
assert doc[1].dep == dep2
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def test_split_heads_error(en_tokenizer):
|
def test_split_heads_error(en_tokenizer):
|
||||||
text = "LosAngeles start."
|
text = "LosAngeles start."
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
doc = get_doc(tokens.vocab, [t.text for t in tokens])
|
doc = get_doc(tokens.vocab, [t.text for t in tokens])
|
||||||
#Not enough heads
|
# Not enough heads
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
with doc.retokenize() as retokenizer:
|
with doc.retokenize() as retokenizer:
|
||||||
retokenizer.split(doc[0], ["Los", "Angeles"], [0])
|
retokenizer.split(doc[0], ["Los", "Angeles"], [0])
|
||||||
|
|
||||||
#Too many heads
|
# Too many heads
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
with doc.retokenize() as retokenizer:
|
with doc.retokenize() as retokenizer:
|
||||||
retokenizer.split(doc[0], ["Los", "Angeles"], [1, 1, 0])
|
retokenizer.split(doc[0], ["Los", "Angeles"], [1, 1, 0])
|
||||||
|
|
||||||
#No token head
|
# No token head
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
with doc.retokenize() as retokenizer:
|
with doc.retokenize() as retokenizer:
|
||||||
retokenizer.split(doc[0], ["Los", "Angeles"], [1, 1])
|
retokenizer.split(doc[0], ["Los", "Angeles"], [1, 1])
|
||||||
|
|
||||||
#Several token heads
|
# Several token heads
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
with doc.retokenize() as retokenizer:
|
with doc.retokenize() as retokenizer:
|
||||||
retokenizer.split(doc[0], ["Los", "Angeles"], [0, 0])
|
retokenizer.split(doc[0], ["Los", "Angeles"], [0, 0])
|
||||||
|
@ -83,7 +87,7 @@ def test_spans_entity_merge_iob():
|
||||||
# Test entity IOB stays consistent after merging
|
# Test entity IOB stays consistent after merging
|
||||||
words = ["abc", "d", "e"]
|
words = ["abc", "d", "e"]
|
||||||
doc = Doc(Vocab(), words=words)
|
doc = Doc(Vocab(), words=words)
|
||||||
doc.ents = [(doc.vocab.strings.add('ent-abcd'), 0, 2)]
|
doc.ents = [(doc.vocab.strings.add("ent-abcd"), 0, 2)]
|
||||||
assert doc[0].ent_iob_ == "B"
|
assert doc[0].ent_iob_ == "B"
|
||||||
assert doc[1].ent_iob_ == "I"
|
assert doc[1].ent_iob_ == "I"
|
||||||
|
|
||||||
|
@ -94,12 +98,14 @@ def test_spans_entity_merge_iob():
|
||||||
assert doc[2].ent_iob_ == "I"
|
assert doc[2].ent_iob_ == "I"
|
||||||
assert doc[3].ent_iob_ == "I"
|
assert doc[3].ent_iob_ == "I"
|
||||||
|
|
||||||
|
|
||||||
def test_spans_sentence_update_after_merge(en_tokenizer):
|
def test_spans_sentence_update_after_merge(en_tokenizer):
|
||||||
|
# fmt: off
|
||||||
text = "StewartLee is a stand up comedian. He lives in England and loves JoePasquale."
|
text = "StewartLee is a stand up comedian. He lives in England and loves JoePasquale."
|
||||||
heads = [1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2]
|
heads = [1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2]
|
||||||
deps = ['nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr',
|
deps = ["nsubj", "ROOT", "det", "amod", "prt", "attr", "punct", "nsubj",
|
||||||
'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj',
|
"ROOT", "prep", "pobj", "cc", "conj", "compound", "punct"]
|
||||||
'compound', 'punct']
|
# fmt: on
|
||||||
|
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)
|
doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user