Mirror of https://github.com/explosion/spaCy.git (synced 2025-02-18 20:40:34 +03:00)
Previous Sputnik integration caused an API change: Vocab, Tagger, etc. were loaded via a from_package classmethod that required a sputnik.Package instance. This forced users to first create a sputnik.Sputnik() instance in order to acquire a Package via sp.pool(). Instead, I've created a small file-system shim, util.Package, which allows classes to have a .load() classmethod that accepts either util.Package objects or strings. We can later gut the internals of this and make it a proxy for Sputnik if we need more functionality that should live in the Sputnik library. Sputnik is now only used to download and install the data, in spacy.en.download.
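The sketch below illustrates the loading pattern this message describes: a small file-system shim plus a .load() classmethod that accepts either a package object or a plain path string. Only util.Package and .load() are named in the message; everything else here (the file_path helper, the Vocab constructor, the 'strings.json' layout) is an illustrative assumption, not spaCy's actual implementation.

from os import path


class Package(object):
    """Minimal file-system shim standing in for a sputnik.Package (assumed shape)."""
    def __init__(self, data_path):
        self.data_path = data_path

    def file_path(self, *parts):
        # Resolve a file inside the package's data directory.
        return path.join(self.data_path, *parts)


class Vocab(object):
    def __init__(self, strings_file):
        self.strings_file = strings_file

    @classmethod
    def load(cls, pkg_or_path):
        # Accept either a Package instance or a plain path string,
        # as the commit message describes.
        if isinstance(pkg_or_path, str):
            pkg_or_path = Package(pkg_or_path)
        # 'vocab/strings.json' is a hypothetical file name for illustration.
        return cls(pkg_or_path.file_path('vocab', 'strings.json'))

With this shape, both Vocab.load('/some/data/dir') and Vocab.load(Package('/some/data/dir')) work, so callers no longer need to construct a sputnik.Sputnik() pool just to load data.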
58 lines · 1.5 KiB · Python
from __future__ import unicode_literals

import pytest


def test_possess(en_tokenizer):
    tokens = en_tokenizer("Mike's")
    assert en_tokenizer.vocab.strings[tokens[0].orth] == "Mike"
    assert en_tokenizer.vocab.strings[tokens[1].orth] == "'s"
    assert len(tokens) == 2


def test_apostrophe(en_tokenizer):
    tokens = en_tokenizer("schools'")
    assert len(tokens) == 2
    assert tokens[1].orth_ == "'"
    assert tokens[0].orth_ == "schools"


def test_LL(en_tokenizer):
    tokens = en_tokenizer("we'll")
    assert len(tokens) == 2
    assert tokens[1].orth_ == "'ll"
    assert tokens[1].lemma_ == "will"
    assert tokens[0].orth_ == "we"


def test_aint(en_tokenizer):
    tokens = en_tokenizer("ain't")
    assert len(tokens) == 2
    assert tokens[0].orth_ == "ai"
    assert tokens[0].lemma_ == "be"
    assert tokens[1].orth_ == "n't"
    assert tokens[1].lemma_ == "not"


def test_capitalized(en_tokenizer):
    tokens = en_tokenizer("can't")
    assert len(tokens) == 2
    tokens = en_tokenizer("Can't")
    assert len(tokens) == 2
    tokens = en_tokenizer("Ain't")
    assert len(tokens) == 2
    assert tokens[0].orth_ == "Ai"
    assert tokens[0].lemma_ == "be"


def test_punct(en_tokenizer):
    tokens = en_tokenizer("We've")
    assert len(tokens) == 2
    tokens = en_tokenizer("``We've")
    assert len(tokens) == 3


def test_therell(en_tokenizer):
    tokens = en_tokenizer("there'll")
    assert len(tokens) == 2
    assert tokens[0].text == "there"
    assert tokens[1].text == "'ll"