mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
* Fixed contraction tests. Need to correct the problem with the way case stats and tag stats are supposed to work.
This commit is contained in:
parent
fdaf24604a
commit
fd4e61e58b
|
@ -20,7 +20,7 @@ cdef class Language:
|
|||
self.name = name
|
||||
self.cache = {}
|
||||
self.lexicon = Lexicon()
|
||||
#self.load_special_tokenization(util.read_tokenization(name))
|
||||
self.load_special_tokenization(util.read_tokenization(name))
|
||||
|
||||
cpdef list tokenize(self, unicode string):
|
||||
"""Tokenize a string.
|
||||
|
@ -49,6 +49,7 @@ cdef class Language:
|
|||
i += 1
|
||||
if start < i:
|
||||
tokens.extend(self._tokenize(string[start:]))
|
||||
assert tokens
|
||||
return tokens
|
||||
|
||||
cdef list _tokenize(self, unicode string):
|
||||
|
@ -101,7 +102,7 @@ cdef class Language:
|
|||
for string, substrings in token_rules:
|
||||
lexemes = []
|
||||
for i, substring in enumerate(substrings):
|
||||
lexemes.append(self.lookup(substring))
|
||||
lexemes.append(self.lexicon.lookup(substring))
|
||||
self.cache[string] = lexemes
|
||||
|
||||
|
||||
|
@ -143,13 +144,15 @@ cdef class Lexicon:
|
|||
cdef Lexeme word
|
||||
flag_id = len(self.flag_checkers)
|
||||
for string, word in self.lexicon.items():
|
||||
if flag_checker(string, word.prob, {}):
|
||||
if flag_checker(string, word.prob, {}, {}):
|
||||
word.set_flag(flag_id)
|
||||
self.flag_checkers.append(flag_checker)
|
||||
return flag_id
|
||||
|
||||
def add_transform(self, string_transform):
|
||||
self.string_transformers.append(string_transform)
|
||||
for string, word in self.lexicon.items():
|
||||
word.add_view(string_transform(string, word.prob, {}, {}))
|
||||
return len(self.string_transformers) - 1
|
||||
|
||||
def load_probs(self, location):
|
||||
|
|
|
@ -1,41 +1,41 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from spacy.en import tokenize, lookup, unhash
|
||||
from spacy.en import EN
|
||||
|
||||
|
||||
def test_possess():
|
||||
tokens = tokenize("Mike's")
|
||||
assert unhash(tokens[0].lex) == "Mike"
|
||||
assert unhash(tokens[1].lex) == "'s"
|
||||
tokens = EN.tokenize("Mike's")
|
||||
assert tokens[0].string == "Mike"
|
||||
assert tokens[1].string == "'s"
|
||||
assert len(tokens) == 2
|
||||
|
||||
|
||||
def test_apostrophe():
|
||||
tokens = tokenize("schools'")
|
||||
tokens = EN.tokenize("schools'")
|
||||
assert len(tokens) == 2
|
||||
assert unhash(tokens[1].lex) == "'"
|
||||
assert unhash(tokens[0].lex) == "schools"
|
||||
assert tokens[1].string == "'"
|
||||
assert tokens[0].string == "schools"
|
||||
|
||||
|
||||
def test_LL():
|
||||
tokens = tokenize("we'll")
|
||||
tokens = EN.tokenize("we'll")
|
||||
assert len(tokens) == 2
|
||||
assert unhash(tokens[1].lex) == "will"
|
||||
assert unhash(tokens[0].lex) == "we"
|
||||
assert tokens[1].string == "will"
|
||||
assert tokens[0].string == "we"
|
||||
|
||||
|
||||
def test_aint():
|
||||
tokens = tokenize("ain't")
|
||||
tokens = EN.tokenize("ain't")
|
||||
assert len(tokens) == 2
|
||||
assert unhash(tokens[0].lex) == "are"
|
||||
assert unhash(tokens[1].lex) == "not"
|
||||
assert tokens[0].string == "are"
|
||||
assert tokens[1].string == "not"
|
||||
|
||||
|
||||
def test_capitalized():
|
||||
tokens = tokenize("can't")
|
||||
tokens = EN.tokenize("can't")
|
||||
assert len(tokens) == 2
|
||||
tokens = tokenize("Can't")
|
||||
tokens = EN.tokenize("Can't")
|
||||
assert len(tokens) == 2
|
||||
tokens = tokenize("Ain't")
|
||||
tokens = EN.tokenize("Ain't")
|
||||
assert len(tokens) == 2
|
||||
assert unhash(tokens[0].lex) == "Are"
|
||||
assert tokens[0].string == "Are"
|
||||
|
|
Loading…
Reference in New Issue
Block a user