Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-12 18:26:30 +03:00
* Fixed contraction tests. Still need to correct a problem with the way case stats and tag stats are supposed to work.
This commit is contained in: parent fdaf24604a, commit fd4e61e58b
@@ -20,7 +20,7 @@ cdef class Language:
         self.name = name
         self.cache = {}
         self.lexicon = Lexicon()
-        #self.load_special_tokenization(util.read_tokenization(name))
+        self.load_special_tokenization(util.read_tokenization(name))
 
     cpdef list tokenize(self, unicode string):
         """Tokenize a string.
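The hunk above re-enables loading of the special-case tokenization rules when a Language object is constructed. The exact format returned by util.read_tokenization(name) is not shown in this diff; below is a hedged, illustrative stand-in for the (string, substrings) pairs it presumably yields, using the contractions exercised by the tests further down. load_special_tokenization then caches one lexeme list per surface form, as the -101,7 hunk below shows.

# Illustrative stand-in for the rules util.read_tokenization(name) is
# assumed to return: each surface form maps to the substrings it splits into.
token_rules = [
    ("ain't", ["are", "not"]),
    ("we'll", ["we", "will"]),
    ("Mike's", ["Mike", "'s"]),
]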
@@ -49,6 +49,7 @@ cdef class Language:
             i += 1
         if start < i:
             tokens.extend(self._tokenize(string[start:]))
+        assert tokens
         return tokens
 
     cdef list _tokenize(self, unicode string):
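The new `assert tokens` line guarantees that tokenize never hands back an empty list. A plain-Python sketch of the loop structure implied by the context lines above (the real method is Cython and returns Lexeme-backed tokens, not strings):

def tokenize(string):
    # Split on whitespace, tokenize each chunk, and check the result is
    # non-empty before returning -- the spot where this commit adds
    # `assert tokens`.
    tokens = []
    start = 0
    i = 0
    while i < len(string):
        if string[i].isspace():
            if start < i:
                tokens.extend(_tokenize(string[start:i]))
            start = i + 1
        i += 1
    if start < i:
        tokens.extend(_tokenize(string[start:]))
    assert tokens  # guard added in this commit
    return tokens


def _tokenize(chunk):
    # Stand-in for Language._tokenize; the real version applies the cached
    # special cases and affix splitting, here each chunk is passed through.
    return [chunk]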
@@ -101,7 +102,7 @@ cdef class Language:
         for string, substrings in token_rules:
             lexemes = []
             for i, substring in enumerate(substrings):
-                lexemes.append(self.lookup(substring))
+                lexemes.append(self.lexicon.lookup(substring))
             self.cache[string] = lexemes
 
 
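The one-line fix above routes substring lookups through the Lexicon, which owns the string-to-Lexeme table, instead of calling a lookup on the Language object itself. A minimal sketch of that division of labour, with simplified pure-Python stand-ins for the Cython classes:

class Lexeme:
    # Minimal stand-in: only the surface string is kept here.
    def __init__(self, string):
        self.string = string


class Lexicon:
    def __init__(self):
        self._entries = {}

    def lookup(self, string):
        # Owns the string -> Lexeme mapping, creating entries on demand.
        if string not in self._entries:
            self._entries[string] = Lexeme(string)
        return self._entries[string]


class Language:
    def __init__(self):
        self.cache = {}
        self.lexicon = Lexicon()

    def load_special_tokenization(self, token_rules):
        # Mirrors the hunk above: each substring is resolved through
        # self.lexicon.lookup rather than self.lookup.
        for string, substrings in token_rules:
            self.cache[string] = [self.lexicon.lookup(s) for s in substrings]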
@@ -143,13 +144,15 @@ cdef class Lexicon:
         cdef Lexeme word
         flag_id = len(self.flag_checkers)
         for string, word in self.lexicon.items():
-            if flag_checker(string, word.prob, {}):
+            if flag_checker(string, word.prob, {}, {}):
                 word.set_flag(flag_id)
         self.flag_checkers.append(flag_checker)
         return flag_id
 
     def add_transform(self, string_transform):
         self.string_transformers.append(string_transform)
+        for string, word in self.lexicon.items():
+            word.add_view(string_transform(string, word.prob, {}, {}))
         return len(self.string_transformers) - 1
 
     def load_probs(self, location):
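Both Lexicon callbacks now take four arguments; given the commit message, the two empty dicts are presumably placeholders for the case-stats and tag-stats tables that still need wiring up. A hedged sketch of callbacks with the new signature follows; the function names are illustrative, and the name of the flag-registration method is not visible in this hunk (only add_transform is):

def is_often_titlecased(string, prob, case_stats, tag_stats):
    # Example flag checker: returns a boolean property of the word,
    # optionally consulting the (currently empty) case statistics.
    titled = case_stats.get("title", 0.0)
    total = sum(case_stats.values()) or 1.0
    return string.istitle() or (titled / total) > 0.5


def lowercase_view(string, prob, case_stats, tag_stats):
    # Example string transform: produces an extra "view" of the word,
    # stored on the Lexeme via word.add_view in the hunk above.
    return string.lower()


# Registration against a Lexicon instance would then look roughly like:
#     flag_id = lexicon.add_flag(is_often_titlecased)   # method name assumed
#     view_id = lexicon.add_transform(lowercase_view)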
@@ -1,41 +1,41 @@
 from __future__ import unicode_literals
 
-from spacy.en import tokenize, lookup, unhash
+from spacy.en import EN
 
 
 def test_possess():
-    tokens = tokenize("Mike's")
-    assert unhash(tokens[0].lex) == "Mike"
-    assert unhash(tokens[1].lex) == "'s"
+    tokens = EN.tokenize("Mike's")
+    assert tokens[0].string == "Mike"
+    assert tokens[1].string == "'s"
     assert len(tokens) == 2
 
 
 def test_apostrophe():
-    tokens = tokenize("schools'")
+    tokens = EN.tokenize("schools'")
     assert len(tokens) == 2
-    assert unhash(tokens[1].lex) == "'"
-    assert unhash(tokens[0].lex) == "schools"
+    assert tokens[1].string == "'"
+    assert tokens[0].string == "schools"
 
 
 def test_LL():
-    tokens = tokenize("we'll")
+    tokens = EN.tokenize("we'll")
     assert len(tokens) == 2
-    assert unhash(tokens[1].lex) == "will"
-    assert unhash(tokens[0].lex) == "we"
+    assert tokens[1].string == "will"
+    assert tokens[0].string == "we"
 
 
 def test_aint():
-    tokens = tokenize("ain't")
+    tokens = EN.tokenize("ain't")
     assert len(tokens) == 2
-    assert unhash(tokens[0].lex) == "are"
-    assert unhash(tokens[1].lex) == "not"
+    assert tokens[0].string == "are"
+    assert tokens[1].string == "not"
 
 
 def test_capitalized():
-    tokens = tokenize("can't")
+    tokens = EN.tokenize("can't")
     assert len(tokens) == 2
-    tokens = tokenize("Can't")
+    tokens = EN.tokenize("Can't")
     assert len(tokens) == 2
-    tokens = tokenize("Ain't")
+    tokens = EN.tokenize("Ain't")
     assert len(tokens) == 2
-    assert unhash(tokens[0].lex) == "Are"
+    assert tokens[0].string == "Are"
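The test changes track the API move from module-level helpers (tokenize, lookup, unhash) to methods on the EN language object, with token text read directly from .string instead of un-hashing a lexeme id. Usage in the new style, mirroring the assertions above:

from spacy.en import EN

tokens = EN.tokenize("ain't")
assert len(tokens) == 2
assert [t.string for t in tokens] == ["are", "not"]

# Casing is carried onto the expanded form, as test_capitalized checks:
assert EN.tokenize("Ain't")[0].string == "Are"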