spaCy (mirror of https://github.com/explosion/spaCy.git)

commit 6fb42c4919, parent 2805068ca8

    Add offsets to Tokens class. Some changes to interfaces, and reorganization of spacy.Lang
@@ -1,20 +1,21 @@
 from libc.stdint cimport uint32_t
 from libc.stdint cimport uint64_t
-from spacy.word cimport Lexeme
-from spacy.tokens cimport Tokens
-from spacy.lexeme cimport LexemeC
-from preshed.maps cimport PreshMap
-
-from cymem.cymem cimport Pool
-
-from libcpp.utility cimport pair
 from libcpp.vector cimport vector
 from libc.stdint cimport uint64_t, int64_t

+from preshed.maps cimport PreshMap
+from cymem.cymem cimport Pool
+
+from .word cimport Lexeme
+from .tokens cimport Tokens
+from .lexeme cimport LexemeC


 cdef extern from "Python.h":
     cdef bint Py_UNICODE_ISSPACE(Py_UNICODE ch)
     cdef bint Py_UNICODE_ISALNUM(Py_UNICODE ch)
+    cdef bint Py_UNICODE_ISALPHA(Py_UNICODE ch)
+    cdef bint Py_UNICODE_ISUPPER(Py_UNICODE ch)


 cdef struct String:

@@ -24,7 +25,7 @@ cdef struct String:


 cdef class Lexicon:
-    cdef Pool _mem
+    cdef Pool mem
     cpdef readonly size_t size

     cdef vector[LexemeC*] lexemes

@@ -37,7 +38,6 @@ cdef class Lexicon:
     cdef list _string_features
     cdef list _flag_features

-
 cdef class Language:
     cdef Pool _mem
     cdef unicode name

@@ -47,19 +47,17 @@ cdef class Language:
     cdef object prefix_re
     cdef object suffix_re
+    cdef object infix_re

     cpdef Tokens tokenize(self, unicode text)
-    cpdef Lexeme lookup(self, unicode text)

-    cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1
-
-    cdef int _split_body_token(self, vector[LexemeC*] *tokens, String* string) except -1
-    cdef int _find_prefix(self, Py_UNICODE* characters, size_t length)
-    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length)
-
-    cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
-                            vector[LexemeC*] *prefixes,
-                            vector[LexemeC*] *suffixes) except -1
-
+    cdef int _tokenize(self, Tokens tokens, Py_UNICODE* chars, int start, int end) except -1
+    cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
+                                vector[LexemeC*] *suffixes) except NULL
+    cdef int _attach_tokens(self, Tokens tokens, int idx, String* string,
+                            vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
+    cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
+    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
+    cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1
     cdef int _save_cached(self, vector[LexemeC*] *tokens, uint64_t key, size_t n) except -1
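Together with the lang.pyx changes below, this header drops Language.lookup() and the old vector-based helpers in favour of Tokens-based ones, so lexeme lookup now goes through the Lexicon. A rough usage sketch of the resulting interface; EN is the module-level English instance used in the test files further down, and the snippet is an illustration rather than part of the commit:

    from spacy.en import EN

    # Tokenization is unchanged at the call site ...
    tokens = EN.tokenize(u'The year: 1984.')
    print(len(tokens), tokens.orig(3))       # e.g. 5 tokens, token 3 is u'1984'

    # ... but lookup has moved from the Language object to its lexicon.
    hello = EN.lexicon.lookup(u'Hello')
    print(hello.string)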
spacy/lang.pyx  (153 changed lines)
@@ -14,9 +14,9 @@ from os import path
 import re

 from .util import read_lang_data
-from spacy.tokens import Tokens
-from spacy.lexeme cimport LexemeC, get_lexeme_dict, lexeme_pack, lexeme_unpack
-from spacy.lexeme cimport LexStr_orig
+from .tokens import Tokens
+from .lexeme cimport LexemeC, get_lexeme_dict, lexeme_pack, lexeme_unpack
+from .lexeme cimport LexStr_orig
 from murmurhash.mrmr cimport hash64

 from cpython.ref cimport Py_INCREF
@@ -41,23 +41,13 @@ cdef class Language:
         self._mem = Pool()
         self.cache = PreshMap(2 ** 25)
         self.specials = PreshMap(2 ** 16)
-        rules, prefix, suffix, lexemes = util.read_lang_data(name)
+        rules, prefix, suffix, infix, lexemes = util.read_lang_data(name)
         self.prefix_re = re.compile(prefix)
         self.suffix_re = re.compile(suffix)
+        self.infix_re = re.compile(infix)
         self.lexicon = Lexicon(lexemes)
         self._load_special_tokenization(rules)

-    cpdef Lexeme lookup(self, unicode string):
-        """Retrieve (or create, if not found) a Lexeme for a string, and return it.
-
-        Args:
-            string (unicode): The string to be looked up. Must be unicode, not bytes.
-
-        Returns:
-            lexeme (Lexeme): A reference to a lexical type.
-        """
-        return self.lexicon.lookup(string)
-
     cpdef Tokens tokenize(self, unicode string):
         """Tokenize a string.

@@ -73,37 +63,43 @@ cdef class Language:
         Returns:
             tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
         """
-        cdef size_t length = len(string)
+        cdef int length = len(string)
         cdef Tokens tokens = Tokens(length)
         if length == 0:
             return tokens
-        cdef size_t start = 0
-        cdef size_t i = 0
+        cdef int start = 0
+        cdef int i = 0
         cdef Py_UNICODE* chars = string
-        cdef String span
         for i in range(length):
             if Py_UNICODE_ISSPACE(chars[i]) == 1:
                 if start < i:
-                    string_from_slice(&span, chars, start, i)
-                    if not _extend_from_map(tokens.v, &span, self.cache):
-                        self._tokenize(tokens.v, &span)
+                    self._tokenize(tokens, chars, start, i)
                 start = i + 1
             i += 1
         if start < i:
-            string_from_slice(&span, chars, start, i)
-            if not _extend_from_map(tokens.v, &span, self.cache):
-                self._tokenize(tokens.v, &span)
+            self._tokenize(tokens, chars, start, i)
         return tokens

-    cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1:
-        cdef size_t i
-        cdef uint64_t orig_key = string.key
-        cdef size_t orig_size = tokens_v.size()
-
+    cdef int _tokenize(self, Tokens tokens, Py_UNICODE* chars, int start, int end) except -1:
+        cdef String span
         cdef vector[LexemeC*] prefixes
         cdef vector[LexemeC*] suffixes
+        cdef uint64_t orig_key
+        cdef int orig_size
+        string_slice(&span, chars, start, end)
+        lexemes = <LexemeC**>self.cache.get(span.key)
+        if lexemes != NULL:
+            tokens.extend(start, lexemes, 0)
+        else:
+            orig_key = span.key
+            orig_size = tokens.lex.size()
+            span = self._split_affixes(&span, &prefixes, &suffixes)[0]
+            self._attach_tokens(tokens, start, &span, &prefixes, &suffixes)
+            self._save_cached(&tokens.lex, orig_key, orig_size)

+    cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
+                                vector[LexemeC*] *suffixes) except NULL:
+        cdef size_t i
         cdef String prefix
         cdef String suffix
         cdef String minus_pre
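For orientation, the outer loop of tokenize() above only finds whitespace boundaries; each non-space span is handed to _tokenize() together with its start offset, and those start offsets are what seed the new Tokens.idx values. A minimal plain-Python sketch of that loop (illustration only, not the Cython code):

    def tokenize_offsets(text):
        # Return (start_offset, substring) pairs for each whitespace-delimited
        # span, mirroring how tokenize() passes (chars, start, i) to _tokenize().
        spans = []
        start = 0
        for i, ch in enumerate(text):
            if ch.isspace():
                if start < i:
                    spans.append((start, text[start:i]))
                start = i + 1
        if start < len(text):
            spans.append((start, text[start:]))
        return spans

    assert tokenize_offsets(u'The year: 1984.') == [
        (0, u'The'), (4, u'year:'), (10, u'1984.')]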
@@ -113,8 +109,8 @@ cdef class Language:
             last_size = string.n
             pre_len = self._find_prefix(string.chars, string.n)
             if pre_len != 0:
-                string_from_slice(&prefix, string.chars, 0, pre_len)
-                string_from_slice(&minus_pre, string.chars, pre_len, string.n)
+                string_slice(&prefix, string.chars, 0, pre_len)
+                string_slice(&minus_pre, string.chars, pre_len, string.n)
                 # Check whether we've hit a special-case
                 if minus_pre.n >= 1 and self.specials.get(minus_pre.key) != NULL:
                     string = &minus_pre
@@ -122,16 +118,15 @@ cdef class Language:
                     break
             suf_len = self._find_suffix(string.chars, string.n)
             if suf_len != 0:
-                string_from_slice(&suffix, string.chars, string.n - suf_len, string.n)
-                string_from_slice(&minus_suf, string.chars, 0, string.n - suf_len)
+                string_slice(&suffix, string.chars, string.n - suf_len, string.n)
+                string_slice(&minus_suf, string.chars, 0, string.n - suf_len)
                 # Check whether we've hit a special-case
                 if minus_suf.n >= 1 and self.specials.get(minus_suf.key) != NULL:
                     string = &minus_suf
                     suffixes.push_back(self.lexicon.get(&suffix))
                     break
             if pre_len and suf_len and (pre_len + suf_len) <= string.n:
-                string_from_slice(string, string.chars, pre_len, string.n - suf_len)
+                string_slice(string, string.chars, pre_len, string.n - suf_len)
                 prefixes.push_back(self.lexicon.get(&prefix))
                 suffixes.push_back(self.lexicon.get(&suffix))
             elif pre_len:
@@ -140,26 +135,37 @@ cdef class Language:
             elif suf_len:
                 string = &minus_suf
                 suffixes.push_back(self.lexicon.get(&suffix))

             if self.specials.get(string.key):
                 break
+        return string

-        self._attach_tokens(tokens_v, string, &prefixes, &suffixes)
-        self._save_cached(tokens_v, orig_key, orig_size)
-
-    cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
+    cdef int _attach_tokens(self, Tokens tokens,
+                            int idx, String* string,
                             vector[LexemeC*] *prefixes,
                             vector[LexemeC*] *suffixes) except -1:
-        cdef size_t i
+        cdef int split
         cdef LexemeC** lexemes
         cdef LexemeC* lexeme
-        for lexeme in deref(prefixes):
-            tokens.push_back(lexeme)
-        if not _extend_from_map(tokens, string, self.specials):
-            self._split_body_token(tokens, string)
+        cdef String span
+        idx = tokens.extend(idx, prefixes.data(), prefixes.size())
+        if string.n != 0:
+            lexemes = <LexemeC**>self.cache.get(string.key)
+            if lexemes != NULL:
+                idx = tokens.extend(idx, lexemes, 0)
+            else:
+                split = self._find_infix(string.chars, string.n)
+                if split == 0 or split == -1:
+                    idx = tokens.push_back(idx, self.lexicon.get(string))
+                else:
+                    string_slice(&span, string.chars, 0, split)
+                    idx = tokens.push_back(idx, self.lexicon.get(&span))
+                    string_slice(&span, string.chars, split, split+1)
+                    idx = tokens.push_back(idx, self.lexicon.get(&span))
+                    string_slice(&span, string.chars, split + 1, string.n)
+                    idx = tokens.push_back(idx, self.lexicon.get(&span))
         cdef vector[LexemeC*].reverse_iterator it = suffixes.rbegin()
         while it != suffixes.rend():
-            tokens.push_back(deref(it))
+            idx = tokens.push_back(idx, deref(it))
             preinc(it)

     cdef int _save_cached(self, vector[LexemeC*] *tokens, uint64_t key, size_t n) except -1:
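The _split_affixes()/_attach_tokens() pair above first peels prefix and suffix punctuation off the span, then either reuses a cached analysis for the remainder or splits it once at an infix match. A plain-Python sketch of that control flow; the prefix, suffix and infix patterns below are made up for the example and are not the patterns shipped in the language data:

    import re

    # Stand-in patterns; the real expressions are read from the data files
    # (see util.py further down).
    prefix_re = re.compile(u"^[(\\[\"']")
    suffix_re = re.compile(u"[)\\]\"'.,;:]$")
    infix_re = re.compile(u"-")

    def split_span(span):
        # Mirror of _split_affixes(): peel affixes off both ends, collecting them.
        prefixes, suffixes = [], []
        while span:
            pre = prefix_re.search(span)
            if pre is not None:
                prefixes.append(pre.group())
                span = span[pre.end():]
                continue
            suf = suffix_re.search(span)
            if suf is not None:
                suffixes.append(suf.group())
                span = span[:suf.start()]
                continue
            break
        # Mirror of _attach_tokens(): prefixes, then the body (split once at an
        # infix match unless the match is at position 0), then suffixes reversed.
        tokens = list(prefixes)
        if span:
            m = infix_re.search(span)
            if m is None or m.start() == 0:
                tokens.append(span)
            else:
                tokens += [span[:m.start()], span[m.start():m.end()], span[m.end():]]
        tokens += list(reversed(suffixes))
        return tokens

    assert split_span(u'("6,000-year")') == [
        u'(', u'"', u'6,000', u'-', u'year', u'"', u')']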
@@ -171,15 +177,17 @@ cdef class Language:
             lexemes[i + 1] = NULL
         self.cache.set(key, lexemes)

-    cdef int _split_body_token(self, vector[LexemeC*] *tokens, String* string) except -1:
-        tokens.push_back(self.lexicon.get(string))
+    cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1:
+        cdef unicode string = chars[:length]
+        match = self.infix_re.search(string)
+        return match.start() if match is not None else 0

     cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
         cdef unicode string = chars[:length]
         match = self.prefix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0

-    cdef int _find_suffix(self, Py_UNICODE* chars, size_t length):
+    cdef int _find_suffix(self, Py_UNICODE* chars, size_t length) except -1:
         cdef unicode string = chars[:length]
         match = self.suffix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0
@@ -212,27 +220,30 @@ cdef class Language:


 cdef class Lexicon:
     def __cinit__(self, lexemes):
-        self._mem = Pool()
+        self.mem = Pool()
         self._dict = PreshMap(2 ** 20)
         self.size = 0
         cdef String string
         cdef dict lexeme_dict
         cdef LexemeC* lexeme
-        for lexeme_dict in lexemes:
-            string_from_unicode(&string, lexeme_dict['string'])
-            lexeme = <LexemeC*>self._mem.alloc(1, sizeof(LexemeC))
+        for py_string, lexeme_dict in lexemes.iteritems():
+            string_from_unicode(&string, py_string)
+            lexeme = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
             lexeme_unpack(lexeme, lexeme_dict)
             self._dict.set(string.key, lexeme)
             self.lexemes.push_back(lexeme)
             self.size += 1

+    def __getitem__(self, size_t i):
+        return Lexeme(<size_t>self.lexemes.at(i))
+
     cdef LexemeC* get(self, String* string) except NULL:
         cdef LexemeC* lex
         lex = <LexemeC*>self._dict.get(string.key)
         if lex != NULL:
             return lex

-        lex = <LexemeC*>self._mem.alloc(1, sizeof(LexemeC))
+        lex = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
         cdef unicode unicode_string = string.chars[:string.n]
         lexeme_unpack(lex, get_lexeme_dict(self.size, unicode_string))
         self._dict.set(string.key, lex)
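Lexicon.__cinit__() above now expects the lexemes argument to be a dict keyed by surface string rather than a list of dicts (the matching change is `lexemes = {}` in util.py below). A toy illustration of that shape; the attribute names inside each entry are placeholders, not the real lexemes.json schema:

    lexemes = {
        u'the':  {'id': 0, 'prob': -3.5},
        u'1984': {'id': 1, 'prob': -9.1},
    }
    # The constructor iterates string/properties pairs (iteritems() in this
    # Python 2 era code base, items() in Python 3):
    for py_string, lexeme_dict in sorted(lexemes.items()):
        print(py_string, lexeme_dict['id'])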
@@ -255,38 +266,12 @@ cdef class Lexicon:
         return Lexeme(<size_t>lexeme)


-cdef int _extend_from_map(vector[LexemeC*] *tokens, String* string, PreshMap map_) except -1:
-    if string.n == 0:
-        return 1
-    lexemes = <LexemeC**>map_.get(string.key)
-    if lexemes == NULL:
-        return 0
-    cdef size_t i = 0
-    while lexemes[i] != NULL:
-        tokens.push_back(lexemes[i])
-        i += 1
-    return 1
-
-
 cdef void string_from_unicode(String* s, unicode uni):
     cdef Py_UNICODE* c_uni = <Py_UNICODE*>uni
-    string_from_slice(s, c_uni, 0, len(uni))
+    string_slice(s, c_uni, 0, len(uni))


-cdef inline void string_from_slice(String* s, Py_UNICODE* chars, size_t start, size_t end) nogil:
+cdef inline void string_slice(String* s, Py_UNICODE* chars, size_t start, size_t end) nogil:
     s.chars = &chars[start]
     s.n = end - start
     s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
-
-
-cdef inline void string_slice_prefix(String* s, String* prefix, size_t n) nogil:
-    string_from_slice(prefix, s.chars, 0, n)
-    s.chars += n
-    s.n -= n
-    s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
-
-
-cdef inline void string_slice_suffix(String* s, String* suffix, size_t n) nogil:
-    string_from_slice(suffix, s.chars, s.n - n, s.n)
-    s.n -= n
-    s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
@@ -79,7 +79,7 @@ def canon_case(string, prob, cluster, case_stats, tag_stats):

 def word_shape(string, *args):
     length = len(string)
-    shape = ""
+    shape = []
     last = ""
     shape_char = ""
     seq = 0
@@ -99,8 +99,8 @@ def word_shape(string, *args):
            seq = 0
        last = shape_char
        if seq < 5:
-            shape += shape_char
+            shape.append(shape_char)
-    return shape
+    return ''.join(shape)


 def non_sparse(string, prob, cluster, case_stats, tag_stats):
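The word_shape() change above swaps repeated string concatenation for list.append() plus a single ''.join() at the end, so the string is built once instead of on every character. A simplified shape function in the same style (a sketch only; the real function also collapses long runs via the `seq` counter shown above):

    def simple_shape(string):
        shape = []
        for ch in string:
            if ch.isdigit():
                shape.append('d')
            elif ch.isupper():
                shape.append('X')
            elif ch.isalpha():
                shape.append('x')
            else:
                shape.append(ch)
        return ''.join(shape)

    assert simple_shape(u'C3P0') == u'XdXd'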
@@ -2,14 +2,10 @@ from spacy.lexeme cimport LexemeC
 from libcpp.vector cimport vector


-cdef struct Token:
-    int i
-    int pos
-    LexemeC* lex
-
-
 cdef class Tokens:
-    cdef vector[Token] v
+    cdef vector[LexemeC*] lex
+    cdef vector[int] idx
+    cdef vector[int] pos

     cdef int extend(self, int i, LexemeC** lexemes, int n) except -1
     cdef int push_back(self, int i, LexemeC* lexeme) except -1
@@ -21,6 +17,7 @@ cdef class Tokens:
     cpdef bint check_dist_flag(self, size_t i, size_t flag_id) except *
     cpdef unicode string_view(self, size_t i, size_t view_id)

+    cpdef unicode string(self, size_t i)
     cpdef unicode orig(self, size_t i)
     cpdef unicode norm(self, size_t i)
     cpdef unicode shape(self, size_t i)
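The Token struct is gone: Tokens now stores three parallel arrays, the lexeme, its character offset, and a POS slot, instead of one vector of structs. A plain-Python picture of that layout and of how push_back() threads the offset through (names here are illustrative):

    class TokensSketch(object):
        def __init__(self):
            self.lex = []   # one lexeme per token
            self.idx = []   # character offset of each token in the input string
            self.pos = []   # POS tag slot, 0 until a tagger fills it in

        def push_back(self, idx, lexeme):
            # Record the token, then return the offset where the next token in
            # the same span would start (the Cython version adds the lexeme's
            # stored length instead of calling len()).
            self.lex.append(lexeme)
            self.idx.append(idx)
            self.pos.append(0)
            return idx + len(lexeme)

    t = TokensSketch()
    next_idx = t.push_back(0, u'Hello')
    t.push_back(next_idx, u',')
    assert t.idx == [0, 5]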
@@ -25,17 +25,20 @@ cdef class Tokens:
     """
     def __cinit__(self, string_length=0):
         size = int(string_length / 3) if string_length >= 3 else 1
-        self.v = vector[Token]()
-        self.v.reserve(size)
+        self.lex.reserve(size)
+        self.idx.reserve(size)
+        self.pos.reserve(size)

     def __getitem__(self, i):
-        return Lexeme(<size_t>self.v.at(i).lex)
+        return Lexeme(<size_t>self.lex.at(i))

     def __len__(self):
-        return self.v.size()
+        return self.lex.size()

     cdef int push_back(self, int idx, LexemeC* lexeme) except -1:
-        self.v.push_back(Token(idx, 0, lexeme))
+        self.lex.push_back(lexeme)
+        self.idx.push_back(idx)
+        self.pos.push_back(0)
         return idx + lexeme.ints[<int>LexInt_length]

     cdef int extend(self, int idx, LexemeC** lexemes, int n) except -1:
@@ -46,120 +49,124 @@ cdef class Tokens:
             i = 0
             while lexemes[i] != NULL:
                 idx = self.push_back(idx, lexemes[i])
+                i += 1
         else:
             for i in range(n):
                 idx = self.push_back(idx, lexemes[i])
         return idx

     cpdef int id(self, size_t i) except -1:
-        return self.v.at(i).lex.ints[<int>LexInt_id]
+        return self.lex.at(i).ints[<int>LexInt_id]

     cpdef float prob(self, size_t i) except 1:
-        return self.v.at(i).lex.floats[<int>LexFloat_prob]
+        return self.lex.at(i).floats[<int>LexFloat_prob]

     cpdef int cluster(self, size_t i) except *:
-        return self.v.at(i).lex.ints[<int>LexInt_cluster]
+        return self.lex.at(i).ints[<int>LexInt_cluster]

     cpdef bint check_orth_flag(self, size_t i, size_t flag_id) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, flag_id)
+        return lexeme_check_orth_flag(self.lex.at(i), flag_id)

     cpdef bint check_dist_flag(self, size_t i, size_t flag_id) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, flag_id)
+        return lexeme_check_dist_flag(self.lex.at(i), flag_id)

     cpdef unicode string_view(self, size_t i, size_t view_id):
-        return lexeme_get_string(self.v.at(i).lex, view_id)
+        return lexeme_get_string(self.lex.at(i), view_id)

     # Provide accessor methods for the features supported by the language.
     # Without these, clients have to use the underlying string_view and check_flag
     # methods, which requires them to know the IDs.

+    cpdef unicode string(self, size_t i):
+        return self.orig(i)
+
     cpdef unicode orig(self, size_t i):
-        cdef bytes utf8_string = self.v.at(i).lex.strings[<int>LexStr_orig]
+        cdef bytes utf8_string = self.lex.at(i).strings[<int>LexStr_orig]
         cdef unicode string = utf8_string.decode('utf8')
         return string

     cpdef unicode norm(self, size_t i):
-        cdef bytes utf8_string = self.v.at(i).lex.strings[<int>LexStr_norm]
+        cdef bytes utf8_string = self.lex.at(i).strings[<int>LexStr_norm]
         cdef unicode string = utf8_string.decode('utf8')
         return string

     cpdef unicode shape(self, size_t i):
-        return lexeme_get_string(self.v.at(i).lex, LexStr_shape)
+        return lexeme_get_string(self.lex.at(i), LexStr_shape)

     cpdef unicode unsparse(self, size_t i):
-        return lexeme_get_string(self.v.at(i).lex, LexStr_unsparse)
+        return lexeme_get_string(self.lex.at(i), LexStr_unsparse)

     cpdef unicode asciied(self, size_t i):
-        return lexeme_get_string(self.v.at(i).lex, LexStr_asciied)
+        return lexeme_get_string(self.lex.at(i), LexStr_asciied)

     cpdef bint is_alpha(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_alpha)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_alpha)

     cpdef bint is_ascii(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_ascii)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_ascii)

     cpdef bint is_digit(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_digit)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_digit)

     cpdef bint is_lower(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_lower)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_lower)

     cpdef bint is_punct(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_punct)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_punct)

     cpdef bint is_space(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_space)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_space)

     cpdef bint is_title(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_title)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_title)

     cpdef bint is_upper(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_upper)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_upper)

     cpdef bint can_adj(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_adj)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_adj)

     cpdef bint can_adp(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_adp)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_adp)

     cpdef bint can_adv(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_adv)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_adv)

     cpdef bint can_conj(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_conj)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_conj)

     cpdef bint can_det(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_det)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_det)

     cpdef bint can_noun(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_noun)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_noun)

     cpdef bint can_num(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_num)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_num)

     cpdef bint can_pdt(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_pdt)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_pdt)

     cpdef bint can_pos(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_pos)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_pos)

     cpdef bint can_pron(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_pron)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_pron)

     cpdef bint can_prt(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_prt)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_prt)

     cpdef bint can_punct(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_punct)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_punct)

     cpdef bint can_verb(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_verb)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_verb)

     cpdef bint oft_lower(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_lower)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_lower)

     cpdef bint oft_title(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_title)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_title)

     cpdef bint oft_upper(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_upper)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_upper)
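All of the accessors above now index into the parallel arrays, and string(i) is an alias for orig(i). Usage sketch against the English instance from the tests below (illustration only):

    from spacy.en import EN

    tokens = EN.tokenize(u'The year: 1984.')
    assert len(tokens) == 5
    assert tokens.orig(3) == u'1984'      # tokens.string(3) returns the same text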
@@ -4,3 +4,5 @@ ctypedef uint64_t hash_t
 ctypedef char* utf8_t
 ctypedef uint64_t flag_t
 ctypedef uintptr_t id_t
+
+
@@ -1,7 +1,7 @@
 import os
 from os import path
 import codecs
-import json
+import ujson
 import re

 DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
@@ -16,28 +16,36 @@ def read_lang_data(name):
     tokenization = read_tokenization(data_dir)
     prefix = read_prefix(data_dir)
     suffix = read_suffix(data_dir)
+    infix = read_infix(data_dir)

     lex_loc = path.join(data_dir, 'lexemes.json')
     if path.exists(lex_loc):
         with open(lex_loc) as file_:
             lexemes = ujson.load(file_)
     else:
-        lexemes = []
-    return tokenization, prefix, suffix, lexemes
+        lexemes = {}
+    return tokenization, prefix, suffix, infix, lexemes


 def read_prefix(data_dir):
     with utf8open(path.join(data_dir, 'prefix')) as file_:
         entries = file_.read().split('\n')
-        expression = '|'.join(['^' + re.escape(piece) for piece in entries])
+        expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
     return expression

 def read_suffix(data_dir):
     with utf8open(path.join(data_dir, 'suffix')) as file_:
         entries = file_.read().split('\n')
-        expression = '|'.join([re.escape(piece) + '$' for piece in entries])
+        expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece.strip()])
     return expression

+def read_infix(data_dir):
+    with utf8open(path.join(data_dir, 'infix')) as file_:
+        entries = file_.read().split('\n')
+        expression = '|'.join([piece for piece in entries if piece.strip()])
+    return expression
+

 def read_tokenization(lang):
     loc = path.join(DATA_DIR, lang, 'tokenization')
     entries = []
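read_prefix() and read_suffix() escape each entry and anchor it to the start or end of the span, while the new read_infix() joins its entries verbatim, so the infix file can hold full regular expressions. A self-contained illustration with made-up entries (the real ones live under the language's data directory):

    import re

    prefix_entries = [u'(', u'"', u"'"]
    suffix_entries = [u')', u'"', u'.', u',']
    infix_entries = [u'\\.\\.\\.', u'(?<=[a-z])\\.(?=[A-Z])', u'-']

    prefix_re = re.compile(u'|'.join(u'^' + re.escape(p) for p in prefix_entries))
    suffix_re = re.compile(u'|'.join(re.escape(s) + u'$' for s in suffix_entries))
    infix_re = re.compile(u'|'.join(infix_entries))   # entries are already regexes

    assert prefix_re.search(u'"Hello').group() == u'"'
    assert suffix_re.search(u'Hello.').group() == u'.'
    assert infix_re.search(u'6,000-year').start() == 5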
@@ -60,3 +68,16 @@ def read_tokenization(lang):
         seen.add(chunk)
         entries.append((chunk, pieces))
     return entries
+
+
+def align_tokens(ref, indices):
+    start = 0
+    queue = list(indices)
+    for token in ref:
+        end = start + len(token)
+        emit = []
+        while queue and queue[0][1] <= end:
+            emit.append(queue.pop(0))
+        yield token, emit
+        start = end
+    assert not queue
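align_tokens(), added above, walks a reference tokenization and yields, for each reference token, the (start, end) spans from another tokenization of the same text that fall inside it. The definition is repeated here so the usage example is self-contained:

    def align_tokens(ref, indices):
        # Same logic as the new util.align_tokens() above.
        start = 0
        queue = list(indices)
        for token in ref:
            end = start + len(token)
            emit = []
            while queue and queue[0][1] <= end:
                emit.append(queue.pop(0))
            yield token, emit
            start = end
        assert not queue

    # One reference token u'cannot' covering two finer-grained spans:
    ref = [u'cannot', u'!']
    spans = [(0, 3), (3, 6), (6, 7)]          # u'can', u'not', u'!'
    assert list(align_tokens(ref, spans)) == [
        (u'cannot', [(0, 3), (3, 6)]),
        (u'!', [(6, 7)]),
    ]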
@@ -7,20 +7,20 @@ from spacy.lexeme import *


 def test_is_alpha():
-    the = EN.lookup('the')
+    the = EN.lexicon.lookup('the')
     assert the.check_orth_flag(LexOrth_alpha)
-    year = EN.lookup('1999')
+    year = EN.lexicon.lookup('1999')
     assert not year.check_orth_flag(LexOrth_alpha)
-    mixed = EN.lookup('hello1')
+    mixed = EN.lexicon.lookup('hello1')
     assert not mixed.check_orth_flag(LexOrth_alpha)


 def test_is_digit():
-    the = EN.lookup('the')
+    the = EN.lexicon.lookup('the')
     assert not the.check_orth_flag(LexOrth_digit)
-    year = EN.lookup('1999')
+    year = EN.lexicon.lookup('1999')
     assert year.check_orth_flag(LexOrth_digit)
-    mixed = EN.lookup('hello1')
+    mixed = EN.lexicon.lookup('hello1')
     assert not mixed.check_orth_flag(LexOrth_digit)

@@ -9,7 +9,7 @@ from spacy.lexeme import *

 @pytest.fixture
 def C3P0():
-    return EN.lookup("C3P0")
+    return EN.lexicon.lookup("C3P0")


 def test_shape(C3P0):
@@ -17,11 +17,11 @@ def test_shape(C3P0):


 def test_length():
-    t = EN.lookup('the')
+    t = EN.lexicon.lookup('the')
     assert t.length == 3
-    t = EN.lookup("n't")
+    t = EN.lexicon.lookup("n't")
     assert t.length == 3
-    t = EN.lookup("'s")
+    t = EN.lexicon.lookup("'s")
     assert t.length == 2
-    t = EN.lookup('Xxxx')
+    t = EN.lexicon.lookup('Xxxx')
     assert t.length == 4
@@ -27,7 +27,7 @@ def test_punct():

 def test_digits():
     lex_ids = EN.tokenize('The year: 1984.')
-    assert lex_ids.string(3) == "1984"
+    assert lex_ids.orig(3) == "1984"
     assert len(lex_ids) == 5
     assert lex_ids[0].string == EN.lexicon.lookup('The').string
     assert lex_ids[3].string == EN.lexicon.lookup('1984').string
@@ -101,4 +101,4 @@ def test_cnts6():
 def test_cnts7():
     text = 'But then the 6,000-year ice age came...'
     tokens = EN.tokenize(text)
-    assert len(tokens) == 8
+    assert len(tokens) == 10
@@ -4,31 +4,31 @@ from spacy.en import EN


 def test_neq():
-    addr = EN.lookup('Hello')
-    assert EN.lookup('bye').string != addr.string
+    addr = EN.lexicon.lookup('Hello')
+    assert EN.lexicon.lookup('bye').string != addr.string


 def test_eq():
-    addr = EN.lookup('Hello')
-    assert EN.lookup('Hello').string == addr.string
+    addr = EN.lexicon.lookup('Hello')
+    assert EN.lexicon.lookup('Hello').string == addr.string


 def test_round_trip():
-    hello = EN.lookup('Hello')
+    hello = EN.lexicon.lookup('Hello')
     assert hello.string == 'Hello'


 def test_case_neq():
-    addr = EN.lookup('Hello')
-    assert EN.lookup('hello').string != addr.string
+    addr = EN.lexicon.lookup('Hello')
+    assert EN.lexicon.lookup('hello').string != addr.string


 def test_punct_neq():
-    addr = EN.lookup('Hello')
-    assert EN.lookup('Hello,').string != addr.string
+    addr = EN.lexicon.lookup('Hello')
+    assert EN.lexicon.lookup('Hello,').string != addr.string


 def test_short():
-    addr = EN.lookup('I')
+    addr = EN.lexicon.lookup('I')
     assert addr.string == 'I'
     assert addr.string != 'not'