* Add offsets to Tokens class. Some changes to interfaces, and reorganization of spacy.Lang

This commit is contained in:
Matthew Honnibal 2014-10-14 15:47:06 +11:00
parent 2805068ca8
commit 6fb42c4919
11 changed files with 193 additions and 183 deletions
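The headline change is that Tokens now records a character offset for every token: the single vector of Token structs is replaced by parallel lex/idx/pos vectors, and push_back returns the offset just past the token it appended, so the running position can be threaded through extend(). A minimal pure-Python sketch of that bookkeeping, with illustrative names rather than the real Cython API:

class TokensSketch:
    def __init__(self):
        self.lex = []   # lexemes (plain strings here)
        self.idx = []   # character offset of each token in the input
        self.pos = []   # POS slots, initialised to 0 as in the diff

    def push_back(self, idx, lexeme):
        # Append one token and return the offset just past it.
        self.lex.append(lexeme)
        self.idx.append(idx)
        self.pos.append(0)
        return idx + len(lexeme)

    def extend(self, idx, lexemes):
        # Append several tokens, starting at character offset idx.
        for lexeme in lexemes:
            idx = self.push_back(idx, lexeme)
        return idx

tokens = TokensSketch()
i = tokens.extend(0, ["Hello", ","])   # "Hello," starts at offset 0
tokens.extend(i + 1, ["world"])        # + 1 skips the space
assert tokens.idx == [0, 5, 7]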

View File

@@ -1,20 +1,21 @@
from libc.stdint cimport uint32_t
from libc.stdint cimport uint64_t
from spacy.word cimport Lexeme
from spacy.tokens cimport Tokens
from spacy.lexeme cimport LexemeC
from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool
from libcpp.utility cimport pair
from libcpp.vector cimport vector
from libc.stdint cimport uint64_t, int64_t
from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool
from .word cimport Lexeme
from .tokens cimport Tokens
from .lexeme cimport LexemeC
cdef extern from "Python.h":
cdef bint Py_UNICODE_ISSPACE(Py_UNICODE ch)
cdef bint Py_UNICODE_ISALNUM(Py_UNICODE ch)
cdef bint Py_UNICODE_ISALPHA(Py_UNICODE ch)
cdef bint Py_UNICODE_ISUPPER(Py_UNICODE ch)
cdef struct String:
@@ -24,7 +25,7 @@ cdef struct String:
cdef class Lexicon:
cdef Pool _mem
cdef Pool mem
cpdef readonly size_t size
cdef vector[LexemeC*] lexemes
@@ -37,7 +38,6 @@ cdef class Lexicon:
cdef list _string_features
cdef list _flag_features
cdef class Language:
cdef Pool _mem
cdef unicode name
@@ -47,19 +47,17 @@ cdef class Language:
cdef object prefix_re
cdef object suffix_re
cdef object infix_re
cpdef Tokens tokenize(self, unicode text)
cpdef Lexeme lookup(self, unicode text)
cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1
cdef int _split_body_token(self, vector[LexemeC*] *tokens, String* string) except -1
cdef int _find_prefix(self, Py_UNICODE* characters, size_t length)
cdef int _find_suffix(self, Py_UNICODE* characters, size_t length)
cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes) except -1
cdef int _tokenize(self, Tokens tokens, Py_UNICODE* chars, int start, int end) except -1
cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes) except NULL
cdef int _attach_tokens(self, Tokens tokens, int idx, String* string,
vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1
cdef int _save_cached(self, vector[LexemeC*] *tokens, uint64_t key, size_t n) except -1

View File

@@ -14,9 +14,9 @@ from os import path
import re
from .util import read_lang_data
from spacy.tokens import Tokens
from spacy.lexeme cimport LexemeC, get_lexeme_dict, lexeme_pack, lexeme_unpack
from spacy.lexeme cimport LexStr_orig
from .tokens import Tokens
from .lexeme cimport LexemeC, get_lexeme_dict, lexeme_pack, lexeme_unpack
from .lexeme cimport LexStr_orig
from murmurhash.mrmr cimport hash64
from cpython.ref cimport Py_INCREF
@@ -41,23 +41,13 @@ cdef class Language:
self._mem = Pool()
self.cache = PreshMap(2 ** 25)
self.specials = PreshMap(2 ** 16)
rules, prefix, suffix, lexemes = util.read_lang_data(name)
rules, prefix, suffix, infix, lexemes = util.read_lang_data(name)
self.prefix_re = re.compile(prefix)
self.suffix_re = re.compile(suffix)
self.infix_re = re.compile(infix)
self.lexicon = Lexicon(lexemes)
self._load_special_tokenization(rules)
cpdef Lexeme lookup(self, unicode string):
"""Retrieve (or create, if not found) a Lexeme for a string, and return it.
Args:
string (unicode): The string to be looked up. Must be unicode, not bytes.
Returns:
lexeme (Lexeme): A reference to a lexical type.
"""
return self.lexicon.lookup(string)
cpdef Tokens tokenize(self, unicode string):
"""Tokenize a string.
@@ -73,37 +63,43 @@ cdef class Language:
Returns:
tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
"""
cdef size_t length = len(string)
cdef int length = len(string)
cdef Tokens tokens = Tokens(length)
if length == 0:
return tokens
cdef size_t start = 0
cdef size_t i = 0
cdef int start = 0
cdef int i = 0
cdef Py_UNICODE* chars = string
cdef String span
for i in range(length):
if Py_UNICODE_ISSPACE(chars[i]) == 1:
if start < i:
string_from_slice(&span, chars, start, i)
if not _extend_from_map(tokens.v, &span, self.cache):
self._tokenize(tokens.v, &span)
self._tokenize(tokens, chars, start, i)
start = i + 1
i += 1
if start < i:
string_from_slice(&span, chars, start, i)
if not _extend_from_map(tokens.v, &span, self.cache):
self._tokenize(tokens.v, &span)
self._tokenize(tokens, chars, start, i)
return tokens
cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1:
cdef size_t i
cdef uint64_t orig_key = string.key
cdef size_t orig_size = tokens_v.size()
cdef int _tokenize(self, Tokens tokens, Py_UNICODE* chars, int start, int end) except -1:
cdef String span
cdef vector[LexemeC*] prefixes
cdef vector[LexemeC*] suffixes
cdef uint64_t orig_key
cdef int orig_size
string_slice(&span, chars, start, end)
lexemes = <LexemeC**>self.cache.get(span.key)
if lexemes != NULL:
tokens.extend(start, lexemes, 0)
else:
orig_key = span.key
orig_size = tokens.lex.size()
span = self._split_affixes(&span, &prefixes, &suffixes)[0]
self._attach_tokens(tokens, start, &span, &prefixes, &suffixes)
self._save_cached(&tokens.lex, orig_key, orig_size)
cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes) except NULL:
cdef size_t i
cdef String prefix
cdef String suffix
cdef String minus_pre
@@ -113,8 +109,8 @@ cdef class Language:
last_size = string.n
pre_len = self._find_prefix(string.chars, string.n)
if pre_len != 0:
string_from_slice(&prefix, string.chars, 0, pre_len)
string_from_slice(&minus_pre, string.chars, pre_len, string.n)
string_slice(&prefix, string.chars, 0, pre_len)
string_slice(&minus_pre, string.chars, pre_len, string.n)
# Check whether we've hit a special-case
if minus_pre.n >= 1 and self.specials.get(minus_pre.key) != NULL:
string = &minus_pre
@@ -122,16 +118,15 @@ cdef class Language:
break
suf_len = self._find_suffix(string.chars, string.n)
if suf_len != 0:
string_from_slice(&suffix, string.chars, string.n - suf_len, string.n)
string_from_slice(&minus_suf, string.chars, 0, string.n - suf_len)
string_slice(&suffix, string.chars, string.n - suf_len, string.n)
string_slice(&minus_suf, string.chars, 0, string.n - suf_len)
# Check whether we've hit a special-case
if minus_suf.n >= 1 and self.specials.get(minus_suf.key) != NULL:
string = &minus_suf
suffixes.push_back(self.lexicon.get(&suffix))
break
if pre_len and suf_len and (pre_len + suf_len) <= string.n:
string_from_slice(string, string.chars, pre_len, string.n - suf_len)
string_slice(string, string.chars, pre_len, string.n - suf_len)
prefixes.push_back(self.lexicon.get(&prefix))
suffixes.push_back(self.lexicon.get(&suffix))
elif pre_len:
@@ -140,26 +135,37 @@ cdef class Language:
elif suf_len:
string = &minus_suf
suffixes.push_back(self.lexicon.get(&suffix))
if self.specials.get(string.key):
break
return string
self._attach_tokens(tokens_v, string, &prefixes, &suffixes)
self._save_cached(tokens_v, orig_key, orig_size)
cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
cdef int _attach_tokens(self, Tokens tokens,
int idx, String* string,
vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes) except -1:
cdef size_t i
cdef int split
cdef LexemeC** lexemes
cdef LexemeC* lexeme
for lexeme in deref(prefixes):
tokens.push_back(lexeme)
if not _extend_from_map(tokens, string, self.specials):
self._split_body_token(tokens, string)
cdef String span
idx = tokens.extend(idx, prefixes.data(), prefixes.size())
if string.n != 0:
lexemes = <LexemeC**>self.cache.get(string.key)
if lexemes != NULL:
idx = tokens.extend(idx, lexemes, 0)
else:
split = self._find_infix(string.chars, string.n)
if split == 0 or split == -1:
idx = tokens.push_back(idx, self.lexicon.get(string))
else:
string_slice(&span, string.chars, 0, split)
idx = tokens.push_back(idx, self.lexicon.get(&span))
string_slice(&span, string.chars, split, split+1)
idx = tokens.push_back(idx, self.lexicon.get(&span))
string_slice(&span, string.chars, split + 1, string.n)
idx = tokens.push_back(idx, self.lexicon.get(&span))
cdef vector[LexemeC*].reverse_iterator it = suffixes.rbegin()
while it != suffixes.rend():
tokens.push_back(deref(it))
idx = tokens.push_back(idx, deref(it))
preinc(it)
cdef int _save_cached(self, vector[LexemeC*] *tokens, uint64_t key, size_t n) except -1:
@@ -171,15 +177,17 @@ cdef class Language:
lexemes[i + 1] = NULL
self.cache.set(key, lexemes)
cdef int _split_body_token(self, vector[LexemeC*] *tokens, String* string) except -1:
tokens.push_back(self.lexicon.get(string))
cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1:
cdef unicode string = chars[:length]
match = self.infix_re.search(string)
return match.start() if match is not None else 0
cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
cdef unicode string = chars[:length]
match = self.prefix_re.search(string)
return (match.end() - match.start()) if match is not None else 0
cdef int _find_suffix(self, Py_UNICODE* chars, size_t length):
cdef int _find_suffix(self, Py_UNICODE* chars, size_t length) except -1:
cdef unicode string = chars[:length]
match = self.suffix_re.search(string)
return (match.end() - match.start()) if match is not None else 0
@@ -212,27 +220,30 @@ cdef class Language:
cdef class Lexicon:
def __cinit__(self, lexemes):
self._mem = Pool()
self.mem = Pool()
self._dict = PreshMap(2 ** 20)
self.size = 0
cdef String string
cdef dict lexeme_dict
cdef LexemeC* lexeme
for lexeme_dict in lexemes:
string_from_unicode(&string, lexeme_dict['string'])
lexeme = <LexemeC*>self._mem.alloc(1, sizeof(LexemeC))
for py_string, lexeme_dict in lexemes.iteritems():
string_from_unicode(&string, py_string)
lexeme = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
lexeme_unpack(lexeme, lexeme_dict)
self._dict.set(string.key, lexeme)
self.lexemes.push_back(lexeme)
self.size += 1
def __getitem__(self, size_t i):
return Lexeme(<size_t>self.lexemes.at(i))
cdef LexemeC* get(self, String* string) except NULL:
cdef LexemeC* lex
lex = <LexemeC*>self._dict.get(string.key)
if lex != NULL:
return lex
lex = <LexemeC*>self._mem.alloc(1, sizeof(LexemeC))
lex = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
cdef unicode unicode_string = string.chars[:string.n]
lexeme_unpack(lex, get_lexeme_dict(self.size, unicode_string))
self._dict.set(string.key, lex)
@@ -255,38 +266,12 @@ cdef class Lexicon:
return Lexeme(<size_t>lexeme)
cdef int _extend_from_map(vector[LexemeC*] *tokens, String* string, PreshMap map_) except -1:
if string.n == 0:
return 1
lexemes = <LexemeC**>map_.get(string.key)
if lexemes == NULL:
return 0
cdef size_t i = 0
while lexemes[i] != NULL:
tokens.push_back(lexemes[i])
i += 1
return 1
cdef void string_from_unicode(String* s, unicode uni):
cdef Py_UNICODE* c_uni = <Py_UNICODE*>uni
string_from_slice(s, c_uni, 0, len(uni))
string_slice(s, c_uni, 0, len(uni))
cdef inline void string_from_slice(String* s, Py_UNICODE* chars, size_t start, size_t end) nogil:
cdef inline void string_slice(String* s, Py_UNICODE* chars, size_t start, size_t end) nogil:
s.chars = &chars[start]
s.n = end - start
s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
cdef inline void string_slice_prefix(String* s, String* prefix, size_t n) nogil:
string_from_slice(prefix, s.chars, 0, n)
s.chars += n
s.n -= n
s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
cdef inline void string_slice_suffix(String* s, String* suffix, size_t n) nogil:
string_from_slice(suffix, s.chars, s.n - n, s.n)
s.n -= n
s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
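The reorganised tokenizer above peels affixes off each whitespace-delimited chunk, consults the cache and specials maps, and splits whatever body remains on an infix match before attaching prefixes, body, and suffixes in order. A rough pure-Python sketch of that splitting order, assuming edge-anchored prefix/suffix patterns and a single-character infix split (as the split / split + 1 slicing in _attach_tokens suggests); caching and special cases are left out:

import re

prefix_re = re.compile(r"^[(\"']")      # assumed patterns, for illustration only
suffix_re = re.compile(r"[)\"',.]$")
infix_re = re.compile(r"[-~]")

def split_token(string):
    prefixes, suffixes = [], []
    while string:
        pre = prefix_re.search(string)
        if pre is not None:
            prefixes.append(pre.group())
            string = string[pre.end():]
            continue
        suf = suffix_re.search(string)
        if suf is not None:
            suffixes.insert(0, suf.group())   # mirrors the reverse-iterator attach
            string = string[:suf.start()]
            continue
        break
    if not string:
        body = []
    else:
        infix = infix_re.search(string)
        if infix is None:
            body = [string]
        else:
            i = infix.start()
            body = [string[:i], string[i:i + 1], string[i + 1:]]
    return prefixes + body + suffixes

assert split_token('("ice-age",') == ['(', '"', 'ice', '-', 'age', '"', ',']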

View File

@@ -79,7 +79,7 @@ def canon_case(string, prob, cluster, case_stats, tag_stats):
def word_shape(string, *args):
length = len(string)
shape = ""
shape = []
last = ""
shape_char = ""
seq = 0
@@ -99,8 +99,8 @@ def word_shape(string, *args):
seq = 0
last = shape_char
if seq < 5:
shape += shape_char
return shape
shape.append(shape_char)
return ''.join(shape)
def non_sparse(string, prob, cluster, case_stats, tag_stats):
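The only change to word_shape above is building the shape as a list and joining it once at the end, instead of growing a string by repeated concatenation. For context, a hedged reconstruction of what the function computes, assuming the usual mapping (upper -> 'X', lower -> 'x', digit -> 'd', other characters kept as-is) and the run cap visible in the diff:

def word_shape(string):
    shape = []
    last = ""
    seq = 0
    for char in string:
        if char.isalpha():
            shape_char = "X" if char.isupper() else "x"
        elif char.isdigit():
            shape_char = "d"
        else:
            shape_char = char
        if shape_char == last:
            seq += 1
        else:
            seq = 0
        last = shape_char
        if seq < 5:                 # cap long runs of the same shape character
            shape.append(shape_char)
    return ''.join(shape)

assert word_shape("C3P0") == "XdXd"
assert word_shape("Honnibal") == "Xxxxxx"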

View File

@@ -2,14 +2,10 @@ from spacy.lexeme cimport LexemeC
from libcpp.vector cimport vector
cdef struct Token:
int i
int pos
LexemeC* lex
cdef class Tokens:
cdef vector[Token] v
cdef vector[LexemeC*] lex
cdef vector[int] idx
cdef vector[int] pos
cdef int extend(self, int i, LexemeC** lexemes, int n) except -1
cdef int push_back(self, int i, LexemeC* lexeme) except -1
@@ -21,6 +17,7 @@ cdef class Tokens:
cpdef bint check_dist_flag(self, size_t i, size_t flag_id) except *
cpdef unicode string_view(self, size_t i, size_t view_id)
cpdef unicode string(self, size_t i)
cpdef unicode orig(self, size_t i)
cpdef unicode norm(self, size_t i)
cpdef unicode shape(self, size_t i)

View File

@@ -25,17 +25,20 @@ cdef class Tokens:
"""
def __cinit__(self, string_length=0):
size = int(string_length / 3) if string_length >= 3 else 1
self.v = vector[Token]()
self.v.reserve(size)
self.lex.reserve(size)
self.idx.reserve(size)
self.pos.reserve(size)
def __getitem__(self, i):
return Lexeme(<size_t>self.v.at(i).lex)
return Lexeme(<size_t>self.lex.at(i))
def __len__(self):
return self.v.size()
return self.lex.size()
cdef int push_back(self, int idx, LexemeC* lexeme) except -1:
self.v.push_back(Token(idx, 0, lexeme))
self.lex.push_back(lexeme)
self.idx.push_back(idx)
self.pos.push_back(0)
return idx + lexeme.ints[<int>LexInt_length]
cdef int extend(self, int idx, LexemeC** lexemes, int n) except -1:
@@ -46,120 +49,124 @@ cdef class Tokens:
i = 0
while lexemes[i] != NULL:
idx = self.push_back(idx, lexemes[i])
i += 1
else:
for i in range(n):
idx = self.push_back(idx, lexemes[i])
return idx
cpdef int id(self, size_t i) except -1:
return self.v.at(i).lex.ints[<int>LexInt_id]
return self.lex.at(i).ints[<int>LexInt_id]
cpdef float prob(self, size_t i) except 1:
return self.v.at(i).lex.floats[<int>LexFloat_prob]
return self.lex.at(i).floats[<int>LexFloat_prob]
cpdef int cluster(self, size_t i) except *:
return self.v.at(i).lex.ints[<int>LexInt_cluster]
return self.lex.at(i).ints[<int>LexInt_cluster]
cpdef bint check_orth_flag(self, size_t i, size_t flag_id) except *:
return lexeme_check_orth_flag(self.v.at(i).lex, flag_id)
return lexeme_check_orth_flag(self.lex.at(i), flag_id)
cpdef bint check_dist_flag(self, size_t i, size_t flag_id) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, flag_id)
return lexeme_check_dist_flag(self.lex.at(i), flag_id)
cpdef unicode string_view(self, size_t i, size_t view_id):
return lexeme_get_string(self.v.at(i).lex, view_id)
return lexeme_get_string(self.lex.at(i), view_id)
# Provide accessor methods for the features supported by the language.
# Without these, clients have to use the underlying string_view and check_flag
# methods, which requires them to know the IDs.
cpdef unicode string(self, size_t i):
return self.orig(i)
cpdef unicode orig(self, size_t i):
cdef bytes utf8_string = self.v.at(i).lex.strings[<int>LexStr_orig]
cdef bytes utf8_string = self.lex.at(i).strings[<int>LexStr_orig]
cdef unicode string = utf8_string.decode('utf8')
return string
cpdef unicode norm(self, size_t i):
cdef bytes utf8_string = self.v.at(i).lex.strings[<int>LexStr_norm]
cdef bytes utf8_string = self.lex.at(i).strings[<int>LexStr_norm]
cdef unicode string = utf8_string.decode('utf8')
return string
cpdef unicode shape(self, size_t i):
return lexeme_get_string(self.v.at(i).lex, LexStr_shape)
return lexeme_get_string(self.lex.at(i), LexStr_shape)
cpdef unicode unsparse(self, size_t i):
return lexeme_get_string(self.v.at(i).lex, LexStr_unsparse)
return lexeme_get_string(self.lex.at(i), LexStr_unsparse)
cpdef unicode asciied(self, size_t i):
return lexeme_get_string(self.v.at(i).lex, LexStr_asciied)
return lexeme_get_string(self.lex.at(i), LexStr_asciied)
cpdef bint is_alpha(self, size_t i) except *:
return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_alpha)
return lexeme_check_orth_flag(self.lex.at(i), LexOrth_alpha)
cpdef bint is_ascii(self, size_t i) except *:
return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_ascii)
return lexeme_check_orth_flag(self.lex.at(i), LexOrth_ascii)
cpdef bint is_digit(self, size_t i) except *:
return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_digit)
return lexeme_check_orth_flag(self.lex.at(i), LexOrth_digit)
cpdef bint is_lower(self, size_t i) except *:
return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_lower)
return lexeme_check_orth_flag(self.lex.at(i), LexOrth_lower)
cpdef bint is_punct(self, size_t i) except *:
return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_punct)
return lexeme_check_orth_flag(self.lex.at(i), LexOrth_punct)
cpdef bint is_space(self, size_t i) except *:
return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_space)
return lexeme_check_orth_flag(self.lex.at(i), LexOrth_space)
cpdef bint is_title(self, size_t i) except *:
return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_title)
return lexeme_check_orth_flag(self.lex.at(i), LexOrth_title)
cpdef bint is_upper(self, size_t i) except *:
return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_upper)
return lexeme_check_orth_flag(self.lex.at(i), LexOrth_upper)
cpdef bint can_adj(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_adj)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_adj)
cpdef bint can_adp(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_adp)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_adp)
cpdef bint can_adv(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_adv)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_adv)
cpdef bint can_conj(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_conj)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_conj)
cpdef bint can_det(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_det)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_det)
cpdef bint can_noun(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_noun)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_noun)
cpdef bint can_num(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_num)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_num)
cpdef bint can_pdt(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_pdt)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_pdt)
cpdef bint can_pos(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_pos)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_pos)
cpdef bint can_pron(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_pron)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_pron)
cpdef bint can_prt(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_prt)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_prt)
cpdef bint can_punct(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_punct)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_punct)
cpdef bint can_verb(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_verb)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_verb)
cpdef bint oft_lower(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_lower)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_lower)
cpdef bint oft_title(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_title)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_title)
cpdef bint oft_upper(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_upper)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_upper)
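The comment in tokens.pyx above explains why the per-feature accessors exist: without them, callers would have to go through string_view and check_orth_flag and know the view and flag IDs themselves. A hedged illustration of the two equivalent access styles, using the English instance exercised in the tests below and assuming LexStr_shape and LexOrth_alpha are importable from spacy.lexeme (the tests' star-imports suggest they are):

from spacy.en import EN
from spacy.lexeme import LexStr_shape, LexOrth_alpha

tokens = EN.tokenize(u"Hello world")

# per-feature convenience accessor vs. the generic view/flag machinery
assert tokens.shape(0) == tokens.string_view(0, LexStr_shape)
assert tokens.is_alpha(0) == tokens.check_orth_flag(0, LexOrth_alpha)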

View File

@@ -4,3 +4,5 @@ ctypedef uint64_t hash_t
ctypedef char* utf8_t
ctypedef uint64_t flag_t
ctypedef uintptr_t id_t

View File

@@ -1,7 +1,7 @@
import os
from os import path
import codecs
import json
import ujson
import re
DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
@@ -16,28 +16,36 @@ def read_lang_data(name):
tokenization = read_tokenization(data_dir)
prefix = read_prefix(data_dir)
suffix = read_suffix(data_dir)
infix = read_infix(data_dir)
lex_loc = path.join(data_dir, 'lexemes.json')
if path.exists(lex_loc):
with open(lex_loc) as file_:
lexemes = ujson.load(file_)
else:
lexemes = []
return tokenization, prefix, suffix, lexemes
lexemes = {}
return tokenization, prefix, suffix, infix, lexemes
def read_prefix(data_dir):
with utf8open(path.join(data_dir, 'prefix')) as file_:
entries = file_.read().split('\n')
expression = '|'.join(['^' + re.escape(piece) for piece in entries])
expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
return expression
def read_suffix(data_dir):
with utf8open(path.join(data_dir, 'suffix')) as file_:
entries = file_.read().split('\n')
expression = '|'.join([re.escape(piece) + '$' for piece in entries])
expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece.strip()])
return expression
def read_infix(data_dir):
with utf8open(path.join(data_dir, 'infix')) as file_:
entries = file_.read().split('\n')
expression = '|'.join([piece for piece in entries if piece.strip()])
return expression
def read_tokenization(lang):
loc = path.join(DATA_DIR, lang, 'tokenization')
entries = []
@@ -60,3 +68,16 @@ def read_tokenization(lang):
seen.add(chunk)
entries.append((chunk, pieces))
return entries
def align_tokens(ref, indices):
start = 0
queue = list(indices)
for token in ref:
end = start + len(token)
emit = []
while queue and queue[0][1] <= end:
emit.append(queue.pop(0))
yield token, emit
start = end
assert not queue
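align_tokens is a new helper that pairs each reference token with the (start, end) spans that finish inside it. Since start advances by len(token) with no allowance for whitespace, the indices are evidently taken over the tokens' concatenated text. A hypothetical check, with the import path assumed from the surrounding file:

from spacy.util import align_tokens

# two spans collapse onto a single reference token
assert list(align_tokens(["cannot"], [(0, 3), (3, 6)])) == [("cannot", [(0, 3), (3, 6)])]

# one-to-one alignment
assert list(align_tokens(["can", "not"], [(0, 3), (3, 6)])) == [("can", [(0, 3)]), ("not", [(3, 6)])]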

View File

@@ -7,20 +7,20 @@ from spacy.lexeme import *
def test_is_alpha():
the = EN.lookup('the')
the = EN.lexicon.lookup('the')
assert the.check_orth_flag(LexOrth_alpha)
year = EN.lookup('1999')
year = EN.lexicon.lookup('1999')
assert not year.check_orth_flag(LexOrth_alpha)
mixed = EN.lookup('hello1')
mixed = EN.lexicon.lookup('hello1')
assert not mixed.check_orth_flag(LexOrth_alpha)
def test_is_digit():
the = EN.lookup('the')
the = EN.lexicon.lookup('the')
assert not the.check_orth_flag(LexOrth_digit)
year = EN.lookup('1999')
year = EN.lexicon.lookup('1999')
assert year.check_orth_flag(LexOrth_digit)
mixed = EN.lookup('hello1')
mixed = EN.lexicon.lookup('hello1')
assert not mixed.check_orth_flag(LexOrth_digit)

View File

@@ -9,7 +9,7 @@ from spacy.lexeme import *
@pytest.fixture
def C3P0():
return EN.lookup("C3P0")
return EN.lexicon.lookup("C3P0")
def test_shape(C3P0):
@@ -17,11 +17,11 @@ def test_shape(C3P0):
def test_length():
t = EN.lookup('the')
t = EN.lexicon.lookup('the')
assert t.length == 3
t = EN.lookup("n't")
t = EN.lexicon.lookup("n't")
assert t.length == 3
t = EN.lookup("'s")
t = EN.lexicon.lookup("'s")
assert t.length == 2
t = EN.lookup('Xxxx')
t = EN.lexicon.lookup('Xxxx')
assert t.length == 4

View File

@@ -27,7 +27,7 @@ def test_punct():
def test_digits():
lex_ids = EN.tokenize('The year: 1984.')
assert lex_ids.string(3) == "1984"
assert lex_ids.orig(3) == "1984"
assert len(lex_ids) == 5
assert lex_ids[0].string == EN.lexicon.lookup('The').string
assert lex_ids[3].string == EN.lexicon.lookup('1984').string
@@ -101,4 +101,4 @@ def test_cnts6():
def test_cnts7():
text = 'But then the 6,000-year ice age came...'
tokens = EN.tokenize(text)
assert len(tokens) == 8
assert len(tokens) == 10

View File

@@ -4,31 +4,31 @@ from spacy.en import EN
def test_neq():
addr = EN.lookup('Hello')
assert EN.lookup('bye').string != addr.string
addr = EN.lexicon.lookup('Hello')
assert EN.lexicon.lookup('bye').string != addr.string
def test_eq():
addr = EN.lookup('Hello')
assert EN.lookup('Hello').string == addr.string
addr = EN.lexicon.lookup('Hello')
assert EN.lexicon.lookup('Hello').string == addr.string
def test_round_trip():
hello = EN.lookup('Hello')
hello = EN.lexicon.lookup('Hello')
assert hello.string == 'Hello'
def test_case_neq():
addr = EN.lookup('Hello')
assert EN.lookup('hello').string != addr.string
addr = EN.lexicon.lookup('Hello')
assert EN.lexicon.lookup('hello').string != addr.string
def test_punct_neq():
addr = EN.lookup('Hello')
assert EN.lookup('Hello,').string != addr.string
addr = EN.lexicon.lookup('Hello')
assert EN.lexicon.lookup('Hello,').string != addr.string
def test_short():
addr = EN.lookup('I')
addr = EN.lexicon.lookup('I')
assert addr.string == 'I'
assert addr.string != 'not'