* Add offsets to Tokens class. Some changes to interfaces, and reorganization of spacy.Lang

Matthew Honnibal, 2014-10-14 15:47:06 +11:00
commit 6fb42c4919 (parent 2805068ca8)
11 changed files with 193 additions and 183 deletions
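
The headline change: Tokens now keeps three parallel vectors (lex, idx, pos) instead of a single vector of Token structs, and push_back returns the character offset just past the token so the tokenizer can thread offsets through. A plain-Python sketch of that bookkeeping (illustrative names, not the Cython API in the diffs below):

# Plain-Python sketch of the parallel-array layout this commit introduces.
class TokensSketch:
    def __init__(self):
        self.lex = []   # one lexeme per token (here just the token string)
        self.idx = []   # character offset where each token starts
        self.pos = []   # POS slot, 0 until a tagger fills it in

    def push_back(self, idx, lexeme):
        self.lex.append(lexeme)
        self.idx.append(idx)
        self.pos.append(0)
        return idx + len(lexeme)   # offset just past this token, as in the Cython push_back

# The tokenizer threads the returned offset into the next call:
toks = TokensSketch()
i = 0
for word in ['Hello', ',', 'world']:
    i = toks.push_back(i, word)
assert toks.idx == [0, 5, 6]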

View File

@@ -1,20 +1,21 @@
 from libc.stdint cimport uint32_t
 from libc.stdint cimport uint64_t
-from spacy.word cimport Lexeme
-from spacy.tokens cimport Tokens
-from spacy.lexeme cimport LexemeC
-from preshed.maps cimport PreshMap
-from cymem.cymem cimport Pool
-from libcpp.utility cimport pair
 from libcpp.vector cimport vector
 from libc.stdint cimport uint64_t, int64_t
+from preshed.maps cimport PreshMap
+from cymem.cymem cimport Pool
+from .word cimport Lexeme
+from .tokens cimport Tokens
+from .lexeme cimport LexemeC

 cdef extern from "Python.h":
     cdef bint Py_UNICODE_ISSPACE(Py_UNICODE ch)
     cdef bint Py_UNICODE_ISALNUM(Py_UNICODE ch)
+    cdef bint Py_UNICODE_ISALPHA(Py_UNICODE ch)
+    cdef bint Py_UNICODE_ISUPPER(Py_UNICODE ch)

 cdef struct String:
@@ -24,7 +25,7 @@ cdef struct String:

 cdef class Lexicon:
-    cdef Pool _mem
+    cdef Pool mem
     cpdef readonly size_t size

     cdef vector[LexemeC*] lexemes
@@ -37,7 +38,6 @@ cdef class Lexicon:
     cdef list _string_features
     cdef list _flag_features

 cdef class Language:
     cdef Pool _mem
     cdef unicode name
@@ -47,19 +47,17 @@ cdef class Language:
     cdef object prefix_re
     cdef object suffix_re
+    cdef object infix_re

     cpdef Tokens tokenize(self, unicode text)
-    cpdef Lexeme lookup(self, unicode text)

-    cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1
-    cdef int _split_body_token(self, vector[LexemeC*] *tokens, String* string) except -1
-    cdef int _find_prefix(self, Py_UNICODE* characters, size_t length)
-    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length)
-    cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
-                            vector[LexemeC*] *prefixes,
-                            vector[LexemeC*] *suffixes) except -1
+    cdef int _tokenize(self, Tokens tokens, Py_UNICODE* chars, int start, int end) except -1
+    cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
+                                vector[LexemeC*] *suffixes) except NULL
+    cdef int _attach_tokens(self, Tokens tokens, int idx, String* string,
+                            vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
+    cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
+    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
+    cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1
     cdef int _save_cached(self, vector[LexemeC*] *tokens, uint64_t key, size_t n) except -1
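
The declarations above replace the old _tokenize/_split_body_token pair with an affix-splitting pipeline, and the _find_prefix/_find_suffix/_find_infix helpers are thin regex probes over a span. Roughly, in plain Python (the patterns below are stand-ins, not the real prefix/suffix/infix data):

import re

# Stand-in patterns; the real ones are assembled from the language's prefix,
# suffix and infix data files by read_lang_data().
prefix_re = re.compile(r'^[\(\[]')
suffix_re = re.compile(r'[\)\].,;]$')
infix_re = re.compile(r'[-~]')

def find_prefix(chars):
    m = prefix_re.search(chars)
    return (m.end() - m.start()) if m is not None else 0

def find_suffix(chars):
    m = suffix_re.search(chars)
    return (m.end() - m.start()) if m is not None else 0

def find_infix(chars):
    m = infix_re.search(chars)
    return m.start() if m is not None else 0

assert find_prefix('(Hello') == 1        # one affix character peeled per loop iteration
assert find_suffix('Hello.') == 1
assert find_infix('6,000-year') == 5     # position of the hyphen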

View File

@@ -14,9 +14,9 @@ from os import path
 import re

 from .util import read_lang_data
-from spacy.tokens import Tokens
-from spacy.lexeme cimport LexemeC, get_lexeme_dict, lexeme_pack, lexeme_unpack
-from spacy.lexeme cimport LexStr_orig
+from .tokens import Tokens
+from .lexeme cimport LexemeC, get_lexeme_dict, lexeme_pack, lexeme_unpack
+from .lexeme cimport LexStr_orig

 from murmurhash.mrmr cimport hash64
 from cpython.ref cimport Py_INCREF
@@ -41,23 +41,13 @@ cdef class Language:
         self._mem = Pool()
         self.cache = PreshMap(2 ** 25)
         self.specials = PreshMap(2 ** 16)
-        rules, prefix, suffix, lexemes = util.read_lang_data(name)
+        rules, prefix, suffix, infix, lexemes = util.read_lang_data(name)
         self.prefix_re = re.compile(prefix)
         self.suffix_re = re.compile(suffix)
+        self.infix_re = re.compile(infix)
         self.lexicon = Lexicon(lexemes)
         self._load_special_tokenization(rules)

-    cpdef Lexeme lookup(self, unicode string):
-        """Retrieve (or create, if not found) a Lexeme for a string, and return it.
-
-        Args:
-            string (unicode): The string to be looked up. Must be unicode, not bytes.
-
-        Returns:
-            lexeme (Lexeme): A reference to a lexical type.
-        """
-        return self.lexicon.lookup(string)

     cpdef Tokens tokenize(self, unicode string):
         """Tokenize a string.
@@ -73,37 +63,43 @@ cdef class Language:
         Returns:
             tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
         """
-        cdef size_t length = len(string)
+        cdef int length = len(string)
         cdef Tokens tokens = Tokens(length)
         if length == 0:
             return tokens
-        cdef size_t start = 0
-        cdef size_t i = 0
+        cdef int start = 0
+        cdef int i = 0
         cdef Py_UNICODE* chars = string
-        cdef String span
         for i in range(length):
             if Py_UNICODE_ISSPACE(chars[i]) == 1:
                 if start < i:
-                    string_from_slice(&span, chars, start, i)
-                    if not _extend_from_map(tokens.v, &span, self.cache):
-                        self._tokenize(tokens.v, &span)
+                    self._tokenize(tokens, chars, start, i)
                 start = i + 1
         i += 1
         if start < i:
-            string_from_slice(&span, chars, start, i)
-            if not _extend_from_map(tokens.v, &span, self.cache):
-                self._tokenize(tokens.v, &span)
+            self._tokenize(tokens, chars, start, i)
         return tokens

-    cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1:
-        cdef size_t i
-        cdef uint64_t orig_key = string.key
-        cdef size_t orig_size = tokens_v.size()
+    cdef int _tokenize(self, Tokens tokens, Py_UNICODE* chars, int start, int end) except -1:
+        cdef String span
         cdef vector[LexemeC*] prefixes
         cdef vector[LexemeC*] suffixes
+        cdef uint64_t orig_key
+        cdef int orig_size
+        string_slice(&span, chars, start, end)
+        lexemes = <LexemeC**>self.cache.get(span.key)
+        if lexemes != NULL:
+            tokens.extend(start, lexemes, 0)
+        else:
+            orig_key = span.key
+            orig_size = tokens.lex.size()
+            span = self._split_affixes(&span, &prefixes, &suffixes)[0]
+            self._attach_tokens(tokens, start, &span, &prefixes, &suffixes)
+            self._save_cached(&tokens.lex, orig_key, orig_size)
+
+    cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
+                                vector[LexemeC*] *suffixes) except NULL:
+        cdef size_t i
         cdef String prefix
         cdef String suffix
         cdef String minus_pre
@@ -113,8 +109,8 @@ cdef class Language:
             last_size = string.n
             pre_len = self._find_prefix(string.chars, string.n)
             if pre_len != 0:
-                string_from_slice(&prefix, string.chars, 0, pre_len)
-                string_from_slice(&minus_pre, string.chars, pre_len, string.n)
+                string_slice(&prefix, string.chars, 0, pre_len)
+                string_slice(&minus_pre, string.chars, pre_len, string.n)
                 # Check whether we've hit a special-case
                 if minus_pre.n >= 1 and self.specials.get(minus_pre.key) != NULL:
                     string = &minus_pre
@@ -122,16 +118,15 @@ cdef class Language:
                     break
             suf_len = self._find_suffix(string.chars, string.n)
             if suf_len != 0:
-                string_from_slice(&suffix, string.chars, string.n - suf_len, string.n)
-                string_from_slice(&minus_suf, string.chars, 0, string.n - suf_len)
+                string_slice(&suffix, string.chars, string.n - suf_len, string.n)
+                string_slice(&minus_suf, string.chars, 0, string.n - suf_len)
                 # Check whether we've hit a special-case
                 if minus_suf.n >= 1 and self.specials.get(minus_suf.key) != NULL:
                     string = &minus_suf
                     suffixes.push_back(self.lexicon.get(&suffix))
                     break
             if pre_len and suf_len and (pre_len + suf_len) <= string.n:
-                string_from_slice(string, string.chars, pre_len, string.n - suf_len)
+                string_slice(string, string.chars, pre_len, string.n - suf_len)
                 prefixes.push_back(self.lexicon.get(&prefix))
                 suffixes.push_back(self.lexicon.get(&suffix))
             elif pre_len:
@@ -140,26 +135,37 @@ cdef class Language:
             elif suf_len:
                 string = &minus_suf
                 suffixes.push_back(self.lexicon.get(&suffix))
             if self.specials.get(string.key):
                 break
+        return string

-        self._attach_tokens(tokens_v, string, &prefixes, &suffixes)
-        self._save_cached(tokens_v, orig_key, orig_size)
-
-    cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
+    cdef int _attach_tokens(self, Tokens tokens,
+                            int idx, String* string,
                             vector[LexemeC*] *prefixes,
                             vector[LexemeC*] *suffixes) except -1:
-        cdef size_t i
+        cdef int split
         cdef LexemeC** lexemes
         cdef LexemeC* lexeme
-        for lexeme in deref(prefixes):
-            tokens.push_back(lexeme)
-        if not _extend_from_map(tokens, string, self.specials):
-            self._split_body_token(tokens, string)
+        cdef String span
+        idx = tokens.extend(idx, prefixes.data(), prefixes.size())
+        if string.n != 0:
+            lexemes = <LexemeC**>self.cache.get(string.key)
+            if lexemes != NULL:
+                idx = tokens.extend(idx, lexemes, 0)
+            else:
+                split = self._find_infix(string.chars, string.n)
+                if split == 0 or split == -1:
+                    idx = tokens.push_back(idx, self.lexicon.get(string))
+                else:
+                    string_slice(&span, string.chars, 0, split)
+                    idx = tokens.push_back(idx, self.lexicon.get(&span))
+                    string_slice(&span, string.chars, split, split+1)
+                    idx = tokens.push_back(idx, self.lexicon.get(&span))
+                    string_slice(&span, string.chars, split + 1, string.n)
+                    idx = tokens.push_back(idx, self.lexicon.get(&span))
         cdef vector[LexemeC*].reverse_iterator it = suffixes.rbegin()
         while it != suffixes.rend():
-            tokens.push_back(deref(it))
+            idx = tokens.push_back(idx, deref(it))
             preinc(it)

     cdef int _save_cached(self, vector[LexemeC*] *tokens, uint64_t key, size_t n) except -1:
@@ -171,15 +177,17 @@ cdef class Language:
         lexemes[i + 1] = NULL
         self.cache.set(key, lexemes)

-    cdef int _split_body_token(self, vector[LexemeC*] *tokens, String* string) except -1:
-        tokens.push_back(self.lexicon.get(string))
+    cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1:
+        cdef unicode string = chars[:length]
+        match = self.infix_re.search(string)
+        return match.start() if match is not None else 0

     cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
         cdef unicode string = chars[:length]
         match = self.prefix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0

-    cdef int _find_suffix(self, Py_UNICODE* chars, size_t length):
+    cdef int _find_suffix(self, Py_UNICODE* chars, size_t length) except -1:
         cdef unicode string = chars[:length]
         match = self.suffix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0
@@ -212,27 +220,30 @@ cdef class Language:
 cdef class Lexicon:
     def __cinit__(self, lexemes):
-        self._mem = Pool()
+        self.mem = Pool()
         self._dict = PreshMap(2 ** 20)
         self.size = 0
         cdef String string
         cdef dict lexeme_dict
         cdef LexemeC* lexeme
-        for lexeme_dict in lexemes:
-            string_from_unicode(&string, lexeme_dict['string'])
-            lexeme = <LexemeC*>self._mem.alloc(1, sizeof(LexemeC))
+        for py_string, lexeme_dict in lexemes.iteritems():
+            string_from_unicode(&string, py_string)
+            lexeme = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
             lexeme_unpack(lexeme, lexeme_dict)
             self._dict.set(string.key, lexeme)
             self.lexemes.push_back(lexeme)
             self.size += 1

+    def __getitem__(self, size_t i):
+        return Lexeme(<size_t>self.lexemes.at(i))
+
     cdef LexemeC* get(self, String* string) except NULL:
         cdef LexemeC* lex
         lex = <LexemeC*>self._dict.get(string.key)
         if lex != NULL:
             return lex
-        lex = <LexemeC*>self._mem.alloc(1, sizeof(LexemeC))
+        lex = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
         cdef unicode unicode_string = string.chars[:string.n]
         lexeme_unpack(lex, get_lexeme_dict(self.size, unicode_string))
         self._dict.set(string.key, lex)
@@ -255,38 +266,12 @@ cdef class Lexicon:
         return Lexeme(<size_t>lexeme)

-cdef int _extend_from_map(vector[LexemeC*] *tokens, String* string, PreshMap map_) except -1:
-    if string.n == 0:
-        return 1
-    lexemes = <LexemeC**>map_.get(string.key)
-    if lexemes == NULL:
-        return 0
-    cdef size_t i = 0
-    while lexemes[i] != NULL:
-        tokens.push_back(lexemes[i])
-        i += 1
-    return 1
-
 cdef void string_from_unicode(String* s, unicode uni):
     cdef Py_UNICODE* c_uni = <Py_UNICODE*>uni
-    string_from_slice(s, c_uni, 0, len(uni))
+    string_slice(s, c_uni, 0, len(uni))

-cdef inline void string_from_slice(String* s, Py_UNICODE* chars, size_t start, size_t end) nogil:
+cdef inline void string_slice(String* s, Py_UNICODE* chars, size_t start, size_t end) nogil:
     s.chars = &chars[start]
     s.n = end - start
     s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
-
-cdef inline void string_slice_prefix(String* s, String* prefix, size_t n) nogil:
-    string_from_slice(prefix, s.chars, 0, n)
-    s.chars += n
-    s.n -= n
-    s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
-
-cdef inline void string_slice_suffix(String* s, String* suffix, size_t n) nogil:
-    string_from_slice(suffix, s.chars, s.n - n, s.n)
-    s.n -= n
-    s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
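
The reorganized _tokenize above hashes each whitespace-delimited span, reuses the cached lexeme sequence when the hash is known, and otherwise runs _split_affixes/_attach_tokens and caches whatever tokens that produced. A rough dict-backed sketch of the flow (pure Python; split_affixes and attach stand in for the Cython methods):

# Dict-backed sketch of the cache-then-split flow in _tokenize.
_cache = {}

def tokenize_span(tokens, span, split_affixes, attach):
    key = hash(span)                    # the Cython code hashes the raw chars with murmurhash
    cached = _cache.get(key)
    if cached is not None:
        tokens.extend(cached)           # cheap path: this span has been seen before
        return
    start = len(tokens)
    prefixes, stem, suffixes = split_affixes(span)
    attach(tokens, prefixes, stem, suffixes)
    _cache[key] = list(tokens[start:])  # remember every sub-token this span produced

Because the cache is keyed on the whole span, common strings pay the affix-splitting cost only the first time they are seen.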

View File

@@ -79,7 +79,7 @@ def canon_case(string, prob, cluster, case_stats, tag_stats):
 def word_shape(string, *args):
     length = len(string)
-    shape = ""
+    shape = []
     last = ""
     shape_char = ""
     seq = 0
@@ -99,8 +99,8 @@ def word_shape(string, *args):
             seq = 0
         last = shape_char
         if seq < 5:
-            shape += shape_char
+            shape.append(shape_char)
-    return shape
+    return ''.join(shape)

 def non_sparse(string, prob, cluster, case_stats, tag_stats):
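
The word_shape tweak replaces repeated string concatenation with list appends and a single join, which avoids quadratic copying in a hot feature function. A minimal self-contained approximation of the feature, for reference (the character-class logic in the middle is paraphrased from the surrounding context, not copied from the hunk):

def word_shape(string):
    # Letters become x/X, digits become d, other characters are kept,
    # and runs longer than 4 are truncated (the `seq < 5` check above).
    shape = []
    last = ""
    seq = 0
    for c in string:
        if c.isalpha():
            shape_char = "X" if c.isupper() else "x"
        elif c.isdigit():
            shape_char = "d"
        else:
            shape_char = c
        if shape_char == last:
            seq += 1
        else:
            seq = 0
        last = shape_char
        if seq < 5:
            shape.append(shape_char)
    return ''.join(shape)

assert word_shape("C3P0") == "XdXd"
assert word_shape("Hello") == "Xxxxx"
assert word_shape("aaaaaaaa") == "xxxxx"   # long runs are collapsed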

View File

@@ -2,14 +2,10 @@ from spacy.lexeme cimport LexemeC
 from libcpp.vector cimport vector

-cdef struct Token:
-    int i
-    int pos
-    LexemeC* lex
-
 cdef class Tokens:
-    cdef vector[Token] v
+    cdef vector[LexemeC*] lex
+    cdef vector[int] idx
+    cdef vector[int] pos

     cdef int extend(self, int i, LexemeC** lexemes, int n) except -1
     cdef int push_back(self, int i, LexemeC* lexeme) except -1
@@ -21,6 +17,7 @@ cdef class Tokens:
     cpdef bint check_dist_flag(self, size_t i, size_t flag_id) except *
     cpdef unicode string_view(self, size_t i, size_t view_id)

+    cpdef unicode string(self, size_t i)
     cpdef unicode orig(self, size_t i)
     cpdef unicode norm(self, size_t i)
     cpdef unicode shape(self, size_t i)

View File

@@ -25,17 +25,20 @@ cdef class Tokens:
     """
     def __cinit__(self, string_length=0):
         size = int(string_length / 3) if string_length >= 3 else 1
-        self.v = vector[Token]()
-        self.v.reserve(size)
+        self.lex.reserve(size)
+        self.idx.reserve(size)
+        self.pos.reserve(size)

     def __getitem__(self, i):
-        return Lexeme(<size_t>self.v.at(i).lex)
+        return Lexeme(<size_t>self.lex.at(i))

     def __len__(self):
-        return self.v.size()
+        return self.lex.size()

     cdef int push_back(self, int idx, LexemeC* lexeme) except -1:
-        self.v.push_back(Token(idx, 0, lexeme))
+        self.lex.push_back(lexeme)
+        self.idx.push_back(idx)
+        self.pos.push_back(0)
         return idx + lexeme.ints[<int>LexInt_length]

     cdef int extend(self, int idx, LexemeC** lexemes, int n) except -1:
@@ -46,120 +49,124 @@ cdef class Tokens:
             i = 0
             while lexemes[i] != NULL:
                 idx = self.push_back(idx, lexemes[i])
+                i += 1
         else:
             for i in range(n):
                 idx = self.push_back(idx, lexemes[i])
         return idx

     cpdef int id(self, size_t i) except -1:
-        return self.v.at(i).lex.ints[<int>LexInt_id]
+        return self.lex.at(i).ints[<int>LexInt_id]

     cpdef float prob(self, size_t i) except 1:
-        return self.v.at(i).lex.floats[<int>LexFloat_prob]
+        return self.lex.at(i).floats[<int>LexFloat_prob]

     cpdef int cluster(self, size_t i) except *:
-        return self.v.at(i).lex.ints[<int>LexInt_cluster]
+        return self.lex.at(i).ints[<int>LexInt_cluster]

     cpdef bint check_orth_flag(self, size_t i, size_t flag_id) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, flag_id)
+        return lexeme_check_orth_flag(self.lex.at(i), flag_id)

     cpdef bint check_dist_flag(self, size_t i, size_t flag_id) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, flag_id)
+        return lexeme_check_dist_flag(self.lex.at(i), flag_id)

     cpdef unicode string_view(self, size_t i, size_t view_id):
-        return lexeme_get_string(self.v.at(i).lex, view_id)
+        return lexeme_get_string(self.lex.at(i), view_id)

     # Provide accessor methods for the features supported by the language.
     # Without these, clients have to use the underlying string_view and check_flag
     # methods, which requires them to know the IDs.
+    cpdef unicode string(self, size_t i):
+        return self.orig(i)
+
     cpdef unicode orig(self, size_t i):
-        cdef bytes utf8_string = self.v.at(i).lex.strings[<int>LexStr_orig]
+        cdef bytes utf8_string = self.lex.at(i).strings[<int>LexStr_orig]
         cdef unicode string = utf8_string.decode('utf8')
         return string

     cpdef unicode norm(self, size_t i):
-        cdef bytes utf8_string = self.v.at(i).lex.strings[<int>LexStr_norm]
+        cdef bytes utf8_string = self.lex.at(i).strings[<int>LexStr_norm]
         cdef unicode string = utf8_string.decode('utf8')
         return string

     cpdef unicode shape(self, size_t i):
-        return lexeme_get_string(self.v.at(i).lex, LexStr_shape)
+        return lexeme_get_string(self.lex.at(i), LexStr_shape)

     cpdef unicode unsparse(self, size_t i):
-        return lexeme_get_string(self.v.at(i).lex, LexStr_unsparse)
+        return lexeme_get_string(self.lex.at(i), LexStr_unsparse)

     cpdef unicode asciied(self, size_t i):
-        return lexeme_get_string(self.v.at(i).lex, LexStr_asciied)
+        return lexeme_get_string(self.lex.at(i), LexStr_asciied)

     cpdef bint is_alpha(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_alpha)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_alpha)

     cpdef bint is_ascii(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_ascii)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_ascii)

     cpdef bint is_digit(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_digit)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_digit)

     cpdef bint is_lower(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_lower)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_lower)

     cpdef bint is_punct(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_punct)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_punct)

     cpdef bint is_space(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_space)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_space)

     cpdef bint is_title(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_title)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_title)

     cpdef bint is_upper(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_upper)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_upper)

     cpdef bint can_adj(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_adj)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_adj)

     cpdef bint can_adp(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_adp)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_adp)

     cpdef bint can_adv(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_adv)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_adv)

     cpdef bint can_conj(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_conj)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_conj)

     cpdef bint can_det(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_det)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_det)

     cpdef bint can_noun(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_noun)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_noun)

     cpdef bint can_num(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_num)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_num)

     cpdef bint can_pdt(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_pdt)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_pdt)

     cpdef bint can_pos(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_pos)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_pos)

     cpdef bint can_pron(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_pron)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_pron)

     cpdef bint can_prt(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_prt)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_prt)

     cpdef bint can_punct(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_punct)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_punct)

     cpdef bint can_verb(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_verb)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_verb)

     cpdef bint oft_lower(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_lower)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_lower)

     cpdef bint oft_title(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_title)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_title)

     cpdef bint oft_upper(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_upper)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_upper)
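
Every accessor above reduces to the same two primitives: index into the lexeme vector, then either test a bit in the lexeme's packed orth/dist flags or fetch one of its string views. A plain-Python sketch of the flag test (the flag IDs below are made up for illustration; the real ones are the LexOrth_*/LexDist_* enum values):

# Illustrative flag IDs only; each lexeme carries a packed word of boolean features.
LexOrth_alpha = 0
LexOrth_digit = 1
LexOrth_upper = 2

def check_orth_flag(orth_flags, flag_id):
    # A feature lookup costs one bit test on the lexeme's flag word.
    return bool(orth_flags & (1 << flag_id))

flags = (1 << LexOrth_alpha) | (1 << LexOrth_upper)   # e.g. an all-caps word
assert check_orth_flag(flags, LexOrth_alpha)
assert not check_orth_flag(flags, LexOrth_digit)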

View File

@@ -4,3 +4,5 @@ ctypedef uint64_t hash_t
 ctypedef char* utf8_t
 ctypedef uint64_t flag_t
 ctypedef uintptr_t id_t

View File

@@ -1,7 +1,7 @@
 import os
 from os import path
 import codecs
-import json
+import ujson
 import re

 DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
@@ -16,28 +16,36 @@ def read_lang_data(name):
     tokenization = read_tokenization(data_dir)
     prefix = read_prefix(data_dir)
     suffix = read_suffix(data_dir)
+    infix = read_infix(data_dir)
     lex_loc = path.join(data_dir, 'lexemes.json')
     if path.exists(lex_loc):
         with open(lex_loc) as file_:
             lexemes = ujson.load(file_)
     else:
-        lexemes = []
-    return tokenization, prefix, suffix, lexemes
+        lexemes = {}
+    return tokenization, prefix, suffix, infix, lexemes

 def read_prefix(data_dir):
     with utf8open(path.join(data_dir, 'prefix')) as file_:
         entries = file_.read().split('\n')
-        expression = '|'.join(['^' + re.escape(piece) for piece in entries])
+        expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
     return expression

 def read_suffix(data_dir):
     with utf8open(path.join(data_dir, 'suffix')) as file_:
         entries = file_.read().split('\n')
-        expression = '|'.join([re.escape(piece) + '$' for piece in entries])
+        expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece.strip()])
     return expression

+def read_infix(data_dir):
+    with utf8open(path.join(data_dir, 'infix')) as file_:
+        entries = file_.read().split('\n')
+        expression = '|'.join([piece for piece in entries if piece.strip()])
+    return expression
+
 def read_tokenization(lang):
     loc = path.join(DATA_DIR, lang, 'tokenization')
     entries = []
@@ -60,3 +68,16 @@ def read_tokenization(lang):
             seen.add(chunk)
             entries.append((chunk, pieces))
     return entries
+
+def align_tokens(ref, indices):
+    start = 0
+    queue = list(indices)
+    for token in ref:
+        end = start + len(token)
+        emit = []
+        while queue and queue[0][1] <= end:
+            emit.append(queue.pop(0))
+        yield token, emit
+        start = end
+    assert not queue
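
The new align_tokens helper is a generator: it walks a reference tokenization and yields, for each reference token, the (start, end) spans that end within it, and the final assert checks that no span is left unconsumed. A small usage sketch with hand-made spans:

ref = ['do', "n't"]              # the tokenizer's split of "don't"
indices = [(0, 2), (2, 5)]       # character spans measured on the same string
assert list(align_tokens(ref, indices)) == [('do', [(0, 2)]), ("n't", [(2, 5)])]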

View File

@@ -7,20 +7,20 @@ from spacy.lexeme import *

 def test_is_alpha():
-    the = EN.lookup('the')
+    the = EN.lexicon.lookup('the')
     assert the.check_orth_flag(LexOrth_alpha)
-    year = EN.lookup('1999')
+    year = EN.lexicon.lookup('1999')
     assert not year.check_orth_flag(LexOrth_alpha)
-    mixed = EN.lookup('hello1')
+    mixed = EN.lexicon.lookup('hello1')
     assert not mixed.check_orth_flag(LexOrth_alpha)

 def test_is_digit():
-    the = EN.lookup('the')
+    the = EN.lexicon.lookup('the')
     assert not the.check_orth_flag(LexOrth_digit)
-    year = EN.lookup('1999')
+    year = EN.lexicon.lookup('1999')
     assert year.check_orth_flag(LexOrth_digit)
-    mixed = EN.lookup('hello1')
+    mixed = EN.lexicon.lookup('hello1')
     assert not mixed.check_orth_flag(LexOrth_digit)

View File

@@ -9,7 +9,7 @@ from spacy.lexeme import *

 @pytest.fixture
 def C3P0():
-    return EN.lookup("C3P0")
+    return EN.lexicon.lookup("C3P0")

 def test_shape(C3P0):
@@ -17,11 +17,11 @@ def test_shape(C3P0):

 def test_length():
-    t = EN.lookup('the')
+    t = EN.lexicon.lookup('the')
     assert t.length == 3
-    t = EN.lookup("n't")
+    t = EN.lexicon.lookup("n't")
     assert t.length == 3
-    t = EN.lookup("'s")
+    t = EN.lexicon.lookup("'s")
     assert t.length == 2
-    t = EN.lookup('Xxxx')
+    t = EN.lexicon.lookup('Xxxx')
     assert t.length == 4
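
These test updates track the interface move in this commit: string lookups now live on the lexicon rather than on the Language object itself. In terms of the 2014-era API exercised by these tests:

from spacy.en import EN

# Before this commit: lex = EN.lookup('the')
lex = EN.lexicon.lookup('the')     # after: lookup goes through Language.lexicon
assert lex.string == 'the'         # .string and .length behave as in the tests above
assert lex.length == 3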

View File

@@ -27,7 +27,7 @@ def test_punct():

 def test_digits():
     lex_ids = EN.tokenize('The year: 1984.')
-    assert lex_ids.string(3) == "1984"
+    assert lex_ids.orig(3) == "1984"
     assert len(lex_ids) == 5
     assert lex_ids[0].string == EN.lexicon.lookup('The').string
     assert lex_ids[3].string == EN.lexicon.lookup('1984').string
@@ -101,4 +101,4 @@ def test_cnts6():

 def test_cnts7():
     text = 'But then the 6,000-year ice age came...'
     tokens = EN.tokenize(text)
-    assert len(tokens) == 8
+    assert len(tokens) == 10
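
The count in test_cnts7 moves from 8 to 10 because the infix pass added in this commit splits the hyphenated compound into three tokens: the span before the infix character, the infix character itself, and the remainder, exactly as _attach_tokens does above. Sketching the effect in plain Python with a stand-in infix pattern:

import re

infix_re = re.compile(r'-')          # stand-in; the real pattern comes from the infix data file

def split_on_infix(token):
    m = infix_re.search(token)
    if m is None or m.start() == 0:
        return [token]
    i = m.start()
    return [token[:i], token[i], token[i + 1:]]

assert split_on_infix('6,000-year') == ['6,000', '-', 'year']
# 'But then the 6,000-year ice age came...' therefore gains two tokens: 8 -> 10.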

View File

@@ -4,31 +4,31 @@ from spacy.en import EN

 def test_neq():
-    addr = EN.lookup('Hello')
-    assert EN.lookup('bye').string != addr.string
+    addr = EN.lexicon.lookup('Hello')
+    assert EN.lexicon.lookup('bye').string != addr.string

 def test_eq():
-    addr = EN.lookup('Hello')
-    assert EN.lookup('Hello').string == addr.string
+    addr = EN.lexicon.lookup('Hello')
+    assert EN.lexicon.lookup('Hello').string == addr.string

 def test_round_trip():
-    hello = EN.lookup('Hello')
+    hello = EN.lexicon.lookup('Hello')
     assert hello.string == 'Hello'

 def test_case_neq():
-    addr = EN.lookup('Hello')
-    assert EN.lookup('hello').string != addr.string
+    addr = EN.lexicon.lookup('Hello')
+    assert EN.lexicon.lookup('hello').string != addr.string

 def test_punct_neq():
-    addr = EN.lookup('Hello')
-    assert EN.lookup('Hello,').string != addr.string
+    addr = EN.lexicon.lookup('Hello')
+    assert EN.lexicon.lookup('Hello,').string != addr.string

 def test_short():
-    addr = EN.lookup('I')
+    addr = EN.lexicon.lookup('I')
     assert addr.string == 'I'
     assert addr.string != 'not'