* Large refactor, particularly to Python API

Matthew Honnibal 2014-10-24 00:59:17 +11:00
parent 168b2b8cb2
commit 08ce602243
21 changed files with 327 additions and 958 deletions

View File

@@ -1,42 +0,0 @@
from spacy.spacy cimport Language
from spacy.word cimport Lexeme
cimport cython
cpdef size_t ALPHA
cpdef size_t DIGIT
cpdef size_t PUNCT
cpdef size_t SPACE
cpdef size_t LOWER
cpdef size_t UPPER
cpdef size_t TITLE
cpdef size_t ASCII
cpdef size_t OFT_LOWER
cpdef size_t OFT_TITLE
cpdef size_t OFT_UPPER
cpdef size_t PUNCT
cpdef size_t CONJ
cpdef size_t NUM
cpdef size_t N
cpdef size_t DET
cpdef size_t ADP
cpdef size_t ADJ
cpdef size_t ADV
cpdef size_t VERB
cpdef size_t NOUN
cpdef size_t PDT
cpdef size_t POS
cpdef size_t PRON
cpdef size_t PRT
cdef class English(spacy.Language):
cdef int find_split(self, unicode word)
cdef English EN
cpdef Word lookup(unicode word)
cpdef list tokenize(unicode string)

View File

@@ -1,126 +0,0 @@
# cython: profile=True
# cython: embedsignature=True
'''Tokenize German text, using a scheme based on the Negra corpus.
Tokenization is generally similar to English text, and the same set of orthographic
flags are used.
An abbreviation list is used to handle common abbreviations. Hyphenated words
are not split, following the Treebank usage.
'''
from __future__ import unicode_literals
from libc.stdint cimport uint64_t
cimport spacy
from spacy.orth import is_alpha, is_digit, is_punct, is_space, is_lower, is_ascii
from spacy.orth import canonicalize_case, get_string_shape, asciify, get_non_sparse
from spacy.common cimport check_punct
# Python-readable flag constants --- can't read an enum from Python
# Don't want to manually assign these numbers, or we'll insert one and have to
# change them all.
# Don't use "i", as we don't want it in the global scope!
cdef size_t __i = 0
ALPHA = __i; __i += 1
DIGIT = __i; __i += 1
PUNCT = __i; __i += 1
SPACE = __i; __i += 1
LOWER = __i; __i += 1
UPPER = __i; __i += 1
TITLE = __i; __i += 1
ASCII = __i; __i += 1
OFT_LOWER = __i; __i += 1
OFT_UPPER = __i; __i += 1
OFT_TITLE = __i; __i += 1
PUNCT = __i; __i += 1
CONJ = __i; __i += 1
NUM = __i; __i += 1
X = __i; __i += 1
DET = __i; __i += 1
ADP = __i; __i += 1
ADJ = __i; __i += 1
ADV = __i; __i += 1
VERB = __i; __i += 1
NOUN = __i; __i += 1
PDT = __i; __i += 1
POS = __i; __i += 1
PRON = __i; __i += 1
PRT = __i; __i += 1
# These are for the string views
__i = 0
SIC = __i; __i += 1
CANON_CASED = __i; __i += 1
NON_SPARSE = __i; __i += 1
SHAPE = __i; __i += 1
NR_STRING_VIEWS = __i
def get_string_views(unicode string, lexeme):
views = ['' for _ in range(NR_STRING_VIEWS)]
views[SIC] = string
views[CANON_CASED] = canonicalize_case(string, lexeme)
views[SHAPE] = get_string_shape(string)
views[ASCIIFIED] = get_asciified(string)
views[FIXED_VOCAB] = get_non_sparse(string, views[ASCIIFIED], views[CANON_CASED],
views[SHAPE], lexeme)
return views
def set_orth_flags(unicode string, flags_t flags):
setters = [
(ALPHA, is_alpha),
(DIGIT, is_digit),
(PUNCT, is_punct),
(SPACE, is_space),
(LOWER, is_lower),
(UPPER, is_upper),
(SPACE, is_space)
]
for bit, setter in setters:
if setter(string):
flags |= 1 << bit
return flags
cdef class German(spacy.Language):
cdef Lexeme new_lexeme(self, unicode string, cluster=0, case_stats=None,
tag_freqs=None):
return Lexeme(s, length, views, prob=prob, cluster=cluster,
flags=self.get_flags(string)
cdef int find_split(self, unicode word):
cdef size_t length = len(word)
cdef int i = 0
if word.startswith("'s") or word.startswith("'S"):
return 2
# Contractions
if word.endswith("'s") and length >= 3:
return length - 2
# Leading punctuation
if check_punct(word, 0, length):
return 1
elif length >= 1:
# Split off all trailing punctuation characters
i = 0
while i < length and not check_punct(word, i, length):
i += 1
return i
DE = German('de')
lookup = DE.lookup
tokenize = DE.tokenize
load_clusters = DE.load_clusters
load_unigram_probs = DE.load_unigram_probs
load_case_stats = DE.load_case_stats
load_tag_stats = DE.load_tag_stats
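The splitting rule in German.find_split above is easier to read in plain Python. A minimal sketch follows, assuming check_punct simply tests whether a single character is Unicode punctuation (the real spacy.common.check_punct may behave differently):

    import unicodedata

    def check_punct(word, i):
        # Stand-in for spacy.common.check_punct: is character i punctuation?
        return unicodedata.category(word[i]).startswith('P')

    def find_split(word):
        # Plain-Python paraphrase of German.find_split above.
        length = len(word)
        if word.startswith("'s") or word.startswith("'S"):
            return 2
        # Contractions: split off a trailing "'s"
        if word.endswith("'s") and length >= 3:
            return length - 2
        # Leading punctuation comes off one character at a time
        if length >= 1 and check_punct(word, 0):
            return 1
        # Otherwise scan up to the first trailing punctuation character
        i = 0
        while i < length and not check_punct(word, i):
            i += 1
        return i

    print(find_split(u"Haus."))   # 4: "Haus" + "."
    print(find_split(u"(Haus"))   # 1: "(" + "Haus"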

View File

@@ -1,5 +1,4 @@
 from spacy.lang cimport Language
-from spacy.word cimport Lexeme
 from spacy.tokens cimport Tokens

View File

@@ -1,14 +1,12 @@
+from libc.stdint cimport uint32_t
+from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector
-from libc.stdint cimport uint64_t, int64_t
 from preshed.maps cimport PreshMap
 from cymem.cymem cimport Pool
-from .word cimport Lexeme
+from .typedefs cimport hash_t
 from .tokens cimport Tokens
-from .lexeme cimport LexemeC
+from .lexeme cimport Lexeme
+from .utf8string cimport StringStore
 cdef extern from "Python.h":
@@ -21,23 +19,25 @@ cdef extern from "Python.h":
 cdef struct String:
 Py_UNICODE* chars
 size_t n
-uint64_t key
+hash_t key
 cdef class Lexicon:
 cdef Pool mem
 cpdef readonly size_t size
+cpdef readonly StringStore strings
-cdef vector[LexemeC*] lexemes
+cdef vector[Lexeme*] lexemes
 cpdef Lexeme lookup(self, unicode string)
-cdef LexemeC* get(self, String* s) except NULL
+cdef Lexeme* get(self, String* s) except NULL
 cdef PreshMap _dict
 cdef list _string_features
 cdef list _flag_features
 cdef class Language:
 cdef Pool _mem
 cdef unicode name
@@ -52,12 +52,12 @@ cdef class Language:
 cpdef Tokens tokenize(self, unicode text)
 cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1
-cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
-vector[LexemeC*] *suffixes) except NULL
+cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
+vector[Lexeme*] *suffixes) except NULL
 cdef int _attach_tokens(self, Tokens tokens, int idx, String* string,
-vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
+vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except -1
 cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
 cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
 cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1
-cdef int _save_cached(self, LexemeC** tokens, uint64_t key, int n) except -1
+cdef int _save_cached(self, Lexeme** tokens, hash_t key, int n) except -1

View File

@@ -13,22 +13,21 @@ import random
 from os import path
 import re
-from .util import read_lang_data
-from .tokens import Tokens
-from .lexeme cimport LexemeC, get_lexeme_dict, lexeme_pack, lexeme_unpack
-from .lexeme cimport LexStr_orig
-from murmurhash.mrmr cimport hash64
-from cpython.ref cimport Py_INCREF
 from cymem.cymem cimport Pool
 from cython.operator cimport preincrement as preinc
 from cython.operator cimport dereference as deref
+from murmurhash.mrmr cimport hash64
 from preshed.maps cimport PreshMap
-from spacy import orth
-from spacy import util
+from .lexeme cimport Lexeme
+from .lexeme cimport from_dict as lexeme_from_dict
+from .lexeme cimport from_string as lexeme_from_string
+from . import orth
+from . import util
+from .util import read_lang_data
+from .tokens import Tokens
 cdef class Language:
@@ -64,7 +63,7 @@ cdef class Language:
 tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
 """
 cdef int length = len(string)
-cdef Tokens tokens = Tokens(length)
+cdef Tokens tokens = Tokens(self.lexicon.strings, length)
 if length == 0:
 return tokens
 cdef int i = 0
@@ -76,7 +75,7 @@
 if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
 if start < i:
 string_slice(&span, chars, start, i)
-lexemes = <LexemeC**>self.cache.get(span.key)
+lexemes = <Lexeme**>self.cache.get(span.key)
 if lexemes != NULL:
 tokens.extend(start, lexemes, 0)
 else:
@@ -88,7 +87,7 @@
 i += 1
 if start < i:
 string_slice(&span, chars, start, i)
-lexemes = <LexemeC**>self.cache.get(span.key)
+lexemes = <Lexeme**>self.cache.get(span.key)
 if lexemes != NULL:
 tokens.extend(start, lexemes, 0)
 else:
@@ -96,9 +95,9 @@
 return tokens
 cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1:
-cdef vector[LexemeC*] prefixes
-cdef vector[LexemeC*] suffixes
-cdef uint64_t orig_key
+cdef vector[Lexeme*] prefixes
+cdef vector[Lexeme*] suffixes
+cdef hash_t orig_key
 cdef int orig_size
 orig_key = span.key
 orig_size = tokens.length
@@ -106,8 +105,8 @@
 self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
 self._save_cached(&tokens.lex[orig_size], orig_key, tokens.length - orig_size)
-cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
-vector[LexemeC*] *suffixes) except NULL:
+cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
+vector[Lexeme*] *suffixes) except NULL:
 cdef size_t i
 cdef String prefix
 cdef String suffix
@@ -150,15 +149,15 @@
 cdef int _attach_tokens(self, Tokens tokens,
 int idx, String* string,
-vector[LexemeC*] *prefixes,
-vector[LexemeC*] *suffixes) except -1:
+vector[Lexeme*] *prefixes,
+vector[Lexeme*] *suffixes) except -1:
 cdef int split
-cdef LexemeC** lexemes
-cdef LexemeC* lexeme
+cdef Lexeme** lexemes
+cdef Lexeme* lexeme
 cdef String span
 idx = tokens.extend(idx, prefixes.data(), prefixes.size())
 if string.n != 0:
-lexemes = <LexemeC**>self.cache.get(string.key)
+lexemes = <Lexeme**>self.cache.get(string.key)
 if lexemes != NULL:
 idx = tokens.extend(idx, lexemes, 0)
 else:
@@ -172,13 +171,13 @@
 idx = tokens.push_back(idx, self.lexicon.get(&span))
 string_slice(&span, string.chars, split + 1, string.n)
 idx = tokens.push_back(idx, self.lexicon.get(&span))
-cdef vector[LexemeC*].reverse_iterator it = suffixes.rbegin()
+cdef vector[Lexeme*].reverse_iterator it = suffixes.rbegin()
 while it != suffixes.rend():
 idx = tokens.push_back(idx, deref(it))
 preinc(it)
-cdef int _save_cached(self, LexemeC** tokens, uint64_t key, int n) except -1:
-lexemes = <LexemeC**>self._mem.alloc(n + 1, sizeof(LexemeC**))
+cdef int _save_cached(self, Lexeme** tokens, hash_t key, int n) except -1:
+lexemes = <Lexeme**>self._mem.alloc(n + 1, sizeof(Lexeme**))
 cdef int i
 for i in range(n):
 lexemes[i] = tokens[i]
@@ -212,14 +211,14 @@
 token_rules (list): A list of (chunk, tokens) pairs, where chunk is
 a string and tokens is a list of strings.
 '''
-cdef LexemeC** lexemes
-cdef uint64_t hashed
+cdef Lexeme** lexemes
+cdef hash_t hashed
 cdef String string
 for uni_string, substrings in token_rules:
-lexemes = <LexemeC**>self._mem.alloc(len(substrings) + 1, sizeof(LexemeC*))
+lexemes = <Lexeme**>self._mem.alloc(len(substrings) + 1, sizeof(Lexeme*))
 for i, substring in enumerate(substrings):
 string_from_unicode(&string, substring)
-lexemes[i] = <LexemeC*>self.lexicon.get(&string)
+lexemes[i] = <Lexeme*>self.lexicon.get(&string)
 lexemes[i + 1] = NULL
 string_from_unicode(&string, uni_string)
 self.specials.set(string.key, lexemes)
@@ -227,33 +226,29 @@
 cdef class Lexicon:
-def __cinit__(self, lexemes):
+def __init__(self, lexemes):
 self.mem = Pool()
 self._dict = PreshMap(2 ** 20)
+self.strings = StringStore()
 self.size = 0
 cdef String string
-cdef dict lexeme_dict
-cdef LexemeC* lexeme
-for py_string, lexeme_dict in lexemes.iteritems():
-string_from_unicode(&string, py_string)
-lexeme = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
-lexeme_unpack(lexeme, lexeme_dict)
-self._dict.set(string.key, lexeme)
-self.lexemes.push_back(lexeme)
-self.size += 1
-def __getitem__(self, size_t i):
-return Lexeme(<size_t>self.lexemes.at(i))
-cdef LexemeC* get(self, String* string) except NULL:
-cdef LexemeC* lex
-lex = <LexemeC*>self._dict.get(string.key)
+cdef Lexeme* lexeme
+#for py_string, lexeme_dict in lexemes.iteritems():
+# string_from_unicode(&string, py_string)
+# lexeme = <Lexeme*>self.mem.alloc(1, sizeof(Lexeme))
+# lexeme_from_dict(lexeme, lexeme_dict, self.strings)
+# self._dict.set(string.key, lexeme)
+# self.lexemes.push_back(lexeme)
+# self.size += 1
+cdef Lexeme* get(self, String* string) except NULL:
+cdef Lexeme* lex
+lex = <Lexeme*>self._dict.get(string.key)
 if lex != NULL:
 return lex
-lex = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
-cdef unicode unicode_string = string.chars[:string.n]
-lexeme_unpack(lex, get_lexeme_dict(self.size, unicode_string))
+lex = <Lexeme*>self.mem.alloc(1, sizeof(Lexeme))
+lexeme_from_string(lex, string.chars[:string.n], self.strings)
 self._dict.set(string.key, lex)
 self.lexemes.push_back(lex)
 self.size += 1
@@ -270,8 +265,8 @@ cdef class Lexicon:
 """
 cdef String string
 string_from_unicode(&string, uni_string)
-cdef LexemeC* lexeme = self.get(&string)
-return Lexeme(<size_t>lexeme)
+cdef Lexeme* lexeme = self.get(&string)
+return lexeme[0]
 cdef void string_from_unicode(String* s, unicode uni):
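The special-case rules described in the _load_special_tokenization docstring ("(chunk, tokens) pairs") feed the same cache that the whitespace-first loop in tokenize consults. A rough pure-Python picture, with hashing, affix splitting and the C structs omitted; the "don't" rule is only an illustration, though the contraction tests later in this commit suggest the same split:

    def load_specials(token_rules):
        # Each rule maps a whole whitespace-delimited chunk to its pre-split tokens.
        return {chunk: list(substrings) for chunk, substrings in token_rules}

    def tokenize(string, specials):
        # Whitespace-first split; consult the special-case table before any
        # prefix/suffix splitting (elided here).
        tokens = []
        for chunk in string.split():
            tokens.extend(specials.get(chunk, [chunk]))
        return tokens

    specials = load_specials([(u"don't", [u"do", u"not"])])
    assert tokenize(u"i said don't", specials) == [u'i', u'said', u'do', u'not']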

View File

@@ -1,94 +1,55 @@
 from .typedefs cimport hash_t, utf8_t, flag_t, id_t
+from cymem.cymem cimport Pool
+from thinc.typedefs cimport atom_t
+from .utf8string cimport StringStore
 cpdef flag_t OOV_DIST_FLAGS
-cpdef enum LexInts:
-LexInt_id
-LexInt_length
-LexInt_cluster
-LexInt_pos
-LexInt_supersense
-LexInt_N
-cpdef enum LexFloats:
-LexFloat_prob
-LexFloat_sentiment
-LexFloat_N
-cpdef enum LexStrs:
-LexStr_orig
-LexStr_norm
-LexStr_shape
-LexStr_unsparse
-LexStr_asciied
-LexStr_pre
-LexStr_suff
-LexStr_N
-cpdef enum LexOrthFlags:
-LexOrth_alpha
-LexOrth_ascii
-LexOrth_digit
-LexOrth_lower
-LexOrth_punct
-LexOrth_space
-LexOrth_title
-LexOrth_upper
-LexOrth_N
-cpdef enum LexDistFlags:
-LexDist_adj
-LexDist_adp
-LexDist_adv
-LexDist_conj
-LexDist_det
-LexDist_noun
-LexDist_num
-LexDist_pdt
-LexDist_pos
-LexDist_pron
-LexDist_prt
-LexDist_punct
-LexDist_verb
-LexDist_lower
-LexDist_title
-LexDist_upper
-LexDist_N
-cdef struct LexemeC:
-int[<int>LexInt_N] ints
-float[<int>LexFloat_N] floats
-utf8_t[<int>LexStr_N] strings
-flag_t orth_flags
-flag_t dist_flags
-cdef LexemeC EMPTY_LEXEME
-cpdef dict get_lexeme_dict(size_t i, unicode string)
-cdef char* intern_and_encode(unicode string, size_t* length) except NULL
-cdef int lexeme_get_int(LexemeC* lexeme, size_t i) except *
-cdef float lexeme_get_float(LexemeC* lexeme, size_t i) except *
-cdef unicode lexeme_get_string(LexemeC* lexeme, size_t i)
-cdef bint lexeme_check_orth_flag(LexemeC* lexeme, size_t flag_id) except *
-cdef bint lexeme_check_dist_flag(LexemeC* lexeme, size_t flag_id) except *
-cdef dict lexeme_pack(LexemeC* lexeme)
-cdef int lexeme_unpack(LexemeC* lexeme, dict p) except -1
+# Flags
+cpdef enum:
+IS_ALPHA
+IS_ASCII
+IS_DIGIT
+IS_LOWER
+IS_PUNCT
+IS_SPACE
+IS_TITLE
+IS_UPPER
+OFT_LOWER
+OFT_TITLE
+OFT_UPPER
+cdef struct Lexeme:
+atom_t id
+atom_t length
+atom_t norm
+atom_t shape
+atom_t vocab10k
+atom_t asciied
+atom_t prefix
+atom_t suffix
+atom_t cluster
+atom_t pos
+atom_t supersense
+float prob
+flag_t flags
+cdef Lexeme EMPTY_LEXEME
+cdef int from_string(Lexeme* lex, unicode string, StringStore store) except -1
+cdef int from_dict(Lexeme* lex, dict props, StringStore store) except -1
+cdef inline bint check_flag(Lexeme* lexeme, size_t flag_id) nogil:
+return lexeme.flags & (1 << flag_id)
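The new check_flag inline above is just a bit test against the cpdef enum; the updated tests later in this commit use the same pattern directly (lex['flags'] & (1 << IS_ALPHA)). A small self-contained illustration, with the flag positions spelled out the way the enum order implies:

    IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER = 0, 1, 2, 3   # illustrative positions

    def set_flag(flags, flag_id):
        return flags | (1 << flag_id)

    def check_flag(flags, flag_id):
        return bool(flags & (1 << flag_id))

    flags = set_flag(set_flag(0, IS_ALPHA), IS_LOWER)
    assert check_flag(flags, IS_ALPHA)
    assert not check_flag(flags, IS_DIGIT)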

View File

@@ -5,106 +5,40 @@ from libc.string cimport memset
 import orth
+from .utf8string cimport Utf8Str
 OOV_DIST_FLAGS = 0
-memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
+memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
-cpdef dict get_lexeme_dict(size_t i, unicode string):
-ints = [None for _ in range(LexInt_N)]
-ints[<int>LexInt_id] = i
-ints[<int>LexInt_length] = len(string)
-ints[<int>LexInt_cluster] = 0
-ints[<int>LexInt_pos] = 0
-ints[<int>LexInt_supersense] = 0
-floats = [None for _ in range(LexFloat_N)]
-floats[<int>LexFloat_prob] = 0
-floats[<int>LexFloat_sentiment] = 0
-strings = [None for _ in range(LexStr_N)]
-strings[<int>LexStr_orig] = string
-strings[<int>LexStr_norm] = strings[<int>LexStr_orig]
-strings[<int>LexStr_shape] = orth.word_shape(string)
-strings[<int>LexStr_unsparse] = strings[<int>LexStr_shape]
-strings[<int>LexStr_asciied] = orth.asciied(string)
-strings[<int>LexStr_pre] = string[0]
-strings[<int>LexStr_suff] = string[-3:]
-orth_flags = get_orth_flags(string)
-dist_flags = OOV_DIST_FLAGS
-return {'ints': ints, 'floats': floats, 'strings': strings,
-'orth_flags': orth_flags, 'dist_flags': dist_flags}
-def get_orth_flags(unicode string):
+def get_flags(unicode string):
 cdef flag_t flags = 0
-flags |= orth.is_ascii(string) << LexOrth_ascii
-flags |= orth.is_alpha(string) << LexOrth_alpha
-flags |= orth.is_digit(string) << LexOrth_digit
-flags |= orth.is_lower(string) << LexOrth_lower
-flags |= orth.is_punct(string) << LexOrth_punct
-flags |= orth.is_space(string) << LexOrth_space
-flags |= orth.is_title(string) << LexOrth_title
-flags |= orth.is_upper(string) << LexOrth_upper
+flags |= orth.is_alpha(string) << IS_ALPHA
+flags |= orth.is_ascii(string) << IS_ASCII
+flags |= orth.is_digit(string) << IS_DIGIT
+flags |= orth.is_lower(string) << IS_LOWER
+flags |= orth.is_punct(string) << IS_PUNCT
+flags |= orth.is_space(string) << IS_SPACE
+flags |= orth.is_title(string) << IS_TITLE
+flags |= orth.is_upper(string) << IS_UPPER
 return flags
-def get_dist_flags(unicode string):
-return 0
-cdef char* intern_and_encode(unicode string, size_t* length) except NULL:
+cdef int from_string(Lexeme* lex, unicode string, StringStore store) except -1:
 cdef bytes byte_string = string.encode('utf8')
-cdef bytes utf8_string = intern(byte_string)
-Py_INCREF(utf8_string)
-length[0] = len(utf8_string)
-return <char*>utf8_string
+cdef Utf8Str* orig_str = store.intern(<char*>byte_string, len(byte_string))
+lex.id = orig_str.i
+lex.cluster = 0
+lex.length = len(string)
+lex.flags = get_flags(string)
+# TODO: Hook this up
+#lex.norm = norm_str.i
+#lex.shape = norm_str.i
+#lex.asciied = asciied_str.i
+#lex.prefix = prefix_str.i
+#lex.suffix = suffix_str.i
-cdef int lexeme_get_int(LexemeC* lexeme, size_t i) except *:
-return lexeme.ints[i]
-cdef float lexeme_get_float(LexemeC* lexeme, size_t i) except *:
-return lexeme.floats[i]
-cdef unicode lexeme_get_string(LexemeC* lexeme, size_t i):
-cdef bytes byte_string = lexeme.strings[i]
-return byte_string.decode('utf8')
-cdef bint lexeme_check_orth_flag(LexemeC* lexeme, size_t flag_id) except *:
-return lexeme.orth_flags & (1 << flag_id)
-cdef bint lexeme_check_dist_flag(LexemeC* lexeme, size_t flag_id) except *:
-return lexeme.dist_flags & (1 << flag_id)
-cdef dict lexeme_pack(LexemeC* lex):
-cdef dict packed = {}
-packed['ints'] = [lex.ints[i] for i in range(LexInt_N)]
-packed['floats'] = [lex.floats[i] for i in range(LexFloat_N)]
-packed['strings'] = [lex.strings[i].decode('utf8') for i in range(LexStr_N)]
-packed['orth_flags'] = lex.orth_flags
-packed['dist_flags'] = lex.orth_flags
-return packed
-cdef int lexeme_unpack(LexemeC* lex, dict p) except -1:
-cdef size_t i
-cdef int lex_int
-cdef float lex_float
-cdef unicode string
-for i, lex_int in enumerate(p['ints']):
-lex.ints[i] = lex_int
-for i, lex_float in enumerate(p['floats']):
-lex.floats[i] = lex_float
-cdef size_t _
-for i in range(LexStr_N):
-lex_string = p['strings'][i]
-lex.strings[i] = intern_and_encode(lex_string, &_)
-lex.orth_flags = p['orth_flags']
-lex.dist_flags = p['dist_flags']
+cdef int from_dict(Lexeme* lex, dict props, StringStore stroe) except -1:
+pass
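from_string above interns the UTF-8 bytes and records the resulting row id on the lexeme, so strings are stored and compared as integers from here on. A toy stand-in for that id-assigning store (not the real utf8string.StringStore API):

    class ToyStringStore(object):
        # Maps strings to dense integer ids and back, in the spirit of the StringStore.
        def __init__(self):
            self._ids = {}
            self._strings = []

        def intern(self, string):
            if string not in self._ids:
                self._ids[string] = len(self._strings)
                self._strings.append(string)
            return self._ids[string]

        def __getitem__(self, i):
            return self._strings[i]

    store = ToyStringStore()
    i = store.intern(u'hello')
    assert store[i] == u'hello'
    assert store.intern(u'hello') == i   # same string, same id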

View File

@@ -113,8 +113,8 @@ cpdef enum:
 CONTEXT_SIZE
-cdef int get_atoms(atom_t* atoms, LexemeC* p2, LexemeC* p1, LexemeC* n0, LexemeC* n1,
-LexemeC* n2, class_t prev_tag, class_t prev_prev_tag) except -1:
+cdef int get_atoms(atom_t* atoms, Lexeme* p2, Lexeme* p1, Lexeme* n0, Lexeme* n1,
+Lexeme* n2, class_t prev_tag, class_t prev_prev_tag) except -1:
 _fill_token(&atoms[P2i], p2)
 _fill_token(&atoms[P1i], p1)
 _fill_token(&atoms[N0i], n0)
@@ -124,16 +124,16 @@ cdef int get_atoms(atom_t* atoms, LexemeC* p2, LexemeC* p1, LexemeC* n0, LexemeC
 atoms[P2t] = prev_prev_tag
-cdef inline void _fill_token(atom_t* atoms, LexemeC* lex) nogil:
-atoms[0] = lex.ints[<int>LexInt_id]
-atoms[1] = lex.ints[<int>LexInt_cluster]
-atoms[2] = <atom_t>lex.strings[<int>LexStr_norm]
-atoms[3] = <atom_t>lex.strings[<int>LexStr_shape]
-atoms[4] = <atom_t>lex.strings[<int>LexStr_pre]
-atoms[5] = <atom_t>lex.strings[<int>LexStr_suff]
-atoms[6] = lex.dist_flags & (1 << LexDist_title)
-atoms[7] = lex.dist_flags & (1 << LexDist_upper)
+cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil:
+atoms[0] = lex.id
+atoms[1] = lex.cluster
+atoms[2] = lex.norm
+atoms[3] = lex.shape
+atoms[4] = lex.prefix
+atoms[5] = lex.suffix
+atoms[6] = lex.flags & (1 << OFT_TITLE)
+atoms[7] = lex.flags & (1 << OFT_UPPER)
 TEMPLATES = (
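After the change, _fill_token reads struct fields directly instead of indexing the old ints/strings arrays. In spirit (plain Python, a dict in place of the C struct, illustrative bit positions for the OFT_* flags):

    OFT_TITLE, OFT_UPPER = 9, 10   # illustrative positions in the flags enum

    def fill_token(lex):
        # Same field order as _fill_token above.
        return (lex['id'], lex['cluster'], lex['norm'], lex['shape'],
                lex['prefix'], lex['suffix'],
                lex['flags'] & (1 << OFT_TITLE),
                lex['flags'] & (1 << OFT_UPPER))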

View File

@@ -20,6 +20,8 @@ def realign_tagged(token_rules, tagged_line, sep='/'):
 def read_tagged(detoken_rules, file_, sep='/'):
 sentences = []
 for line in file_:
+if not line.strip():
+continue
 line = realign_tagged(detoken_rules, line, sep=sep)
 tokens, tags = _parse_line(line, sep)
 assert len(tokens) == len(tags)
@@ -39,7 +41,7 @@ def _parse_line(line, sep):
 subtags.append('NULL')
 assert len(subtags) == len(subtokens), [t.string for t in subtokens]
 words.append(word)
-tags.extend([Tagger.encode_pos(pos) for pos in subtags])
+tags.extend([Tagger.encode_pos(ptb_to_univ(pos)) for pos in subtags])
 return EN.tokenize(' '.join(words)), tags
@@ -53,3 +55,86 @@ def get_tagdict(train_sents):
 tagdict.setdefault(word, {}).setdefault(tag, 0)
 tagdict[word][tag] += 1
 return tagdict
def ptb_to_univ(tag):
mapping = dict(tuple(line.split()) for line in """
NULL NULL
HYPH .
ADD X
NFP .
AFX X
XX X
BES VERB
HVS VERB
GW X
! .
# .
$ .
'' .
( .
) .
, .
-LRB- .
-RRB- .
. .
: .
? .
CC CONJ
CD NUM
CD|RB X
DT DET
EX DET
FW X
IN ADP
IN|RP ADP
JJ ADJ
JJR ADJ
JJRJR ADJ
JJS ADJ
JJ|RB ADJ
JJ|VBG ADJ
LS X
MD VERB
NN NOUN
NNP NOUN
NNPS NOUN
NNS NOUN
NN|NNS NOUN
NN|SYM NOUN
NN|VBG NOUN
NP NOUN
PDT DET
POS PRT
PRP PRON
PRP$ PRON
PRP|VBP PRON
PRT PRT
RB ADV
RBR ADV
RBS ADV
RB|RP ADV
RB|VBG ADV
RN X
RP PRT
SYM X
TO PRT
UH X
VB VERB
VBD VERB
VBD|VBN VERB
VBG VERB
VBG|NN VERB
VBN VERB
VBP VERB
VBP|TO VERB
VBZ VERB
VP VERB
WDT DET
WH X
WP PRON
WP$ PRON
WRB ADV
`` .""".strip().split('\n'))
return mapping[tag]
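A few spot checks of the new mapping, taken straight from the table above (assuming ptb_to_univ as defined there is in scope):

    assert ptb_to_univ('NNS') == 'NOUN'
    assert ptb_to_univ('JJR') == 'ADJ'
    assert ptb_to_univ('-LRB-') == '.'
    assert ptb_to_univ('XX') == 'X'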

View File

@@ -1,5 +0,0 @@
from spacy.lang cimport Language
cdef class PennTreebank3(Language):
cdef list _split(self, unicode split)

View File

@@ -1,161 +0,0 @@
'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
so that strings can be retrieved from hashes. Use 64-bit hash values and
boldly assume no collisions.
'''
from __future__ import unicode_literals
from libc.stdint cimport uint64_t
cimport spacy
import re
from spacy import orth
TAG_THRESH = 0.5
UPPER_THRESH = 0.2
LOWER_THRESH = 0.5
TITLE_THRESH = 0.7
NR_FLAGS = 0
OFT_UPPER = NR_FLAGS; NR_FLAGS += 1
OFT_LOWER = NR_FLAGS; NR_FLAGS += 1
OFT_TITLE = NR_FLAGS; NR_FLAGS += 1
IS_ALPHA = NR_FLAGS; NR_FLAGS += 1
IS_DIGIT = NR_FLAGS; NR_FLAGS += 1
IS_PUNCT = NR_FLAGS; NR_FLAGS += 1
IS_SPACE = NR_FLAGS; NR_FLAGS += 1
IS_ASCII = NR_FLAGS; NR_FLAGS += 1
IS_TITLE = NR_FLAGS; NR_FLAGS += 1
IS_LOWER = NR_FLAGS; NR_FLAGS += 1
IS_UPPER = NR_FLAGS; NR_FLAGS += 1
CAN_PUNCT = NR_FLAGS; NR_FLAGS += 1
CAN_CONJ = NR_FLAGS; NR_FLAGS += 1
CAN_NUM = NR_FLAGS; NR_FLAGS += 1
CAN_DET = NR_FLAGS; NR_FLAGS += 1
CAN_ADP = NR_FLAGS; NR_FLAGS += 1
CAN_ADJ = NR_FLAGS; NR_FLAGS += 1
CAN_ADV = NR_FLAGS; NR_FLAGS += 1
CAN_VERB = NR_FLAGS; NR_FLAGS += 1
CAN_NOUN = NR_FLAGS; NR_FLAGS += 1
CAN_PDT = NR_FLAGS; NR_FLAGS += 1
CAN_POS = NR_FLAGS; NR_FLAGS += 1
CAN_PRON = NR_FLAGS; NR_FLAGS += 1
CAN_PRT = NR_FLAGS; NR_FLAGS += 1
# List of contractions adapted from Robert MacIntyre's tokenizer.
CONTRACTIONS2 = [re.compile(r"(?i)\b(can)(not)\b"),
re.compile(r"(?i)\b(d)('ye)\b"),
re.compile(r"(?i)\b(gim)(me)\b"),
re.compile(r"(?i)\b(gon)(na)\b"),
re.compile(r"(?i)\b(got)(ta)\b"),
re.compile(r"(?i)\b(lem)(me)\b"),
re.compile(r"(?i)\b(mor)('n)\b"),
re.compile(r"(?i)\b(wan)(na) ")]
CONTRACTIONS3 = [re.compile(r"(?i) ('t)(is)\b"),
re.compile(r"(?i) ('t)(was)\b")]
CONTRACTIONS4 = [re.compile(r"(?i)\b(whad)(dd)(ya)\b"),
re.compile(r"(?i)\b(wha)(t)(cha)\b")]
def nltk_regex_tokenize(text):
# Implementation taken from NLTK 3.0, based on tokenizer.sed
#starting quotes
text = re.sub(r'^\"', r'``', text)
text = re.sub(r'(``)', r' \1 ', text)
text = re.sub(r'([ (\[{<])"', r'\1 `` ', text)
#punctuation
text = re.sub(r'([:,])([^\d])', r' \1 \2', text)
text = re.sub(r'\.\.\.', r' ... ', text)
text = re.sub(r'[;@#$%&]', r' \g<0> ', text)
text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1 \2\3 ', text)
text = re.sub(r'[?!]', r' \g<0> ', text)
text = re.sub(r"([^'])' ", r"\1 ' ", text)
#parens, brackets, etc.
text = re.sub(r'[\]\[\(\)\{\}\<\>]', r' \g<0> ', text)
text = re.sub(r'--', r' -- ', text)
#add extra space to make things easier
text = " " + text + " "
#ending quotes
text = re.sub(r'"', " '' ", text)
text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text)
text = re.sub(r"([^' ])('[sS]|'[mM]|'[dD]|') ", r"\1 \2 ", text)
text = re.sub(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 ",
text)
for regexp in CONTRACTIONS2:
text = regexp.sub(r' \1 \2 ', text)
for regexp in CONTRACTIONS3:
text = regexp.sub(r' \1 \2 ', text)
# We are not using CONTRACTIONS4 since
# they are also commented out in the SED scripts
# for regexp in self.CONTRACTIONS4:
# text = regexp.sub(r' \1 \2 \3 ', text)
return text.split()
cdef class PennTreebank3(Language):
"""Fully PTB compatible English tokenizer, tightly coupled to lexicon.
Attributes:
name (unicode): The two letter code used by Wikipedia for the language.
lexicon (Lexicon): The lexicon. Exposes the lookup method.
"""
def __cinit__(self, name):
flag_funcs = [0 for _ in range(NR_FLAGS)]
flag_funcs[OFT_UPPER] = orth.oft_case('upper', UPPER_THRESH)
flag_funcs[OFT_LOWER] = orth.oft_case('lower', LOWER_THRESH)
flag_funcs[OFT_TITLE] = orth.oft_case('title', TITLE_THRESH)
flag_funcs[IS_ALPHA] = orth.is_alpha
flag_funcs[IS_DIGIT] = orth.is_digit
flag_funcs[IS_PUNCT] = orth.is_punct
flag_funcs[IS_SPACE] = orth.is_space
flag_funcs[IS_TITLE] = orth.is_title
flag_funcs[IS_LOWER] = orth.is_lower
flag_funcs[IS_UPPER] = orth.is_upper
flag_funcs[CAN_PUNCT] = orth.can_tag('PUNCT', TAG_THRESH)
flag_funcs[CAN_CONJ] = orth.can_tag('CONJ', TAG_THRESH)
flag_funcs[CAN_NUM] = orth.can_tag('NUM', TAG_THRESH)
flag_funcs[CAN_DET] = orth.can_tag('DET', TAG_THRESH)
flag_funcs[CAN_ADP] = orth.can_tag('ADP', TAG_THRESH)
flag_funcs[CAN_ADJ] = orth.can_tag('ADJ', TAG_THRESH)
flag_funcs[CAN_VERB] = orth.can_tag('VERB', TAG_THRESH)
flag_funcs[CAN_NOUN] = orth.can_tag('NOUN', TAG_THRESH)
flag_funcs[CAN_PDT] = orth.can_tag('PDT', TAG_THRESH)
flag_funcs[CAN_POS] = orth.can_tag('POS', TAG_THRESH)
flag_funcs[CAN_PRT] = orth.can_tag('PRT', TAG_THRESH)
Language.__init__(self, name, flag_funcs)
cdef list _split(self, unicode chunk):
strings = nltk_regex_tokenize(chunk)
if strings[-1] == '.':
strings.pop()
strings[-1] += '.'
assert strings
return strings
PTB3 = PennTreebank3('ptb3')

View File

@@ -1,59 +1,49 @@
 from cymem.cymem cimport Pool
-from spacy.lexeme cimport LexemeC
+from .lexeme cimport Lexeme
+from .typedefs cimport flag_t
+from .utf8string cimport StringStore
 from thinc.typedefs cimport atom_t
 cdef class Tokens:
 cdef Pool mem
+cdef StringStore _string_store
-cdef LexemeC** _lex_ptr
+cdef Lexeme** _lex_ptr
 cdef int* _idx_ptr
 cdef int* _pos_ptr
-cdef LexemeC** lex
+cdef Lexeme** lex
 cdef int* idx
 cdef int* pos
 cdef int length
 cdef int max_length
-cdef int extend(self, int i, LexemeC** lexemes, int n) except -1
-cdef int push_back(self, int i, LexemeC* lexeme) except -1
+cdef int extend(self, int i, Lexeme** lexemes, int n) except -1
+cdef int push_back(self, int i, Lexeme* lexeme) except -1
-cpdef int id(self, size_t i) except -1
-cpdef float prob(self, size_t i) except 1
-cpdef int cluster(self, size_t i) except *
-cpdef bint check_orth_flag(self, size_t i, size_t flag_id) except *
-cpdef bint check_dist_flag(self, size_t i, size_t flag_id) except *
-cpdef unicode string_view(self, size_t i, size_t view_id)
-cpdef unicode string(self, size_t i)
-cpdef unicode orig(self, size_t i)
-cpdef unicode norm(self, size_t i)
-cpdef unicode shape(self, size_t i)
-cpdef unicode unsparse(self, size_t i)
-cpdef unicode asciied(self, size_t i)
-cpdef bint is_alpha(self, size_t i) except *
-cpdef bint is_ascii(self, size_t i) except *
-cpdef bint is_digit(self, size_t i) except *
-cpdef bint is_lower(self, size_t i) except *
-cpdef bint is_punct(self, size_t i) except *
-cpdef bint is_space(self, size_t i) except *
-cpdef bint is_title(self, size_t i) except *
-cpdef bint is_upper(self, size_t i) except *
-cpdef bint can_adj(self, size_t i) except *
-cpdef bint can_adp(self, size_t i) except *
-cpdef bint can_adv(self, size_t i) except *
-cpdef bint can_conj(self, size_t i) except *
-cpdef bint can_det(self, size_t i) except *
-cpdef bint can_noun(self, size_t i) except *
-cpdef bint can_num(self, size_t i) except *
-cpdef bint can_pdt(self, size_t i) except *
-cpdef bint can_pos(self, size_t i) except *
-cpdef bint can_pron(self, size_t i) except *
-cpdef bint can_prt(self, size_t i) except *
-cpdef bint can_punct(self, size_t i) except *
-cpdef bint can_verb(self, size_t i) except *
-cpdef bint oft_lower(self, size_t i) except *
-cpdef bint oft_title(self, size_t i) except *
-cpdef bint oft_upper(self, size_t i) except *
+cdef class Token:
+cdef StringStore _string_store
+cdef public int i
+cdef public int idx
+cdef public int pos
+cdef public atom_t id
+cdef public atom_t cluster
+cdef public atom_t length
+cdef public atom_t lex_pos
+cdef public atom_t lex_supersense
+cdef public atom_t norm
+cdef public atom_t shape
+cdef public atom_t vocab10k
+cdef public atom_t asciied
+cdef public atom_t prefix
+cdef public atom_t suffix
+cdef public float prob
+cdef public flag_t flags

View File

@@ -1,10 +1,6 @@
 # cython: profile=True
-from .word cimport Lexeme
 from .lexeme cimport *
-cimport numpy
 cimport cython
-import numpy
 DEF PADDING = 5
@@ -34,7 +30,8 @@ cdef class Tokens:
 >>> tokens.can_noun(1)
 True
 """
-def __init__(self, string_length=0):
+def __init__(self, StringStore string_store, string_length=0):
+self._string_store = string_store
 if string_length >= 3:
 size = int(string_length / 3.0)
 else:
@@ -43,7 +40,7 @@
 # Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
 # However, we need to remember the true starting places, so that we can
 # realloc.
-self._lex_ptr = <LexemeC**>self.mem.alloc(size + (PADDING*2), sizeof(LexemeC*))
+self._lex_ptr = <Lexeme**>self.mem.alloc(size + (PADDING*2), sizeof(Lexeme*))
 self._idx_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
 self._pos_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
 self.lex = self._lex_ptr
@@ -55,39 +52,26 @@
 self.lex += PADDING
 self.idx += PADDING
 self.pos += PADDING
 self.max_length = size
 self.length = 0
 def __getitem__(self, i):
 bounds_check(i, self.length, PADDING)
-return Lexeme(<size_t>self.lex[i])
+return Token(self._string_store, i, self.idx[i], self.pos[i], self.lex[i][0])
 def __len__(self):
 return self.length
-cdef int push_back(self, int idx, LexemeC* lexeme) except -1:
+cdef int push_back(self, int idx, Lexeme* lexeme) except -1:
 if self.length == self.max_length:
 self._realloc(self.length * 2)
 self.lex[self.length] = lexeme
 self.idx[self.length] = idx
 self.pos[self.length] = 0
 self.length += 1
-return idx + lexeme.ints[<int>LexInt_length]
+return idx + lexeme.length
-def _realloc(self, new_size):
-self.max_length = new_size
-n = new_size + (PADDING * 2)
-self._lex_ptr = <LexemeC**>self.mem.realloc(self._lex_ptr, n * sizeof(LexemeC*))
-self._idx_ptr = <int*>self.mem.realloc(self._idx_ptr, n * sizeof(int))
-self._pos_ptr = <int*>self.mem.realloc(self._pos_ptr, n * sizeof(int))
-self.lex = self._lex_ptr + PADDING
-self.idx = self._idx_ptr + PADDING
-self.pos = self._pos_ptr + PADDING
-for i in range(self.length, self.max_length + PADDING):
-self.lex[i] = &EMPTY_LEXEME
-cdef int extend(self, int idx, LexemeC** lexemes, int n) except -1:
+cdef int extend(self, int idx, Lexeme** lexemes, int n) except -1:
 cdef int i
 if lexemes == NULL:
 return idx
@@ -101,154 +85,43 @@ cdef class Tokens:
 idx = self.push_back(idx, lexemes[i])
 return idx
-cpdef int id(self, size_t i) except -1:
-bounds_check(i, self.length, PADDING)
-return self.lex[i].ints[<int>LexInt_id]
-cpdef float prob(self, size_t i) except 1:
-bounds_check(i, self.length, PADDING)
-return self.lex[i].floats[<int>LexFloat_prob]
-cpdef int cluster(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return self.lex[i].ints[<int>LexInt_cluster]
-cpdef bint check_orth_flag(self, size_t i, size_t flag_id) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_orth_flag(self.lex[i], flag_id)
-cpdef bint check_dist_flag(self, size_t i, size_t flag_id) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], flag_id)
-cpdef unicode string_view(self, size_t i, size_t view_id):
-bounds_check(i, self.length, PADDING)
-return lexeme_get_string(self.lex[i], view_id)
-# Provide accessor methods for the features supported by the language.
-# Without these, clients have to use the underlying string_view and check_flag
-# methods, which requires them to know the IDs.
-cpdef unicode string(self, size_t i):
-bounds_check(i, self.length, PADDING)
-return self.orig(i)
-cpdef unicode orig(self, size_t i):
-bounds_check(i, self.length, PADDING)
-cdef bytes utf8_string = self.lex[i].strings[<int>LexStr_orig]
-cdef unicode string = utf8_string.decode('utf8')
-return string
-cpdef unicode norm(self, size_t i):
-bounds_check(i, self.length, PADDING)
-cdef bytes utf8_string = self.lex[i].strings[<int>LexStr_norm]
-cdef unicode string = utf8_string.decode('utf8')
-return string
-cpdef unicode shape(self, size_t i):
-bounds_check(i, self.length, PADDING)
-return lexeme_get_string(self.lex[i], LexStr_shape)
-cpdef unicode unsparse(self, size_t i):
-bounds_check(i, self.length, PADDING)
-return lexeme_get_string(self.lex[i], LexStr_unsparse)
-cpdef unicode asciied(self, size_t i):
-bounds_check(i, self.length, PADDING)
-return lexeme_get_string(self.lex[i], LexStr_asciied)
-cpdef bint is_alpha(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_orth_flag(self.lex[i], LexOrth_alpha)
-cpdef bint is_ascii(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_orth_flag(self.lex[i], LexOrth_ascii)
-cpdef bint is_digit(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_orth_flag(self.lex[i], LexOrth_digit)
-cpdef bint is_lower(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_orth_flag(self.lex[i], LexOrth_lower)
-cpdef bint is_punct(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_orth_flag(self.lex[i], LexOrth_punct)
-cpdef bint is_space(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_orth_flag(self.lex[i], LexOrth_space)
-cpdef bint is_title(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_orth_flag(self.lex[i], LexOrth_title)
-cpdef bint is_upper(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_orth_flag(self.lex[i], LexOrth_upper)
-cpdef bint can_adj(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_adj)
-cpdef bint can_adp(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_adp)
-cpdef bint can_adv(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_adv)
-cpdef bint can_conj(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_conj)
-cpdef bint can_det(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_det)
-cpdef bint can_noun(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_noun)
-cpdef bint can_num(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_num)
-cpdef bint can_pdt(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_pdt)
-cpdef bint can_pos(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_pos)
-cpdef bint can_pron(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_pron)
-cpdef bint can_prt(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_prt)
-cpdef bint can_punct(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_punct)
-cpdef bint can_verb(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_verb)
-cpdef bint oft_lower(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_lower)
-cpdef bint oft_title(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_title)
-cpdef bint oft_upper(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_upper)
+def _realloc(self, new_size):
+self.max_length = new_size
+n = new_size + (PADDING * 2)
+self._lex_ptr = <Lexeme**>self.mem.realloc(self._lex_ptr, n * sizeof(Lexeme*))
+self._idx_ptr = <int*>self.mem.realloc(self._idx_ptr, n * sizeof(int))
+self._pos_ptr = <int*>self.mem.realloc(self._pos_ptr, n * sizeof(int))
+self.lex = self._lex_ptr + PADDING
+self.idx = self._idx_ptr + PADDING
+self.pos = self._pos_ptr + PADDING
+for i in range(self.length, self.max_length + PADDING):
+self.lex[i] = &EMPTY_LEXEME
+@cython.freelist(64)
+cdef class Token:
+def __init__(self, StringStore string_store, int i, int idx, int pos, dict lex):
+self._string_store = string_store
+self.i = i
+self.idx = idx
+self.pos = pos
+self.id = lex['id']
+self.cluster = lex['cluster']
+self.length = lex['length']
+self.lex_pos = lex['pos']
+self.lex_supersense = lex['supersense']
+self.norm = lex['norm']
+self.shape = lex['shape']
+self.vocab10k = lex['vocab10k']
+self.suffix = lex['asciied']
+self.prefix = lex['prefix']
+self.prob = lex['prob']
+self.flags = lex['flags']
+property string:
+def __get__(self):
+cdef bytes utf8string = self._string_store[self.id]
+return utf8string.decode('utf8')
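The net effect of the Tokens/Token rewrite above: indexing a Tokens object now returns a small Token value object that copies the lexeme's fields and resolves .string through the shared store, replacing the old cpdef accessor methods. A toy mirror of that shape (plain Python, a dict standing in for the StringStore, attribute set reduced for brevity):

    class ToyToken(object):
        def __init__(self, string_store, i, idx, pos, lex):
            self._string_store = string_store
            self.i, self.idx, self.pos = i, idx, pos
            self.id = lex['id']
            self.cluster = lex['cluster']
            self.flags = lex['flags']

        @property
        def string(self):
            return self._string_store[self.id]

    store = {0: u'hello'}   # stands in for the StringStore
    tok = ToyToken(store, 0, 0, 0, {'id': 0, 'cluster': 0, 'flags': 0})
    assert tok.string == u'hello'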

View File

@@ -1,12 +0,0 @@
from .typedefs cimport hash_t, utf8_t, flag_t, id_t
from spacy.lexeme cimport LexemeC
DEF MAX_FLAG = 64
cdef class Lexeme:
cdef LexemeC* _c
cpdef bint check_orth_flag(self, size_t flag_id) except *
cpdef bint check_dist_flag(self, size_t flag_id) except *
cpdef unicode string_view(self, size_t view_id)

View File

@@ -1,80 +0,0 @@
# cython: profile=True
# cython: embedsignature=True
from .lexeme cimport lexeme_get_string
from .lexeme cimport lexeme_check_orth_flag, lexeme_check_dist_flag
from .lexeme cimport *
cdef class Lexeme:
"""A lexical type --- a word, punctuation symbol, whitespace sequence, etc
keyed by a case-sensitive unicode string. All tokens with the same string,
e.g. all instances of "dog", ",", "NASA" etc should be mapped to the same
Lexeme.
You should avoid instantiating Lexemes directly, and instead use the
:py:meth:`space.lang.Language.tokenize` and :py:meth:`spacy.lang.Language.lookup`
methods on the global object exposed by the language you're working with,
e.g. :py:data:`spacy.en.EN`.
Attributes:
string (unicode):
The unicode string.
Implemented as a property; relatively expensive.
length (size_t):
The number of unicode code-points in the string.
prob (double):
An estimate of the word's unigram log probability.
Probabilities are calculated from a large text corpus, and smoothed using
simple Good-Turing. Estimates are read from data/en/probabilities, and
can be replaced using spacy.en.load_probabilities.
cluster (size_t):
An integer representation of the word's Brown cluster.
A Brown cluster is an address into a binary tree, which gives some (noisy)
information about the word's distributional context.
>>> strings = (u'pineapple', u'apple', u'dapple', u'scalable')
>>> print ["{0:b}".format(lookup(s).cluster) for s in strings]
["100111110110", "100111100100", "01010111011001", "100111110110"]
The clusterings are unideal, but often slightly useful.
"pineapple" and "apple" share a long prefix, indicating a similar meaning,
while "dapple" is totally different. On the other hand, "scalable" receives
the same cluster ID as "pineapple", which is not what we'd like.
"""
def __cinit__(self, size_t lexeme_addr):
self._c = <LexemeC*>lexeme_addr
property string:
def __get__(self):
cdef bytes utf8_string = self._c.strings[<int>LexStr_orig]
cdef unicode string = utf8_string.decode('utf8')
return string
property prob:
def __get__(self):
return self._c.floats[<int>LexFloat_prob]
property cluster:
def __get__(self):
return self._c.ints[<int>LexInt_cluster]
property length:
def __get__(self):
return self._c.ints[<int>LexInt_length]
cpdef bint check_orth_flag(self, size_t flag_id) except *:
return lexeme_check_orth_flag(self._c, flag_id)
cpdef bint check_dist_flag(self, size_t flag_id) except *:
return lexeme_check_dist_flag(self._c, flag_id)
cpdef unicode string_view(self, size_t view_id):
return lexeme_get_string(self._c, view_id)
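The Brown-cluster remark in the docstring above is about shared bit-string prefixes; using the example values it quotes:

    clusters = {
        u'pineapple': '100111110110',
        u'apple':     '100111100100',
        u'dapple':    '01010111011001',
    }

    def shared_prefix(a, b):
        n = 0
        while n < min(len(a), len(b)) and a[n] == b[n]:
            n += 1
        return n

    assert shared_prefix(clusters[u'pineapple'], clusters[u'apple']) == 7    # similar words
    assert shared_prefix(clusters[u'pineapple'], clusters[u'dapple']) == 0   # unrelated word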

View File

@@ -5,8 +5,8 @@ from spacy.en import EN
 def test_possess():
 tokens = EN.tokenize("Mike's")
-assert tokens[0].string == "Mike"
-assert tokens[1].string == "'s"
+assert EN.lexicon.strings[tokens[0].id] == "Mike"
+assert EN.lexicon.strings[tokens[1].id] == "'s"
 assert len(tokens) == 2

View File

@@ -8,19 +8,17 @@ from spacy.lexeme import *
 def test_is_alpha():
 the = EN.lexicon.lookup('the')
-assert the.check_orth_flag(LexOrth_alpha)
+assert the['flags'] & (1 << IS_ALPHA)
 year = EN.lexicon.lookup('1999')
-assert not year.check_orth_flag(LexOrth_alpha)
+assert not year['flags'] & (1 << IS_ALPHA)
 mixed = EN.lexicon.lookup('hello1')
-assert not mixed.check_orth_flag(LexOrth_alpha)
+assert not mixed['flags'] & (1 << IS_ALPHA)
 def test_is_digit():
 the = EN.lexicon.lookup('the')
-assert not the.check_orth_flag(LexOrth_digit)
+assert not the['flags'] & (1 << IS_DIGIT)
 year = EN.lexicon.lookup('1999')
-assert year.check_orth_flag(LexOrth_digit)
+assert year['flags'] & (1 << IS_DIGIT)
 mixed = EN.lexicon.lookup('hello1')
-assert not mixed.check_orth_flag(LexOrth_digit)
+assert not mixed['flags'] & (1 << IS_DIGIT)

View File

@@ -1,27 +0,0 @@
from __future__ import unicode_literals
import pytest
import spacy.word
from spacy.en import EN
from spacy.lexeme import *
@pytest.fixture
def C3P0():
return EN.lexicon.lookup("C3P0")
def test_shape(C3P0):
assert C3P0.string_view(LexStr_shape) == "XdXd"
def test_length():
t = EN.lexicon.lookup('the')
assert t.length == 3
t = EN.lexicon.lookup("n't")
assert t.length == 3
t = EN.lexicon.lookup("'s")
assert t.length == 2
t = EN.lexicon.lookup('Xxxx')
assert t.length == 4

View File

@@ -8,9 +8,9 @@ from spacy.en import EN
 def test_one():
 tokens = EN.tokenize('Betty Botter bought a pound of butter.')
-assert tokens.string(0) == 'Betty'
+assert tokens[0].string == 'Betty'
 tokens2 = EN.tokenize('Betty also bought a pound of butter.')
-assert tokens2.string(0) == 'Betty'
+assert tokens2[0].string == 'Betty'

View File

@@ -5,41 +5,39 @@ from spacy.en import EN
 def test_single_word():
-lex_ids = EN.tokenize(u'hello')
-assert lex_ids[0].string == EN.lexicon.lookup(u'hello').string
+tokens = EN.tokenize(u'hello')
+assert tokens[0].string == 'hello'
 def test_two_words():
-words = EN.tokenize('hello possums')
-assert len(words) == 2
-assert words[0].string == EN.lexicon.lookup('hello').string
-assert words[0].string != words[1].string
+tokens = EN.tokenize('hello possums')
+assert len(tokens) == 2
+assert tokens[0].string != tokens[1].string
 def test_punct():
 tokens = EN.tokenize('hello, possums.')
 assert len(tokens) == 4
-assert tokens[0].string == EN.lexicon.lookup('hello').string
-assert tokens[1].string == EN.lexicon.lookup(',').string
-assert tokens[2].string == EN.lexicon.lookup('possums').string
-assert tokens[1].string != EN.lexicon.lookup('hello').string
+assert tokens[0].string == 'hello'
+assert tokens[1].string == ','
+assert tokens[2].string == 'possums'
+assert tokens[1].string != 'hello'
 def test_digits():
-lex_ids = EN.tokenize('The year: 1984.')
-assert lex_ids.orig(3) == "1984"
-assert len(lex_ids) == 5
-assert lex_ids[0].string == EN.lexicon.lookup('The').string
-assert lex_ids[3].string == EN.lexicon.lookup('1984').string
+tokens = EN.tokenize('The year: 1984.')
+assert len(tokens) == 5
+assert tokens[0].id == EN.lexicon.lookup('The')['id']
+assert tokens[3].id == EN.lexicon.lookup('1984')['id']
 def test_contraction():
-lex_ids = EN.tokenize("don't giggle")
-assert len(lex_ids) == 3
-assert lex_ids[1].string == EN.lexicon.lookup("not").string
-lex_ids = EN.tokenize("i said don't!")
-assert len(lex_ids) == 5
-assert lex_ids[4].string == EN.lexicon.lookup('!').string
+tokens = EN.tokenize("don't giggle")
+assert len(tokens) == 3
+assert tokens[1].id == EN.lexicon.lookup("not")['id']
+tokens = EN.tokenize("i said don't!")
+assert len(tokens) == 5
+assert tokens[4].id == EN.lexicon.lookup('!')['id']
 def test_contraction_punct():

View File

@@ -5,30 +5,19 @@ from spacy.en import EN
 def test_neq():
 addr = EN.lexicon.lookup('Hello')
-assert EN.lexicon.lookup('bye').string != addr.string
+assert EN.lexicon.lookup('bye')['id'] != addr['id']
 def test_eq():
 addr = EN.lexicon.lookup('Hello')
-assert EN.lexicon.lookup('Hello').string == addr.string
+assert EN.lexicon.lookup('Hello')['id'] == addr['id']
-def test_round_trip():
-hello = EN.lexicon.lookup('Hello')
-assert hello.string == 'Hello'
 def test_case_neq():
 addr = EN.lexicon.lookup('Hello')
-assert EN.lexicon.lookup('hello').string != addr.string
+assert EN.lexicon.lookup('hello')['id'] != addr['id']
 def test_punct_neq():
 addr = EN.lexicon.lookup('Hello')
-assert EN.lexicon.lookup('Hello,').string != addr.string
+assert EN.lexicon.lookup('Hello,')['id'] != addr['id']
-def test_short():
-addr = EN.lexicon.lookup('I')
-assert addr.string == 'I'
-assert addr.string != 'not'