* Large refactor, particularly to Python API

Matthew Honnibal 2014-10-24 00:59:17 +11:00
parent 168b2b8cb2
commit 08ce602243
21 changed files with 327 additions and 958 deletions

View File

@@ -1,42 +0,0 @@
from spacy.spacy cimport Language
from spacy.word cimport Lexeme
cimport cython
cpdef size_t ALPHA
cpdef size_t DIGIT
cpdef size_t PUNCT
cpdef size_t SPACE
cpdef size_t LOWER
cpdef size_t UPPER
cpdef size_t TITLE
cpdef size_t ASCII
cpdef size_t OFT_LOWER
cpdef size_t OFT_TITLE
cpdef size_t OFT_UPPER
cpdef size_t PUNCT
cpdef size_t CONJ
cpdef size_t NUM
cpdef size_t N
cpdef size_t DET
cpdef size_t ADP
cpdef size_t ADJ
cpdef size_t ADV
cpdef size_t VERB
cpdef size_t NOUN
cpdef size_t PDT
cpdef size_t POS
cpdef size_t PRON
cpdef size_t PRT
cdef class English(spacy.Language):
cdef int find_split(self, unicode word)
cdef English EN
cpdef Word lookup(unicode word)
cpdef list tokenize(unicode string)

View File

@@ -1,126 +0,0 @@
# cython: profile=True
# cython: embedsignature=True
'''Tokenize German text, using a scheme based on the Negra corpus.
Tokenization is generally similar to English text, and the same set of orthographic
flags is used.
An abbreviation list is used to handle common abbreviations. Hyphenated words
are not split, following the Treebank usage.
'''
from __future__ import unicode_literals
from libc.stdint cimport uint64_t
cimport spacy
from spacy.orth import is_alpha, is_digit, is_punct, is_space, is_lower, is_upper, is_ascii
from spacy.orth import canonicalize_case, get_string_shape, asciify, get_non_sparse
from spacy.common cimport check_punct
# Python-readable flag constants --- can't read an enum from Python
# Don't want to manually assign these numbers, or we'll insert one and have to
# change them all.
# Don't use "i", as we don't want it in the global scope!
cdef size_t __i = 0
ALPHA = __i; __i += 1
DIGIT = __i; __i += 1
PUNCT = __i; __i += 1
SPACE = __i; __i += 1
LOWER = __i; __i += 1
UPPER = __i; __i += 1
TITLE = __i; __i += 1
ASCII = __i; __i += 1
OFT_LOWER = __i; __i += 1
OFT_UPPER = __i; __i += 1
OFT_TITLE = __i; __i += 1
PUNCT = __i; __i += 1
CONJ = __i; __i += 1
NUM = __i; __i += 1
X = __i; __i += 1
DET = __i; __i += 1
ADP = __i; __i += 1
ADJ = __i; __i += 1
ADV = __i; __i += 1
VERB = __i; __i += 1
NOUN = __i; __i += 1
PDT = __i; __i += 1
POS = __i; __i += 1
PRON = __i; __i += 1
PRT = __i; __i += 1
# These are for the string views
__i = 0
SIC = __i; __i += 1
CANON_CASED = __i; __i += 1
NON_SPARSE = __i; __i += 1
SHAPE = __i; __i += 1
NR_STRING_VIEWS = __i
def get_string_views(unicode string, lexeme):
views = ['' for _ in range(NR_STRING_VIEWS)]
views[SIC] = string
views[CANON_CASED] = canonicalize_case(string, lexeme)
views[SHAPE] = get_string_shape(string)
views[ASCIIFIED] = get_asciified(string)
views[FIXED_VOCAB] = get_non_sparse(string, views[ASCIIFIED], views[CANON_CASED],
views[SHAPE], lexeme)
return views
def set_orth_flags(unicode string, flags_t flags):
setters = [
(ALPHA, is_alpha),
(DIGIT, is_digit),
(PUNCT, is_punct),
(SPACE, is_space),
(LOWER, is_lower),
(UPPER, is_upper),
(SPACE, is_space)
]
for bit, setter in setters:
if setter(string):
flags |= 1 << bit
return flags
cdef class German(spacy.Language):
cdef Lexeme new_lexeme(self, unicode string, cluster=0, case_stats=None,
tag_freqs=None):
return Lexeme(s, length, views, prob=prob, cluster=cluster,
                      flags=self.get_flags(string))
cdef int find_split(self, unicode word):
cdef size_t length = len(word)
cdef int i = 0
if word.startswith("'s") or word.startswith("'S"):
return 2
# Contractions
if word.endswith("'s") and length >= 3:
return length - 2
# Leading punctuation
if check_punct(word, 0, length):
return 1
elif length >= 1:
# Split off all trailing punctuation characters
i = 0
while i < length and not check_punct(word, i, length):
i += 1
return i
DE = German('de')
lookup = DE.lookup
tokenize = DE.tokenize
load_clusters = DE.load_clusters
load_unigram_probs = DE.load_unigram_probs
load_case_stats = DE.load_case_stats
load_tag_stats = DE.load_tag_stats
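
For orientation, a minimal stand-alone sketch (not part of the diff) of the bit-flag scheme the auto-numbered constants and set_orth_flags above rely on: each property owns a bit position, predicates set bits with flags |= 1 << bit, and callers test them with a mask. The lambdas stand in for the spacy.orth predicates.

# Illustrative sketch only; string methods stand in for spacy.orth functions.
ALPHA, DIGIT, PUNCT, SPACE, LOWER, UPPER, TITLE = range(7)

def set_orth_flags(string):
    setters = [
        (ALPHA, lambda s: s.isalpha()),
        (DIGIT, lambda s: s.isdigit()),
        (SPACE, lambda s: s.isspace()),
        (LOWER, lambda s: s.islower()),
        (UPPER, lambda s: s.isupper()),
        (TITLE, lambda s: s.istitle()),
    ]
    flags = 0
    for bit, setter in setters:
        if setter(string):
            flags |= 1 << bit
    return flags

def check_flag(flags, bit):
    return bool(flags & (1 << bit))

assert check_flag(set_orth_flags(u'Hello'), ALPHA)
assert not check_flag(set_orth_flags(u'1999'), ALPHA)
assert check_flag(set_orth_flags(u'1999'), DIGIT)
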

View File

@@ -1,5 +1,4 @@
from spacy.lang cimport Language
from spacy.word cimport Lexeme
from spacy.tokens cimport Tokens

View File

@@ -1,14 +1,12 @@
from libc.stdint cimport uint32_t
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector
from libc.stdint cimport uint64_t, int64_t
from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool
from .word cimport Lexeme
from .typedefs cimport hash_t
from .tokens cimport Tokens
from .lexeme cimport LexemeC
from .lexeme cimport Lexeme
from .utf8string cimport StringStore
cdef extern from "Python.h":
@@ -21,23 +19,25 @@ cdef extern from "Python.h":
cdef struct String:
Py_UNICODE* chars
size_t n
uint64_t key
hash_t key
cdef class Lexicon:
cdef Pool mem
cpdef readonly size_t size
cpdef readonly StringStore strings
cdef vector[LexemeC*] lexemes
cdef vector[Lexeme*] lexemes
cpdef Lexeme lookup(self, unicode string)
cdef LexemeC* get(self, String* s) except NULL
cdef Lexeme* get(self, String* s) except NULL
cdef PreshMap _dict
cdef list _string_features
cdef list _flag_features
cdef class Language:
cdef Pool _mem
cdef unicode name
@@ -52,12 +52,12 @@ cdef class Language:
cpdef Tokens tokenize(self, unicode text)
cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1
cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes) except NULL
cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
vector[Lexeme*] *suffixes) except NULL
cdef int _attach_tokens(self, Tokens tokens, int idx, String* string,
vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except -1
cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1
cdef int _save_cached(self, LexemeC** tokens, uint64_t key, int n) except -1
cdef int _save_cached(self, Lexeme** tokens, hash_t key, int n) except -1
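
As a rough picture of what the _split_affixes / _find_prefix / _find_suffix declarations above are for (a sketch, not the commit's code): recognised punctuation is peeled off both ends of a whitespace-delimited span, the prefixes and suffixes are kept aside, and only the stripped remainder goes to the lexicon. The two regexes below are simplified stand-ins for the real affix tables.

import re

PREFIX = re.compile(r'^[\(\["\']')          # assumed, simplified prefix set
SUFFIX = re.compile(r'[\)\]"\'\.,;!?]$')    # assumed, simplified suffix set

def split_affixes(span):
    prefixes, suffixes = [], []
    while span:
        m = PREFIX.search(span)
        if m:
            prefixes.append(m.group())
            span = span[m.end():]
            continue
        m = SUFFIX.search(span)
        if m:
            suffixes.insert(0, m.group())   # the real code stores these reversed
            span = span[:m.start()]
            continue
        break
    return prefixes, span, suffixes

assert split_affixes(u'("Hello!")') == (['(', '"'], u'Hello', ['!', '"', ')'])
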

View File

@@ -13,22 +13,21 @@ import random
from os import path
import re
from .util import read_lang_data
from .tokens import Tokens
from .lexeme cimport LexemeC, get_lexeme_dict, lexeme_pack, lexeme_unpack
from .lexeme cimport LexStr_orig
from murmurhash.mrmr cimport hash64
from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool
from cython.operator cimport preincrement as preinc
from cython.operator cimport dereference as deref
from murmurhash.mrmr cimport hash64
from preshed.maps cimport PreshMap
from spacy import orth
from spacy import util
from .lexeme cimport Lexeme
from .lexeme cimport from_dict as lexeme_from_dict
from .lexeme cimport from_string as lexeme_from_string
from . import orth
from . import util
from .util import read_lang_data
from .tokens import Tokens
cdef class Language:
@@ -64,7 +63,7 @@ cdef class Language:
tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
"""
cdef int length = len(string)
cdef Tokens tokens = Tokens(length)
cdef Tokens tokens = Tokens(self.lexicon.strings, length)
if length == 0:
return tokens
cdef int i = 0
@@ -76,7 +75,7 @@ cdef class Language:
if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
if start < i:
string_slice(&span, chars, start, i)
lexemes = <LexemeC**>self.cache.get(span.key)
lexemes = <Lexeme**>self.cache.get(span.key)
if lexemes != NULL:
tokens.extend(start, lexemes, 0)
else:
@@ -88,7 +87,7 @@ cdef class Language:
i += 1
if start < i:
string_slice(&span, chars, start, i)
lexemes = <LexemeC**>self.cache.get(span.key)
lexemes = <Lexeme**>self.cache.get(span.key)
if lexemes != NULL:
tokens.extend(start, lexemes, 0)
else:
@@ -96,9 +95,9 @@ cdef class Language:
return tokens
cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1:
cdef vector[LexemeC*] prefixes
cdef vector[LexemeC*] suffixes
cdef uint64_t orig_key
cdef vector[Lexeme*] prefixes
cdef vector[Lexeme*] suffixes
cdef hash_t orig_key
cdef int orig_size
orig_key = span.key
orig_size = tokens.length
@@ -106,8 +105,8 @@ cdef class Language:
self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
self._save_cached(&tokens.lex[orig_size], orig_key, tokens.length - orig_size)
cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes) except NULL:
cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
vector[Lexeme*] *suffixes) except NULL:
cdef size_t i
cdef String prefix
cdef String suffix
@@ -150,15 +149,15 @@ cdef class Language:
cdef int _attach_tokens(self, Tokens tokens,
int idx, String* string,
vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes) except -1:
vector[Lexeme*] *prefixes,
vector[Lexeme*] *suffixes) except -1:
cdef int split
cdef LexemeC** lexemes
cdef LexemeC* lexeme
cdef Lexeme** lexemes
cdef Lexeme* lexeme
cdef String span
idx = tokens.extend(idx, prefixes.data(), prefixes.size())
if string.n != 0:
lexemes = <LexemeC**>self.cache.get(string.key)
lexemes = <Lexeme**>self.cache.get(string.key)
if lexemes != NULL:
idx = tokens.extend(idx, lexemes, 0)
else:
@@ -172,13 +171,13 @@ cdef class Language:
idx = tokens.push_back(idx, self.lexicon.get(&span))
string_slice(&span, string.chars, split + 1, string.n)
idx = tokens.push_back(idx, self.lexicon.get(&span))
cdef vector[LexemeC*].reverse_iterator it = suffixes.rbegin()
cdef vector[Lexeme*].reverse_iterator it = suffixes.rbegin()
while it != suffixes.rend():
idx = tokens.push_back(idx, deref(it))
preinc(it)
cdef int _save_cached(self, LexemeC** tokens, uint64_t key, int n) except -1:
lexemes = <LexemeC**>self._mem.alloc(n + 1, sizeof(LexemeC**))
cdef int _save_cached(self, Lexeme** tokens, hash_t key, int n) except -1:
lexemes = <Lexeme**>self._mem.alloc(n + 1, sizeof(Lexeme**))
cdef int i
for i in range(n):
lexemes[i] = tokens[i]
@@ -212,14 +211,14 @@ cdef class Language:
token_rules (list): A list of (chunk, tokens) pairs, where chunk is
a string and tokens is a list of strings.
'''
cdef LexemeC** lexemes
cdef uint64_t hashed
cdef Lexeme** lexemes
cdef hash_t hashed
cdef String string
for uni_string, substrings in token_rules:
lexemes = <LexemeC**>self._mem.alloc(len(substrings) + 1, sizeof(LexemeC*))
lexemes = <Lexeme**>self._mem.alloc(len(substrings) + 1, sizeof(Lexeme*))
for i, substring in enumerate(substrings):
string_from_unicode(&string, substring)
lexemes[i] = <LexemeC*>self.lexicon.get(&string)
lexemes[i] = <Lexeme*>self.lexicon.get(&string)
lexemes[i + 1] = NULL
string_from_unicode(&string, uni_string)
self.specials.set(string.key, lexemes)
@@ -227,33 +226,29 @@ cdef class Language:
cdef class Lexicon:
def __cinit__(self, lexemes):
def __init__(self, lexemes):
self.mem = Pool()
self._dict = PreshMap(2 ** 20)
self.strings = StringStore()
self.size = 0
cdef String string
cdef dict lexeme_dict
cdef LexemeC* lexeme
for py_string, lexeme_dict in lexemes.iteritems():
string_from_unicode(&string, py_string)
lexeme = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
lexeme_unpack(lexeme, lexeme_dict)
self._dict.set(string.key, lexeme)
self.lexemes.push_back(lexeme)
self.size += 1
cdef Lexeme* lexeme
#for py_string, lexeme_dict in lexemes.iteritems():
# string_from_unicode(&string, py_string)
# lexeme = <Lexeme*>self.mem.alloc(1, sizeof(Lexeme))
# lexeme_from_dict(lexeme, lexeme_dict, self.strings)
# self._dict.set(string.key, lexeme)
# self.lexemes.push_back(lexeme)
# self.size += 1
def __getitem__(self, size_t i):
return Lexeme(<size_t>self.lexemes.at(i))
cdef LexemeC* get(self, String* string) except NULL:
cdef LexemeC* lex
lex = <LexemeC*>self._dict.get(string.key)
cdef Lexeme* get(self, String* string) except NULL:
cdef Lexeme* lex
lex = <Lexeme*>self._dict.get(string.key)
if lex != NULL:
return lex
lex = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
cdef unicode unicode_string = string.chars[:string.n]
lexeme_unpack(lex, get_lexeme_dict(self.size, unicode_string))
lex = <Lexeme*>self.mem.alloc(1, sizeof(Lexeme))
lexeme_from_string(lex, string.chars[:string.n], self.strings)
self._dict.set(string.key, lex)
self.lexemes.push_back(lex)
self.size += 1
@@ -270,8 +265,8 @@ cdef class Lexicon:
"""
cdef String string
string_from_unicode(&string, uni_string)
cdef LexemeC* lexeme = self.get(&string)
return Lexeme(<size_t>lexeme)
cdef Lexeme* lexeme = self.get(&string)
return lexeme[0]
cdef void string_from_unicode(String* s, unicode uni):
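
The tokenize loop above walks the string once, flips state at whitespace boundaries, and serves each non-space span from a cache before falling back to the affix-splitting path. Below is a simplified pure-Python sketch of that control flow only; the dict cache and the tokenize_span callback stand in for the PreshMap keyed by hash64 and for _tokenize, and whitespace spans are simply skipped here.

def tokenize(text, cache, tokenize_span):
    # tokenize_span(span) -> list of tokens; results are memoised per span.
    tokens = []
    if not text:
        return tokens
    start = 0
    in_ws = text[0].isspace()
    for i, ch in enumerate(text):
        if ch.isspace() != in_ws:
            if start < i and not in_ws:
                span = text[start:i]
                if span not in cache:
                    cache[span] = tokenize_span(span)
                tokens.extend(cache[span])
            in_ws = not in_ws
            start = i
    if start < len(text) and not in_ws:
        span = text[start:]
        if span not in cache:
            cache[span] = tokenize_span(span)
        tokens.extend(cache[span])
    return tokens

cache = {}
print(tokenize(u'hello, possums.', cache, lambda s: [s]))   # identity splitter
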

View File

@@ -1,94 +1,55 @@
from .typedefs cimport hash_t, utf8_t, flag_t, id_t
from cymem.cymem cimport Pool
from thinc.typedefs cimport atom_t
from .utf8string cimport StringStore
cpdef flag_t OOV_DIST_FLAGS
# Flags
cpdef enum:
IS_ALPHA
IS_ASCII
IS_DIGIT
IS_LOWER
IS_PUNCT
IS_SPACE
IS_TITLE
IS_UPPER
cpdef enum LexInts:
LexInt_id
LexInt_length
LexInt_cluster
LexInt_pos
LexInt_supersense
LexInt_N
OFT_LOWER
OFT_TITLE
OFT_UPPER
cpdef enum LexFloats:
LexFloat_prob
LexFloat_sentiment
LexFloat_N
cdef struct Lexeme:
atom_t id
atom_t length
atom_t norm
atom_t shape
atom_t vocab10k
atom_t asciied
atom_t prefix
atom_t suffix
atom_t cluster
atom_t pos
atom_t supersense
float prob
flag_t flags
cpdef enum LexStrs:
LexStr_orig
LexStr_norm
LexStr_shape
LexStr_unsparse
LexStr_asciied
LexStr_pre
LexStr_suff
LexStr_N
cdef Lexeme EMPTY_LEXEME
cpdef enum LexOrthFlags:
LexOrth_alpha
LexOrth_ascii
LexOrth_digit
LexOrth_lower
LexOrth_punct
LexOrth_space
LexOrth_title
LexOrth_upper
LexOrth_N
cdef int from_string(Lexeme* lex, unicode string, StringStore store) except -1
cpdef enum LexDistFlags:
LexDist_adj
LexDist_adp
LexDist_adv
LexDist_conj
LexDist_det
LexDist_noun
LexDist_num
LexDist_pdt
LexDist_pos
LexDist_pron
LexDist_prt
LexDist_punct
LexDist_verb
LexDist_lower
LexDist_title
LexDist_upper
LexDist_N
cdef int from_dict(Lexeme* lex, dict props, StringStore store) except -1
cdef struct LexemeC:
int[<int>LexInt_N] ints
float[<int>LexFloat_N] floats
utf8_t[<int>LexStr_N] strings
flag_t orth_flags
flag_t dist_flags
cdef LexemeC EMPTY_LEXEME
cpdef dict get_lexeme_dict(size_t i, unicode string)
cdef char* intern_and_encode(unicode string, size_t* length) except NULL
cdef int lexeme_get_int(LexemeC* lexeme, size_t i) except *
cdef float lexeme_get_float(LexemeC* lexeme, size_t i) except *
cdef unicode lexeme_get_string(LexemeC* lexeme, size_t i)
cdef bint lexeme_check_orth_flag(LexemeC* lexeme, size_t flag_id) except *
cdef bint lexeme_check_dist_flag(LexemeC* lexeme, size_t flag_id) except *
cdef dict lexeme_pack(LexemeC* lexeme)
cdef int lexeme_unpack(LexemeC* lexeme, dict p) except -1
cdef inline bint check_flag(Lexeme* lexeme, size_t flag_id) nogil:
return lexeme.flags & (1 << flag_id)
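
For comparison with the deleted array-based LexemeC, the new header above keeps one flat struct plus a single flags word tested by check_flag. A rough Python analogue of that layout, with field names copied from the declaration and purely illustrative values:

class PyLexeme(object):
    # Flat mirror of the Lexeme struct declared above; string-valued fields
    # hold StringStore indices rather than the strings themselves.
    def __init__(self, **fields):
        defaults = dict(id=0, length=0, norm=0, shape=0, vocab10k=0, asciied=0,
                        prefix=0, suffix=0, cluster=0, pos=0, supersense=0,
                        prob=0.0, flags=0)
        defaults.update(fields)
        for name, value in defaults.items():
            setattr(self, name, value)

IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE, IS_TITLE, IS_UPPER = range(8)

def check_flag(lex, flag_id):
    return bool(lex.flags & (1 << flag_id))

lex = PyLexeme(flags=(1 << IS_ALPHA) | (1 << IS_LOWER))
assert check_flag(lex, IS_ALPHA) and not check_flag(lex, IS_UPPER)
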

View File

@@ -5,106 +5,40 @@ from libc.string cimport memset
import orth
from .utf8string cimport Utf8Str
OOV_DIST_FLAGS = 0
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
cpdef dict get_lexeme_dict(size_t i, unicode string):
ints = [None for _ in range(LexInt_N)]
ints[<int>LexInt_id] = i
ints[<int>LexInt_length] = len(string)
ints[<int>LexInt_cluster] = 0
ints[<int>LexInt_pos] = 0
ints[<int>LexInt_supersense] = 0
floats = [None for _ in range(LexFloat_N)]
floats[<int>LexFloat_prob] = 0
floats[<int>LexFloat_sentiment] = 0
strings = [None for _ in range(LexStr_N)]
strings[<int>LexStr_orig] = string
strings[<int>LexStr_norm] = strings[<int>LexStr_orig]
strings[<int>LexStr_shape] = orth.word_shape(string)
strings[<int>LexStr_unsparse] = strings[<int>LexStr_shape]
strings[<int>LexStr_asciied] = orth.asciied(string)
strings[<int>LexStr_pre] = string[0]
strings[<int>LexStr_suff] = string[-3:]
orth_flags = get_orth_flags(string)
dist_flags = OOV_DIST_FLAGS
return {'ints': ints, 'floats': floats, 'strings': strings,
'orth_flags': orth_flags, 'dist_flags': dist_flags}
def get_orth_flags(unicode string):
def get_flags(unicode string):
cdef flag_t flags = 0
flags |= orth.is_ascii(string) << LexOrth_ascii
flags |= orth.is_alpha(string) << LexOrth_alpha
flags |= orth.is_digit(string) << LexOrth_digit
flags |= orth.is_lower(string) << LexOrth_lower
flags |= orth.is_punct(string) << LexOrth_punct
flags |= orth.is_space(string) << LexOrth_space
flags |= orth.is_title(string) << LexOrth_title
flags |= orth.is_upper(string) << LexOrth_upper
flags |= orth.is_alpha(string) << IS_ALPHA
flags |= orth.is_ascii(string) << IS_ASCII
flags |= orth.is_digit(string) << IS_DIGIT
flags |= orth.is_lower(string) << IS_LOWER
flags |= orth.is_punct(string) << IS_PUNCT
flags |= orth.is_space(string) << IS_SPACE
flags |= orth.is_title(string) << IS_TITLE
flags |= orth.is_upper(string) << IS_UPPER
return flags
def get_dist_flags(unicode string):
return 0
cdef char* intern_and_encode(unicode string, size_t* length) except NULL:
cdef int from_string(Lexeme* lex, unicode string, StringStore store) except -1:
cdef bytes byte_string = string.encode('utf8')
cdef bytes utf8_string = intern(byte_string)
Py_INCREF(utf8_string)
length[0] = len(utf8_string)
return <char*>utf8_string
cdef Utf8Str* orig_str = store.intern(<char*>byte_string, len(byte_string))
lex.id = orig_str.i
lex.cluster = 0
lex.length = len(string)
lex.flags = get_flags(string)
# TODO: Hook this up
#lex.norm = norm_str.i
#lex.shape = norm_str.i
#lex.asciied = asciied_str.i
#lex.prefix = prefix_str.i
#lex.suffix = suffix_str.i
cdef int lexeme_get_int(LexemeC* lexeme, size_t i) except *:
return lexeme.ints[i]
cdef float lexeme_get_float(LexemeC* lexeme, size_t i) except *:
return lexeme.floats[i]
cdef unicode lexeme_get_string(LexemeC* lexeme, size_t i):
cdef bytes byte_string = lexeme.strings[i]
return byte_string.decode('utf8')
cdef bint lexeme_check_orth_flag(LexemeC* lexeme, size_t flag_id) except *:
return lexeme.orth_flags & (1 << flag_id)
cdef bint lexeme_check_dist_flag(LexemeC* lexeme, size_t flag_id) except *:
return lexeme.dist_flags & (1 << flag_id)
cdef dict lexeme_pack(LexemeC* lex):
cdef dict packed = {}
packed['ints'] = [lex.ints[i] for i in range(LexInt_N)]
packed['floats'] = [lex.floats[i] for i in range(LexFloat_N)]
packed['strings'] = [lex.strings[i].decode('utf8') for i in range(LexStr_N)]
packed['orth_flags'] = lex.orth_flags
packed['dist_flags'] = lex.dist_flags
return packed
cdef int lexeme_unpack(LexemeC* lex, dict p) except -1:
cdef size_t i
cdef int lex_int
cdef float lex_float
cdef unicode string
for i, lex_int in enumerate(p['ints']):
lex.ints[i] = lex_int
for i, lex_float in enumerate(p['floats']):
lex.floats[i] = lex_float
cdef size_t _
for i in range(LexStr_N):
lex_string = p['strings'][i]
lex.strings[i] = intern_and_encode(lex_string, &_)
lex.orth_flags = p['orth_flags']
lex.dist_flags = p['dist_flags']
cdef int from_dict(Lexeme* lex, dict props, StringStore store) except -1:
pass
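
A hedged sketch of the interning idea behind from_string above: the string store maps each UTF-8 byte string to a stable integer index, and the lexeme records only that index (id) plus its length and flags. The toy store below is a plain dict/list pair, not the real StringStore API.

from types import SimpleNamespace

class ToyStringStore(object):
    # Maps each distinct UTF-8 byte string to a stable integer index and back.
    def __init__(self):
        self._indices = {}
        self._strings = []

    def intern(self, utf8_bytes):
        if utf8_bytes not in self._indices:
            self._indices[utf8_bytes] = len(self._strings)
            self._strings.append(utf8_bytes)
        return self._indices[utf8_bytes]

    def __getitem__(self, i):
        return self._strings[i]

def from_string(lex, string, store):
    lex.id = store.intern(string.encode('utf8'))
    lex.length = len(string)
    lex.flags = 0   # get_flags(string) in the real code

store = ToyStringStore()
lex = SimpleNamespace(id=0, length=0, flags=0)
from_string(lex, u'Hello', store)
assert store[lex.id].decode('utf8') == u'Hello'
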

View File

@@ -113,8 +113,8 @@ cpdef enum:
CONTEXT_SIZE
cdef int get_atoms(atom_t* atoms, LexemeC* p2, LexemeC* p1, LexemeC* n0, LexemeC* n1,
LexemeC* n2, class_t prev_tag, class_t prev_prev_tag) except -1:
cdef int get_atoms(atom_t* atoms, Lexeme* p2, Lexeme* p1, Lexeme* n0, Lexeme* n1,
Lexeme* n2, class_t prev_tag, class_t prev_prev_tag) except -1:
_fill_token(&atoms[P2i], p2)
_fill_token(&atoms[P1i], p1)
_fill_token(&atoms[N0i], n0)
@@ -124,16 +124,16 @@ cdef int get_atoms(atom_t* atoms, LexemeC* p2, LexemeC* p1, LexemeC* n0, LexemeC
atoms[P2t] = prev_prev_tag
cdef inline void _fill_token(atom_t* atoms, LexemeC* lex) nogil:
atoms[0] = lex.ints[<int>LexInt_id]
atoms[1] = lex.ints[<int>LexInt_cluster]
atoms[2] = <atom_t>lex.strings[<int>LexStr_norm]
atoms[3] = <atom_t>lex.strings[<int>LexStr_shape]
atoms[4] = <atom_t>lex.strings[<int>LexStr_pre]
atoms[5] = <atom_t>lex.strings[<int>LexStr_suff]
cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil:
atoms[0] = lex.id
atoms[1] = lex.cluster
atoms[2] = lex.norm
atoms[3] = lex.shape
atoms[4] = lex.prefix
atoms[5] = lex.suffix
atoms[6] = lex.dist_flags & (1 << LexDist_title)
atoms[7] = lex.dist_flags & (1 << LexDist_upper)
atoms[6] = lex.flags & (1 << OFT_TITLE)
atoms[7] = lex.flags & (1 << OFT_UPPER)
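
Reading the new _fill_token above: the tagger's feature atoms are now plain struct fields plus two flag tests, instead of indexing into the old ints/strings arrays. A small sketch with a dict standing in for the lexeme; the OFT_* bit positions are assumed here for illustration only.

OFT_TITLE, OFT_UPPER = 9, 10    # assumed bit positions, for illustration only

def fill_token(atoms, offset, lex):
    atoms[offset + 0] = lex['id']
    atoms[offset + 1] = lex['cluster']
    atoms[offset + 2] = lex['norm']
    atoms[offset + 3] = lex['shape']
    atoms[offset + 4] = lex['prefix']
    atoms[offset + 5] = lex['suffix']
    atoms[offset + 6] = lex['flags'] & (1 << OFT_TITLE)
    atoms[offset + 7] = lex['flags'] & (1 << OFT_UPPER)

atoms = [0] * 8
fill_token(atoms, 0, dict(id=1, cluster=2, norm=3, shape=4, prefix=5, suffix=6,
                          flags=1 << OFT_TITLE))
assert atoms[6] and not atoms[7]
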
TEMPLATES = (

View File

@@ -20,6 +20,8 @@ def realign_tagged(token_rules, tagged_line, sep='/'):
def read_tagged(detoken_rules, file_, sep='/'):
sentences = []
for line in file_:
if not line.strip():
continue
line = realign_tagged(detoken_rules, line, sep=sep)
tokens, tags = _parse_line(line, sep)
assert len(tokens) == len(tags)
@@ -39,7 +41,7 @@ def _parse_line(line, sep):
subtags.append('NULL')
assert len(subtags) == len(subtokens), [t.string for t in subtokens]
words.append(word)
tags.extend([Tagger.encode_pos(pos) for pos in subtags])
tags.extend([Tagger.encode_pos(ptb_to_univ(pos)) for pos in subtags])
return EN.tokenize(' '.join(words)), tags
@@ -53,3 +55,86 @@ def get_tagdict(train_sents):
tagdict.setdefault(word, {}).setdefault(tag, 0)
tagdict[word][tag] += 1
return tagdict
def ptb_to_univ(tag):
mapping = dict(tuple(line.split()) for line in """
NULL NULL
HYPH .
ADD X
NFP .
AFX X
XX X
BES VERB
HVS VERB
GW X
! .
# .
$ .
'' .
( .
) .
, .
-LRB- .
-RRB- .
. .
: .
? .
CC CONJ
CD NUM
CD|RB X
DT DET
EX DET
FW X
IN ADP
IN|RP ADP
JJ ADJ
JJR ADJ
JJRJR ADJ
JJS ADJ
JJ|RB ADJ
JJ|VBG ADJ
LS X
MD VERB
NN NOUN
NNP NOUN
NNPS NOUN
NNS NOUN
NN|NNS NOUN
NN|SYM NOUN
NN|VBG NOUN
NP NOUN
PDT DET
POS PRT
PRP PRON
PRP$ PRON
PRP|VBP PRON
PRT PRT
RB ADV
RBR ADV
RBS ADV
RB|RP ADV
RB|VBG ADV
RN X
RP PRT
SYM X
TO PRT
UH X
VB VERB
VBD VERB
VBD|VBN VERB
VBG VERB
VBG|NN VERB
VBN VERB
VBP VERB
VBP|TO VERB
VBZ VERB
VP VERB
WDT DET
WH X
WP PRON
WP$ PRON
WRB ADV
`` .""".strip().split('\n'))
return mapping[tag]
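
The mapping above collapses fine-grained PTB tags to the coarse universal set before the tags are encoded in read_tagged. The same one-line parsing trick on a small excerpt (values copied from the table) shows what the lookups return:

# Spot check of the whitespace-separated two-column construction used above.
mapping = dict(tuple(line.split()) for line in """
NNS NOUN
VBD VERB
JJR ADJ
-LRB- .""".strip().split('\n'))

assert mapping['NNS'] == 'NOUN'
assert mapping['VBD'] == 'VERB'
assert mapping['-LRB-'] == '.'
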

View File

@@ -1,5 +0,0 @@
from spacy.lang cimport Language
cdef class PennTreebank3(Language):
cdef list _split(self, unicode split)

View File

@@ -1,161 +0,0 @@
'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
so that strings can be retrieved from hashes. Use 64-bit hash values and
boldly assume no collisions.
'''
from __future__ import unicode_literals
from libc.stdint cimport uint64_t
cimport spacy
import re
from spacy import orth
TAG_THRESH = 0.5
UPPER_THRESH = 0.2
LOWER_THRESH = 0.5
TITLE_THRESH = 0.7
NR_FLAGS = 0
OFT_UPPER = NR_FLAGS; NR_FLAGS += 1
OFT_LOWER = NR_FLAGS; NR_FLAGS += 1
OFT_TITLE = NR_FLAGS; NR_FLAGS += 1
IS_ALPHA = NR_FLAGS; NR_FLAGS += 1
IS_DIGIT = NR_FLAGS; NR_FLAGS += 1
IS_PUNCT = NR_FLAGS; NR_FLAGS += 1
IS_SPACE = NR_FLAGS; NR_FLAGS += 1
IS_ASCII = NR_FLAGS; NR_FLAGS += 1
IS_TITLE = NR_FLAGS; NR_FLAGS += 1
IS_LOWER = NR_FLAGS; NR_FLAGS += 1
IS_UPPER = NR_FLAGS; NR_FLAGS += 1
CAN_PUNCT = NR_FLAGS; NR_FLAGS += 1
CAN_CONJ = NR_FLAGS; NR_FLAGS += 1
CAN_NUM = NR_FLAGS; NR_FLAGS += 1
CAN_DET = NR_FLAGS; NR_FLAGS += 1
CAN_ADP = NR_FLAGS; NR_FLAGS += 1
CAN_ADJ = NR_FLAGS; NR_FLAGS += 1
CAN_ADV = NR_FLAGS; NR_FLAGS += 1
CAN_VERB = NR_FLAGS; NR_FLAGS += 1
CAN_NOUN = NR_FLAGS; NR_FLAGS += 1
CAN_PDT = NR_FLAGS; NR_FLAGS += 1
CAN_POS = NR_FLAGS; NR_FLAGS += 1
CAN_PRON = NR_FLAGS; NR_FLAGS += 1
CAN_PRT = NR_FLAGS; NR_FLAGS += 1
# List of contractions adapted from Robert MacIntyre's tokenizer.
CONTRACTIONS2 = [re.compile(r"(?i)\b(can)(not)\b"),
re.compile(r"(?i)\b(d)('ye)\b"),
re.compile(r"(?i)\b(gim)(me)\b"),
re.compile(r"(?i)\b(gon)(na)\b"),
re.compile(r"(?i)\b(got)(ta)\b"),
re.compile(r"(?i)\b(lem)(me)\b"),
re.compile(r"(?i)\b(mor)('n)\b"),
re.compile(r"(?i)\b(wan)(na) ")]
CONTRACTIONS3 = [re.compile(r"(?i) ('t)(is)\b"),
re.compile(r"(?i) ('t)(was)\b")]
CONTRACTIONS4 = [re.compile(r"(?i)\b(whad)(dd)(ya)\b"),
re.compile(r"(?i)\b(wha)(t)(cha)\b")]
def nltk_regex_tokenize(text):
# Implementation taken from NLTK 3.0, based on tokenizer.sed
#starting quotes
text = re.sub(r'^\"', r'``', text)
text = re.sub(r'(``)', r' \1 ', text)
text = re.sub(r'([ (\[{<])"', r'\1 `` ', text)
#punctuation
text = re.sub(r'([:,])([^\d])', r' \1 \2', text)
text = re.sub(r'\.\.\.', r' ... ', text)
text = re.sub(r'[;@#$%&]', r' \g<0> ', text)
text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1 \2\3 ', text)
text = re.sub(r'[?!]', r' \g<0> ', text)
text = re.sub(r"([^'])' ", r"\1 ' ", text)
#parens, brackets, etc.
text = re.sub(r'[\]\[\(\)\{\}\<\>]', r' \g<0> ', text)
text = re.sub(r'--', r' -- ', text)
#add extra space to make things easier
text = " " + text + " "
#ending quotes
text = re.sub(r'"', " '' ", text)
text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text)
text = re.sub(r"([^' ])('[sS]|'[mM]|'[dD]|') ", r"\1 \2 ", text)
text = re.sub(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 ",
text)
for regexp in CONTRACTIONS2:
text = regexp.sub(r' \1 \2 ', text)
for regexp in CONTRACTIONS3:
text = regexp.sub(r' \1 \2 ', text)
# We are not using CONTRACTIONS4 since
# they are also commented out in the SED scripts
# for regexp in self.CONTRACTIONS4:
# text = regexp.sub(r' \1 \2 \3 ', text)
return text.split()
cdef class PennTreebank3(Language):
"""Fully PTB compatible English tokenizer, tightly coupled to lexicon.
Attributes:
name (unicode): The two letter code used by Wikipedia for the language.
lexicon (Lexicon): The lexicon. Exposes the lookup method.
"""
def __cinit__(self, name):
flag_funcs = [0 for _ in range(NR_FLAGS)]
flag_funcs[OFT_UPPER] = orth.oft_case('upper', UPPER_THRESH)
flag_funcs[OFT_LOWER] = orth.oft_case('lower', LOWER_THRESH)
flag_funcs[OFT_TITLE] = orth.oft_case('title', TITLE_THRESH)
flag_funcs[IS_ALPHA] = orth.is_alpha
flag_funcs[IS_DIGIT] = orth.is_digit
flag_funcs[IS_PUNCT] = orth.is_punct
flag_funcs[IS_SPACE] = orth.is_space
flag_funcs[IS_TITLE] = orth.is_title
flag_funcs[IS_LOWER] = orth.is_lower
flag_funcs[IS_UPPER] = orth.is_upper
flag_funcs[CAN_PUNCT] = orth.can_tag('PUNCT', TAG_THRESH)
flag_funcs[CAN_CONJ] = orth.can_tag('CONJ', TAG_THRESH)
flag_funcs[CAN_NUM] = orth.can_tag('NUM', TAG_THRESH)
flag_funcs[CAN_DET] = orth.can_tag('DET', TAG_THRESH)
flag_funcs[CAN_ADP] = orth.can_tag('ADP', TAG_THRESH)
flag_funcs[CAN_ADJ] = orth.can_tag('ADJ', TAG_THRESH)
flag_funcs[CAN_VERB] = orth.can_tag('VERB', TAG_THRESH)
flag_funcs[CAN_NOUN] = orth.can_tag('NOUN', TAG_THRESH)
flag_funcs[CAN_PDT] = orth.can_tag('PDT', TAG_THRESH)
flag_funcs[CAN_POS] = orth.can_tag('POS', TAG_THRESH)
flag_funcs[CAN_PRT] = orth.can_tag('PRT', TAG_THRESH)
Language.__init__(self, name, flag_funcs)
cdef list _split(self, unicode chunk):
strings = nltk_regex_tokenize(chunk)
if strings[-1] == '.':
strings.pop()
strings[-1] += '.'
assert strings
return strings
PTB3 = PennTreebank3('ptb3')
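
The deleted module above ports NLTK's sed-based PTB tokenizer; the CONTRACTIONS2 patterns split fused forms into two tokens. A quick illustration using one of those patterns verbatim:

import re

# One of the CONTRACTIONS2 patterns above, applied the same way
# nltk_regex_tokenize applies it: the two groups become separate tokens.
cannot = re.compile(r"(?i)\b(can)(not)\b")
text = cannot.sub(r' \1 \2 ', "I cannot go")
assert text.split() == ['I', 'can', 'not', 'go']
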

View File

@@ -1,59 +1,49 @@
from cymem.cymem cimport Pool
from spacy.lexeme cimport LexemeC
from .lexeme cimport Lexeme
from .typedefs cimport flag_t
from .utf8string cimport StringStore
from thinc.typedefs cimport atom_t
cdef class Tokens:
cdef Pool mem
cdef StringStore _string_store
cdef LexemeC** _lex_ptr
cdef Lexeme** _lex_ptr
cdef int* _idx_ptr
cdef int* _pos_ptr
cdef LexemeC** lex
cdef Lexeme** lex
cdef int* idx
cdef int* pos
cdef int length
cdef int max_length
cdef int extend(self, int i, LexemeC** lexemes, int n) except -1
cdef int push_back(self, int i, LexemeC* lexeme) except -1
cdef int extend(self, int i, Lexeme** lexemes, int n) except -1
cdef int push_back(self, int i, Lexeme* lexeme) except -1
cpdef int id(self, size_t i) except -1
cpdef float prob(self, size_t i) except 1
cpdef int cluster(self, size_t i) except *
cpdef bint check_orth_flag(self, size_t i, size_t flag_id) except *
cpdef bint check_dist_flag(self, size_t i, size_t flag_id) except *
cpdef unicode string_view(self, size_t i, size_t view_id)
cpdef unicode string(self, size_t i)
cpdef unicode orig(self, size_t i)
cpdef unicode norm(self, size_t i)
cpdef unicode shape(self, size_t i)
cpdef unicode unsparse(self, size_t i)
cpdef unicode asciied(self, size_t i)
cpdef bint is_alpha(self, size_t i) except *
cpdef bint is_ascii(self, size_t i) except *
cpdef bint is_digit(self, size_t i) except *
cpdef bint is_lower(self, size_t i) except *
cpdef bint is_punct(self, size_t i) except *
cpdef bint is_space(self, size_t i) except *
cpdef bint is_title(self, size_t i) except *
cpdef bint is_upper(self, size_t i) except *
cpdef bint can_adj(self, size_t i) except *
cpdef bint can_adp(self, size_t i) except *
cpdef bint can_adv(self, size_t i) except *
cpdef bint can_conj(self, size_t i) except *
cpdef bint can_det(self, size_t i) except *
cpdef bint can_noun(self, size_t i) except *
cpdef bint can_num(self, size_t i) except *
cpdef bint can_pdt(self, size_t i) except *
cpdef bint can_pos(self, size_t i) except *
cpdef bint can_pron(self, size_t i) except *
cpdef bint can_prt(self, size_t i) except *
cpdef bint can_punct(self, size_t i) except *
cpdef bint can_verb(self, size_t i) except *
cpdef bint oft_lower(self, size_t i) except *
cpdef bint oft_title(self, size_t i) except *
cpdef bint oft_upper(self, size_t i) except *
cdef class Token:
cdef StringStore _string_store
cdef public int i
cdef public int idx
cdef public int pos
cdef public atom_t id
cdef public atom_t cluster
cdef public atom_t length
cdef public atom_t lex_pos
cdef public atom_t lex_supersense
cdef public atom_t norm
cdef public atom_t shape
cdef public atom_t vocab10k
cdef public atom_t asciied
cdef public atom_t prefix
cdef public atom_t suffix
cdef public float prob
cdef public flag_t flags

View File

@@ -1,10 +1,6 @@
# cython: profile=True
from .word cimport Lexeme
from .lexeme cimport *
cimport numpy
cimport cython
import numpy
DEF PADDING = 5
@@ -34,7 +30,8 @@ cdef class Tokens:
>>> tokens.can_noun(1)
True
"""
def __init__(self, string_length=0):
def __init__(self, StringStore string_store, string_length=0):
self._string_store = string_store
if string_length >= 3:
size = int(string_length / 3.0)
else:
@@ -43,7 +40,7 @@ cdef class Tokens:
# Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
# However, we need to remember the true starting places, so that we can
# realloc.
self._lex_ptr = <LexemeC**>self.mem.alloc(size + (PADDING*2), sizeof(LexemeC*))
self._lex_ptr = <Lexeme**>self.mem.alloc(size + (PADDING*2), sizeof(Lexeme*))
self._idx_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
self._pos_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
self.lex = self._lex_ptr
@@ -55,39 +52,26 @@ cdef class Tokens:
self.lex += PADDING
self.idx += PADDING
self.pos += PADDING
self.max_length = size
self.length = 0
def __getitem__(self, i):
bounds_check(i, self.length, PADDING)
return Lexeme(<size_t>self.lex[i])
return Token(self._string_store, i, self.idx[i], self.pos[i], self.lex[i][0])
def __len__(self):
return self.length
cdef int push_back(self, int idx, LexemeC* lexeme) except -1:
cdef int push_back(self, int idx, Lexeme* lexeme) except -1:
if self.length == self.max_length:
self._realloc(self.length * 2)
self.lex[self.length] = lexeme
self.idx[self.length] = idx
self.pos[self.length] = 0
self.length += 1
return idx + lexeme.ints[<int>LexInt_length]
return idx + lexeme.length
def _realloc(self, new_size):
self.max_length = new_size
n = new_size + (PADDING * 2)
self._lex_ptr = <LexemeC**>self.mem.realloc(self._lex_ptr, n * sizeof(LexemeC*))
self._idx_ptr = <int*>self.mem.realloc(self._idx_ptr, n * sizeof(int))
self._pos_ptr = <int*>self.mem.realloc(self._pos_ptr, n * sizeof(int))
self.lex = self._lex_ptr + PADDING
self.idx = self._idx_ptr + PADDING
self.pos = self._pos_ptr + PADDING
for i in range(self.length, self.max_length + PADDING):
self.lex[i] = &EMPTY_LEXEME
cdef int extend(self, int idx, LexemeC** lexemes, int n) except -1:
cdef int extend(self, int idx, Lexeme** lexemes, int n) except -1:
cdef int i
if lexemes == NULL:
return idx
@@ -101,154 +85,43 @@ cdef class Tokens:
idx = self.push_back(idx, lexemes[i])
return idx
cpdef int id(self, size_t i) except -1:
bounds_check(i, self.length, PADDING)
return self.lex[i].ints[<int>LexInt_id]
def _realloc(self, new_size):
self.max_length = new_size
n = new_size + (PADDING * 2)
self._lex_ptr = <Lexeme**>self.mem.realloc(self._lex_ptr, n * sizeof(Lexeme*))
self._idx_ptr = <int*>self.mem.realloc(self._idx_ptr, n * sizeof(int))
self._pos_ptr = <int*>self.mem.realloc(self._pos_ptr, n * sizeof(int))
self.lex = self._lex_ptr + PADDING
self.idx = self._idx_ptr + PADDING
self.pos = self._pos_ptr + PADDING
for i in range(self.length, self.max_length + PADDING):
self.lex[i] = &EMPTY_LEXEME
cpdef float prob(self, size_t i) except 1:
bounds_check(i, self.length, PADDING)
return self.lex[i].floats[<int>LexFloat_prob]
cpdef int cluster(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return self.lex[i].ints[<int>LexInt_cluster]
@cython.freelist(64)
cdef class Token:
def __init__(self, StringStore string_store, int i, int idx, int pos, dict lex):
self._string_store = string_store
self.i = i
self.idx = idx
self.pos = pos
cpdef bint check_orth_flag(self, size_t i, size_t flag_id) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_orth_flag(self.lex[i], flag_id)
self.id = lex['id']
self.cluster = lex['cluster']
self.length = lex['length']
self.lex_pos = lex['pos']
self.lex_supersense = lex['supersense']
self.norm = lex['norm']
self.shape = lex['shape']
self.vocab10k = lex['vocab10k']
self.asciied = lex['asciied']
self.prefix = lex['prefix']
self.suffix = lex['suffix']
cpdef bint check_dist_flag(self, size_t i, size_t flag_id) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], flag_id)
self.prob = lex['prob']
self.flags = lex['flags']
cpdef unicode string_view(self, size_t i, size_t view_id):
bounds_check(i, self.length, PADDING)
return lexeme_get_string(self.lex[i], view_id)
property string:
def __get__(self):
cdef bytes utf8string = self._string_store[self.id]
return utf8string.decode('utf8')
# Provide accessor methods for the features supported by the language.
# Without these, clients have to use the underlying string_view and check_flag
# methods, which requires them to know the IDs.
cpdef unicode string(self, size_t i):
bounds_check(i, self.length, PADDING)
return self.orig(i)
cpdef unicode orig(self, size_t i):
bounds_check(i, self.length, PADDING)
cdef bytes utf8_string = self.lex[i].strings[<int>LexStr_orig]
cdef unicode string = utf8_string.decode('utf8')
return string
cpdef unicode norm(self, size_t i):
bounds_check(i, self.length, PADDING)
cdef bytes utf8_string = self.lex[i].strings[<int>LexStr_norm]
cdef unicode string = utf8_string.decode('utf8')
return string
cpdef unicode shape(self, size_t i):
bounds_check(i, self.length, PADDING)
return lexeme_get_string(self.lex[i], LexStr_shape)
cpdef unicode unsparse(self, size_t i):
bounds_check(i, self.length, PADDING)
return lexeme_get_string(self.lex[i], LexStr_unsparse)
cpdef unicode asciied(self, size_t i):
bounds_check(i, self.length, PADDING)
return lexeme_get_string(self.lex[i], LexStr_asciied)
cpdef bint is_alpha(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_orth_flag(self.lex[i], LexOrth_alpha)
cpdef bint is_ascii(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_orth_flag(self.lex[i], LexOrth_ascii)
cpdef bint is_digit(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_orth_flag(self.lex[i], LexOrth_digit)
cpdef bint is_lower(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_orth_flag(self.lex[i], LexOrth_lower)
cpdef bint is_punct(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_orth_flag(self.lex[i], LexOrth_punct)
cpdef bint is_space(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_orth_flag(self.lex[i], LexOrth_space)
cpdef bint is_title(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_orth_flag(self.lex[i], LexOrth_title)
cpdef bint is_upper(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_orth_flag(self.lex[i], LexOrth_upper)
cpdef bint can_adj(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_adj)
cpdef bint can_adp(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_adp)
cpdef bint can_adv(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_adv)
cpdef bint can_conj(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_conj)
cpdef bint can_det(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_det)
cpdef bint can_noun(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_noun)
cpdef bint can_num(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_num)
cpdef bint can_pdt(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_pdt)
cpdef bint can_pos(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_pos)
cpdef bint can_pron(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_pron)
cpdef bint can_prt(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_prt)
cpdef bint can_punct(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_punct)
cpdef bint can_verb(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_verb)
cpdef bint oft_lower(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_lower)
cpdef bint oft_title(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_title)
cpdef bint oft_upper(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_upper)
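
The Tokens container above keeps parallel arrays with PADDING slots of EMPTY_LEXEME on each side, so feature code can read a couple of positions past either end without bounds checks, and __getitem__ now builds a Token from the stored values. A toy pure-Python version of the padding idea only (no realloc, dicts instead of Lexeme structs):

PADDING = 5
EMPTY = {'id': 0, 'length': 0}

class ToyTokens(object):
    # Parallel storage with PADDING empty entries on each side, so callers
    # may peek a few positions past either end without bounds checks.
    def __init__(self, size):
        self._lex = [EMPTY] * (size + 2 * PADDING)
        self.length = 0
        self.offset = PADDING

    def push_back(self, lex):
        self._lex[self.offset + self.length] = lex
        self.length += 1

    def __getitem__(self, i):
        # Out-of-range positions yield the EMPTY sentinel, like &EMPTY_LEXEME above.
        return self._lex[self.offset + i]

toks = ToyTokens(4)
toks.push_back({'id': 1, 'length': 5})
assert toks[0]['id'] == 1
assert toks[-1] is EMPTY and toks[1] is EMPTY
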

View File

@@ -1,12 +0,0 @@
from .typedefs cimport hash_t, utf8_t, flag_t, id_t
from spacy.lexeme cimport LexemeC
DEF MAX_FLAG = 64
cdef class Lexeme:
cdef LexemeC* _c
cpdef bint check_orth_flag(self, size_t flag_id) except *
cpdef bint check_dist_flag(self, size_t flag_id) except *
cpdef unicode string_view(self, size_t view_id)

View File

@@ -1,80 +0,0 @@
# cython: profile=True
# cython: embedsignature=True
from .lexeme cimport lexeme_get_string
from .lexeme cimport lexeme_check_orth_flag, lexeme_check_dist_flag
from .lexeme cimport *
cdef class Lexeme:
"""A lexical type --- a word, punctuation symbol, whitespace sequence, etc
keyed by a case-sensitive unicode string. All tokens with the same string,
e.g. all instances of "dog", ",", "NASA" etc should be mapped to the same
Lexeme.
You should avoid instantiating Lexemes directly, and instead use the
:py:meth:`spacy.lang.Language.tokenize` and :py:meth:`spacy.lang.Language.lookup`
methods on the global object exposed by the language you're working with,
e.g. :py:data:`spacy.en.EN`.
Attributes:
string (unicode):
The unicode string.
Implemented as a property; relatively expensive.
length (size_t):
The number of unicode code-points in the string.
prob (double):
An estimate of the word's unigram log probability.
Probabilities are calculated from a large text corpus, and smoothed using
simple Good-Turing. Estimates are read from data/en/probabilities, and
can be replaced using spacy.en.load_probabilities.
cluster (size_t):
An integer representation of the word's Brown cluster.
A Brown cluster is an address into a binary tree, which gives some (noisy)
information about the word's distributional context.
>>> strings = (u'pineapple', u'apple', u'dapple', u'scalable')
>>> print ["{0:b}".format(lookup(s).cluster) for s in strings]
["100111110110", "100111100100", "01010111011001", "100111110110"]
The clusterings are unideal, but often slightly useful.
"pineapple" and "apple" share a long prefix, indicating a similar meaning,
while "dapple" is totally different. On the other hand, "scalable" receives
the same cluster ID as "pineapple", which is not what we'd like.
"""
def __cinit__(self, size_t lexeme_addr):
self._c = <LexemeC*>lexeme_addr
property string:
def __get__(self):
cdef bytes utf8_string = self._c.strings[<int>LexStr_orig]
cdef unicode string = utf8_string.decode('utf8')
return string
property prob:
def __get__(self):
return self._c.floats[<int>LexFloat_prob]
property cluster:
def __get__(self):
return self._c.ints[<int>LexInt_cluster]
property length:
def __get__(self):
return self._c.ints[<int>LexInt_length]
cpdef bint check_orth_flag(self, size_t flag_id) except *:
return lexeme_check_orth_flag(self._c, flag_id)
cpdef bint check_dist_flag(self, size_t flag_id) except *:
return lexeme_check_dist_flag(self._c, flag_id)
cpdef unicode string_view(self, size_t view_id):
return lexeme_get_string(self._c, view_id)
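
A small check of the Brown-cluster prefix behaviour the docstring above describes, using the example cluster strings quoted there: words whose cluster bit-strings share a long prefix sat close together in the clustering's binary tree.

clusters = {
    u'pineapple': '100111110110',
    u'apple':     '100111100100',
    u'dapple':    '01010111011001',
}

def shared_prefix(a, b):
    n = 0
    while n < min(len(a), len(b)) and a[n] == b[n]:
        n += 1
    return n

assert shared_prefix(clusters[u'pineapple'], clusters[u'apple']) >= 6
assert shared_prefix(clusters[u'pineapple'], clusters[u'dapple']) <= 1
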

View File

@@ -5,8 +5,8 @@ from spacy.en import EN
def test_possess():
tokens = EN.tokenize("Mike's")
assert tokens[0].string == "Mike"
assert tokens[1].string == "'s"
assert EN.lexicon.strings[tokens[0].id] == "Mike"
assert EN.lexicon.strings[tokens[1].id] == "'s"
assert len(tokens) == 2

View File

@@ -8,19 +8,17 @@ from spacy.lexeme import *
def test_is_alpha():
the = EN.lexicon.lookup('the')
assert the.check_orth_flag(LexOrth_alpha)
assert the['flags'] & (1 << IS_ALPHA)
year = EN.lexicon.lookup('1999')
assert not year.check_orth_flag(LexOrth_alpha)
assert not year['flags'] & (1 << IS_ALPHA)
mixed = EN.lexicon.lookup('hello1')
assert not mixed.check_orth_flag(LexOrth_alpha)
assert not mixed['flags'] & (1 << IS_ALPHA)
def test_is_digit():
the = EN.lexicon.lookup('the')
assert not the.check_orth_flag(LexOrth_digit)
assert not the['flags'] & (1 << IS_DIGIT)
year = EN.lexicon.lookup('1999')
assert year.check_orth_flag(LexOrth_digit)
assert year['flags'] & (1 << IS_DIGIT)
mixed = EN.lexicon.lookup('hello1')
assert not mixed.check_orth_flag(LexOrth_digit)
assert not mixed['flags'] & (1 << IS_DIGIT)

View File

@@ -1,27 +0,0 @@
from __future__ import unicode_literals
import pytest
import spacy.word
from spacy.en import EN
from spacy.lexeme import *
@pytest.fixture
def C3P0():
return EN.lexicon.lookup("C3P0")
def test_shape(C3P0):
assert C3P0.string_view(LexStr_shape) == "XdXd"
def test_length():
t = EN.lexicon.lookup('the')
assert t.length == 3
t = EN.lexicon.lookup("n't")
assert t.length == 3
t = EN.lexicon.lookup("'s")
assert t.length == 2
t = EN.lexicon.lookup('Xxxx')
assert t.length == 4

View File

@@ -8,9 +8,9 @@ from spacy.en import EN
def test_one():
tokens = EN.tokenize('Betty Botter bought a pound of butter.')
assert tokens.string(0) == 'Betty'
assert tokens[0].string == 'Betty'
tokens2 = EN.tokenize('Betty also bought a pound of butter.')
assert tokens2.string(0) == 'Betty'
assert tokens2[0].string == 'Betty'

View File

@@ -5,41 +5,39 @@ from spacy.en import EN
def test_single_word():
lex_ids = EN.tokenize(u'hello')
assert lex_ids[0].string == EN.lexicon.lookup(u'hello').string
tokens = EN.tokenize(u'hello')
assert tokens[0].string == 'hello'
def test_two_words():
words = EN.tokenize('hello possums')
assert len(words) == 2
assert words[0].string == EN.lexicon.lookup('hello').string
assert words[0].string != words[1].string
tokens = EN.tokenize('hello possums')
assert len(tokens) == 2
assert tokens[0].string != tokens[1].string
def test_punct():
tokens = EN.tokenize('hello, possums.')
assert len(tokens) == 4
assert tokens[0].string == EN.lexicon.lookup('hello').string
assert tokens[1].string == EN.lexicon.lookup(',').string
assert tokens[2].string == EN.lexicon.lookup('possums').string
assert tokens[1].string != EN.lexicon.lookup('hello').string
assert tokens[0].string == 'hello'
assert tokens[1].string == ','
assert tokens[2].string == 'possums'
assert tokens[1].string != 'hello'
def test_digits():
lex_ids = EN.tokenize('The year: 1984.')
assert lex_ids.orig(3) == "1984"
assert len(lex_ids) == 5
assert lex_ids[0].string == EN.lexicon.lookup('The').string
assert lex_ids[3].string == EN.lexicon.lookup('1984').string
tokens = EN.tokenize('The year: 1984.')
assert len(tokens) == 5
assert tokens[0].id == EN.lexicon.lookup('The')['id']
assert tokens[3].id == EN.lexicon.lookup('1984')['id']
def test_contraction():
lex_ids = EN.tokenize("don't giggle")
assert len(lex_ids) == 3
assert lex_ids[1].string == EN.lexicon.lookup("not").string
lex_ids = EN.tokenize("i said don't!")
assert len(lex_ids) == 5
assert lex_ids[4].string == EN.lexicon.lookup('!').string
tokens = EN.tokenize("don't giggle")
assert len(tokens) == 3
assert tokens[1].id == EN.lexicon.lookup("not")['id']
tokens = EN.tokenize("i said don't!")
assert len(tokens) == 5
assert tokens[4].id == EN.lexicon.lookup('!')['id']
def test_contraction_punct():

View File

@@ -5,30 +5,19 @@ from spacy.en import EN
def test_neq():
addr = EN.lexicon.lookup('Hello')
assert EN.lexicon.lookup('bye').string != addr.string
assert EN.lexicon.lookup('bye')['id'] != addr['id']
def test_eq():
addr = EN.lexicon.lookup('Hello')
assert EN.lexicon.lookup('Hello').string == addr.string
def test_round_trip():
hello = EN.lexicon.lookup('Hello')
assert hello.string == 'Hello'
assert EN.lexicon.lookup('Hello')['id'] == addr['id']
def test_case_neq():
addr = EN.lexicon.lookup('Hello')
assert EN.lexicon.lookup('hello').string != addr.string
assert EN.lexicon.lookup('hello')['id'] != addr['id']
def test_punct_neq():
addr = EN.lexicon.lookup('Hello')
assert EN.lexicon.lookup('Hello,').string != addr.string
def test_short():
addr = EN.lexicon.lookup('I')
assert addr.string == 'I'
assert addr.string != 'not'
assert EN.lexicon.lookup('Hello,')['id'] != addr['id']