* Tmp commit. Refactoring to create a Python Lexeme class.

Matthew Honnibal 2015-01-12 10:26:22 +11:00
parent 61904e590f
commit ce2edd6312
16 changed files with 281 additions and 173 deletions
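
The heart of the change: the C-level struct previously named Lexeme becomes LexemeC (and gains a const float* vec field), while the Python-facing Token stops delegating attribute access to its owning Tokens sequence and instead has the struct fields copied onto it by a new cinit_token factory. A rough pure-Python sketch of that before/after pattern, with illustrative names only, not spaCy API:

record = {'cluster': 42, 'length': 5}

class DelegatingToken(object):
    # Old pattern: keep a reference to the owning sequence plus an index,
    # and read each field out of the backing array on demand.
    def __init__(self, seq, i):
        self._seq, self.i = seq, i
    @property
    def cluster(self):
        return self._seq[self.i]['cluster']

class CopiedToken(object):
    # New pattern: eagerly copy the record's fields into plain attributes,
    # so the object no longer needs the owning sequence to stay alive.
    def __init__(self, record):
        self.cluster = record['cluster']
        self.length = record['length']

assert DelegatingToken([record], 0).cluster == CopiedToken(record).cluster == 42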

View File

@@ -1,6 +1,7 @@
 from __future__ import unicode_literals
 from os import path
+from .. import orth
 from ..vocab import Vocab
 from ..tokenizer import Tokenizer
 from ..syntax.parser import GreedyParser
@@ -10,12 +11,10 @@ from .pos import POS_TAGS
 from .attrs import get_flags
-DATA_DIR = path.join(path.dirname(__file__), 'data')
 def get_lex_props(string):
     return {'flags': get_flags(string), 'dense': 1}
+LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
 class English(object):
     """The English NLP pipeline.
@@ -44,14 +43,16 @@ class English(object):
     parser (spacy.syntax.parser.GreedyParser):
         A greedy shift-reduce dependency parser.
     """
-    def __init__(self, data_dir=None):
-        if data_dir is None:
-            data_dir = path.join(path.dirname(__file__), 'data')
+    def __init__(self, data_dir=LOCAL_DATA_DIR):
         self._data_dir = data_dir
         self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab'),
                            get_lex_props=get_lex_props)
         tag_names = list(POS_TAGS.keys())
         tag_names.sort()
-        self.tokenizer = Tokenizer.from_dir(self.vocab, path.join(data_dir, 'tokenizer'),
-                                            POS_TAGS, tag_names)
+        if data_dir is None:
+            self.tokenizer = Tokenizer(self.vocab, {}, None, None, None,
+                                       POS_TAGS, tag_names)
+        else:
+            self.tokenizer = Tokenizer.from_dir(self.vocab, path.join(data_dir, 'tokenizer'),
+                                                POS_TAGS, tag_names)
         self.strings = self.vocab.strings
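
With this change the pipeline's data directory defaults to the package's own data folder, and passing data_dir=None takes the new branch that builds a rules-free Tokenizer in memory instead of loading one from disk. A hedged usage sketch, assuming the module-level names shown above are importable as written:

from spacy.en import English, LOCAL_DATA_DIR, get_lex_props

print(LOCAL_DATA_DIR)           # .../spacy/en/data, resolved relative to the package
print(get_lex_props(u'Apple'))  # {'flags': <flag bits from get_flags>, 'dense': 1}
nlp = English()                 # Vocab and Tokenizer are loaded from LOCAL_DATA_DIR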

View File

@@ -4,7 +4,7 @@ import tarfile
 import shutil
 import requests
-URL = 'https://s3-us-west-1.amazonaws.com/media.spacynlp.com/en.tgz'
+PARSER_URL = 'https://s3-us-west-1.amazonaws.com/media.spacynlp.com/en.tgz'
 DEST_DIR = path.join(path.dirname(__file__), 'data', 'deps')

View File

@@ -3,7 +3,7 @@ from cymem.cymem cimport Pool
 from .._ml cimport Model
 from ..strings cimport StringStore
-from ..structs cimport TokenC, Lexeme, Morphology, PosTag
+from ..structs cimport TokenC, LexemeC, Morphology, PosTag
 from ..typedefs cimport univ_tag_t
 from .lemmatizer import Lemmatizer
@@ -21,5 +21,5 @@ cdef class EnPosTagger:
     cdef readonly int n_tags
     cdef int set_morph(self, const int i, TokenC* tokens) except -1
-    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
+    cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1

View File

@@ -12,7 +12,7 @@ from ..typedefs cimport univ_tag_t
 from ..typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
 from ..typedefs cimport X, PUNCT, EOL
 from ..typedefs cimport id_t
-from ..structs cimport TokenC, Morphology, Lexeme
+from ..structs cimport TokenC, Morphology, LexemeC
 from ..tokens cimport Tokens
 from ..morphology cimport set_morph_from_dict
 from .._ml cimport arg_max
@@ -290,7 +290,7 @@ cdef class EnPosTagger:
             tokens[i].lemma = cached.lemma
             tokens[i].morph = cached.morph
-    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
+    cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1:
         if self.lemmatizer is None:
             return lex.sic
         cdef bytes py_string = self.strings[lex.sic]

View File

@@ -1,21 +1,21 @@
 from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t
 from .typedefs cimport ID, SIC, DENSE, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, POS_TYPE
-from .structs cimport Lexeme
+from .structs cimport LexemeC
 from .strings cimport StringStore
-cdef Lexeme EMPTY_LEXEME
+cdef LexemeC EMPTY_LEXEME
-cpdef Lexeme init(id_t i, unicode string, hash_t hashed, StringStore store,
+cdef LexemeC init(id_t i, unicode string, hash_t hashed, StringStore store,
                   dict props) except *
-cdef inline bint check_flag(const Lexeme* lexeme, attr_id_t flag_id) nogil:
+cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
     return lexeme.flags & (1 << flag_id)
-cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil:
+cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
     if feat_name < (sizeof(flags_t) * 8):
         return check_flag(lex, feat_name)
     elif feat_name == ID:
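
check_flag and get_attr encode a simple dispatch rule: attribute IDs smaller than the bit-width of flags_t are read as single bits out of the flags field, and larger IDs (ID, SIC, DENSE, ...) select ordinary struct fields. A hedged Python restatement of that rule, assuming a 64-bit flags_t and treating the lexeme as a plain dict:

FLAG_BITS = 64   # assumption: sizeof(flags_t) * 8
SIC = 65         # hypothetical integer value for the SIC attribute ID

def check_flag(lex, flag_id):
    # One boolean per bit of the flags field.
    return bool(lex['flags'] & (1 << flag_id))

def get_attr(lex, feat_name):
    # Small feature IDs are flag lookups; larger IDs select stored fields.
    if feat_name < FLAG_BITS:
        return check_flag(lex, feat_name)
    return lex[feat_name]

lex = {'flags': 0b101, SIC: 903}
assert get_attr(lex, 0) is True
assert get_attr(lex, 1) is False
assert get_attr(lex, SIC) == 903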

View File

@@ -7,12 +7,12 @@ from libc.string cimport memset
 from .orth cimport word_shape
-memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
+memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
-cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
+cdef LexemeC init(id_t i, unicode string, hash_t hashed,
                   StringStore string_store, dict props) except *:
-    cdef Lexeme lex
+    cdef LexemeC lex
     lex.id = i
     lex.length = len(string)
     lex.sic = string_store[string]
@@ -27,3 +27,6 @@ cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
     lex.flags = props.get('flags', 0)
     return lex

View File

@@ -1,4 +1,4 @@
-from .structs cimport TokenC, Lexeme, Morphology, PosTag
+from .structs cimport TokenC, Morphology, PosTag
 cdef int set_morph_from_dict(Morphology* morph, dict props) except -1

View File

@@ -3,6 +3,9 @@ from preshed.maps cimport PreshMap
 from murmurhash.mrmr cimport hash64
 from .structs cimport Utf8Str, UniStr
+from .typedefs cimport hash_t
+cpdef hash_t hash_string(unicode string) except 0
 cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) nogil:

View File

@@ -1,6 +1,7 @@
 import codecs
 from libc.string cimport memcpy
+from murmurhash.mrmr cimport hash64
 from .typedefs cimport hash_t
@@ -9,6 +10,11 @@ from .typedefs cimport hash_t
 SEPARATOR = '\n|-SEP-|\n'
+cpdef hash_t hash_string(unicode string) except 0:
+    chars = <Py_UNICODE*>string
+    return hash64(chars, len(string) * sizeof(Py_UNICODE), 0)
 """
 cdef class SymbolMap:
     def __init__(self):
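
hash_string is declared cpdef, so once the module is compiled it is callable from Python as well as from C: it hashes the string's raw Py_UNICODE buffer with MurmurHash (hash64, seed 0), and the except 0 clause reserves 0 as the error value. A hedged usage sketch, assuming the compiled spacy.strings extension is importable:

from spacy.strings import hash_string

key = hash_string(u'potatoes')
assert key != 0      # 0 is reserved for error signalling
same = hash_string(u'potatoes')
assert key == same   # the same text always maps to the same 64-bit key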

View File

@@ -3,7 +3,9 @@ from libc.stdint cimport uint8_t, uint32_t
 from .typedefs cimport flags_t, attr_t, id_t, hash_t, univ_tag_t
-cdef struct Lexeme:
+cdef struct LexemeC:
+    const float* vec
     flags_t flags
     attr_t id
@@ -38,7 +40,7 @@ cdef struct PosTag:
 cdef struct TokenC:
-    const Lexeme* lex
+    const LexemeC* lex
     Morphology morph
     univ_tag_t pos
     int fine_pos

View File

@@ -6,14 +6,14 @@ from preshed.maps cimport PreshMap
 from cymem.cymem cimport Pool
 from .typedefs cimport hash_t
-from .structs cimport Lexeme, TokenC, Morphology, UniStr
+from .structs cimport LexemeC, TokenC, Morphology, UniStr
 from .strings cimport StringStore
 from .tokens cimport Tokens
 from .vocab cimport Vocab, _Cached
 cdef union LexemesOrTokens:
-    const Lexeme* const* lexemes
+    const LexemeC* const* lexemes
     TokenC* tokens
@@ -33,10 +33,10 @@ cdef class Tokenizer:
     cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1
     cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
-    cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
-                                vector[Lexeme*] *suffixes) except NULL
+    cdef UniStr* _split_affixes(self, UniStr* string, vector[LexemeC*] *prefixes,
+                                vector[LexemeC*] *suffixes) except NULL
     cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
-                            vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except -1
+                            vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
     cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
     cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
     cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1

View File

@@ -53,7 +53,7 @@ cdef class Tokenizer:
         cdef int idx = 0
         for i, py_string in enumerate(strings):
             slice_unicode(&string_struct, py_string, 0, len(py_string))
-            tokens.push_back(idx, <const Lexeme*>self.vocab.get(tokens.mem, &string_struct))
+            tokens.push_back(idx, <const LexemeC*>self.vocab.get(tokens.mem, &string_struct))
             idx += len(py_string) + 1
         return tokens
@@ -75,7 +75,7 @@ cdef class Tokenizer:
            string (unicode): The string to be tokenized.
        Returns:
-           tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
+           tokens (Tokens): A Tokens object, giving access to a sequence of LexemeCs.
        """
        cdef int length = len(string)
        cdef Tokens tokens = Tokens(self.vocab, length)
@@ -121,8 +121,8 @@ cdef class Tokenizer:
        return True
    cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
-        cdef vector[Lexeme*] prefixes
-        cdef vector[Lexeme*] suffixes
+        cdef vector[LexemeC*] prefixes
+        cdef vector[LexemeC*] suffixes
        cdef hash_t orig_key
        cdef int orig_size
        orig_key = span.key
@@ -131,8 +131,8 @@ cdef class Tokenizer:
        self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
        self._save_cached(&tokens.data[orig_size], orig_key, tokens.length - orig_size)
-    cdef UniStr* _split_affixes(self, UniStr* string, vector[const Lexeme*] *prefixes,
-                                vector[const Lexeme*] *suffixes) except NULL:
+    cdef UniStr* _split_affixes(self, UniStr* string, vector[const LexemeC*] *prefixes,
+                                vector[const LexemeC*] *suffixes) except NULL:
        cdef size_t i
        cdef UniStr prefix
        cdef UniStr suffix
@@ -174,12 +174,12 @@ cdef class Tokenizer:
        return string
    cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
-                            vector[const Lexeme*] *prefixes,
-                            vector[const Lexeme*] *suffixes) except -1:
+                            vector[const LexemeC*] *prefixes,
+                            vector[const LexemeC*] *suffixes) except -1:
        cdef bint cache_hit
        cdef int split
-        cdef const Lexeme* const* lexemes
-        cdef Lexeme* lexeme
+        cdef const LexemeC* const* lexemes
+        cdef LexemeC* lexeme
        cdef UniStr span
        cdef int i
        if prefixes.size():
@@ -200,7 +200,7 @@ cdef class Tokenizer:
                idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span))
                slice_unicode(&span, string.chars, split + 1, string.n)
                idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span))
-        cdef vector[const Lexeme*].reverse_iterator it = suffixes.rbegin()
+        cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
        while it != suffixes.rend():
            idx = tokens.push_back(idx, deref(it))
            preinc(it)
@@ -213,10 +213,10 @@ cdef class Tokenizer:
        cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
        cached.length = n
        cached.is_lex = True
-        lexemes = <const Lexeme**>self.mem.alloc(n, sizeof(Lexeme**))
+        lexemes = <const LexemeC**>self.mem.alloc(n, sizeof(LexemeC**))
        for i in range(n):
            lexemes[i] = tokens[i].lex
-        cached.data.lexemes = <const Lexeme* const*>lexemes
+        cached.data.lexemes = <const LexemeC* const*>lexemes
        self._cache.set(key, cached)
    cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1:
@@ -243,7 +243,7 @@ cdef class Tokenizer:
        cdef unicode form
        cdef unicode lemma
        cdef dict props
-        cdef Lexeme** lexemes
+        cdef LexemeC** lexemes
        cdef hash_t hashed
        cdef UniStr string
        for chunk, substrings in sorted(rules.items()):
@@ -252,7 +252,7 @@ cdef class Tokenizer:
                form = props['F']
                lemma = props.get("L", None)
                slice_unicode(&string, form, 0, len(form))
-                tokens[i].lex = <Lexeme*>self.vocab.get(self.vocab.mem, &string)
+                tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, &string)
                if lemma:
                    tokens[i].lemma = self.vocab.strings[lemma]
                if 'pos' in props:
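
The special-case loop at the end reads each substring's 'F' (surface form), optional 'L' (lemma) and optional 'pos' keys, so the rules object it consumes is a mapping from a chunk to a list of per-token dicts. A hedged sketch of that shape, with example entries rather than the shipped tokenizer data:

rules = {
    u"don't": [
        {'F': u"do"},
        {'F': u"n't", 'L': u"not", 'pos': u"RB"},
    ],
    u"U.S.": [
        {'F': u"U.S."},
    ],
}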

View File

@@ -5,13 +5,13 @@ from cython.view cimport array as cvarray
 from cymem.cymem cimport Pool
 from thinc.typedefs cimport atom_t
-from .typedefs cimport flags_t, attr_id_t, attr_t
-from .structs cimport Morphology, TokenC, Lexeme
+from .typedefs cimport flags_t, attr_id_t, attr_t, univ_tag_t
+from .structs cimport Morphology, TokenC, LexemeC
 from .vocab cimport Vocab
 from .strings cimport StringStore
-ctypedef const Lexeme* const_Lexeme_ptr
+ctypedef const LexemeC* const_Lexeme_ptr
 ctypedef TokenC* TokenC_ptr
 ctypedef fused LexemeOrToken:
@@ -19,10 +19,10 @@ ctypedef fused LexemeOrToken:
     TokenC_ptr
-cdef attr_t get_lex_attr(const Lexeme* lex, attr_id_t feat_name) nogil
+cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil
 cdef attr_t get_token_attr(const TokenC* lex, attr_id_t feat_name) nogil
-cdef inline bint check_flag(const Lexeme* lexeme, attr_id_t flag_id) nogil:
+cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
     return lexeme.flags & (1 << flag_id)
@@ -42,5 +42,32 @@ cdef class Tokens:
 cdef class Token:
-    cdef Tokens _seq
+    cdef cvarray vec
     cdef readonly int i
+    cdef readonly flags_t flags
+    cdef readonly attr_t id
+    cdef readonly attr_t sic
+    cdef readonly attr_t dense
+    cdef readonly attr_t shape
+    cdef readonly attr_t prefix
+    cdef readonly attr_t suffix
+    cdef readonly attr_t length
+    cdef readonly attr_t cluster
+    cdef readonly attr_t pos_type
+    cdef readonly float prob
+    cdef readonly float sentiment
+    cdef readonly Morphology morph
+    cdef readonly univ_tag_t pos
+    cdef readonly int fine_pos
+    cdef readonly int idx
+    cdef readonly int lemma
+    cdef readonly int sense
+    cdef readonly int dep_tag
+    cdef readonly int head_offset
+    cdef readonly uint32_t l_kids
+    cdef readonly uint32_t r_kids

View File

@@ -32,7 +32,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
     return get_lex_attr(token.lex, feat_name)
-cdef attr_t get_lex_attr(const Lexeme* lex, attr_id_t feat_name) nogil:
+cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
     if feat_name < (sizeof(flags_t) * 8):
         return check_flag(lex, feat_name)
     elif feat_name == ID:
@@ -85,7 +85,7 @@ cdef class Tokens:
            token (Token):
        """
        bounds_check(i, self.length, PADDING)
-        return Token(self, i)
+        return cinit_token(&self.data[i])
    def __iter__(self):
        """Iterate over the tokens.
@@ -174,26 +174,57 @@ cdef class Tokens:
            self.data[i].lex = &EMPTY_LEXEME

-@cython.freelist(64)
+cdef Token cinit_token(const TokenC* c_tok):
+    cdef const LexemeC* lex = c_tok.lex
+    cdef Token py_tok = Token.__new__(Token)
+    cyarr = cvarray(shape=(300,), itemsize=sizeof(float), format="i")
+    py_tok.vec = cyarr
+    py_tok.flags = lex.flags
+    py_tok.id = lex.id
+    py_tok.sic = lex.sic
+    py_tok.dense = lex.dense
+    py_tok.shape = lex.shape
+    py_tok.prefix = lex.prefix
+    py_tok.suffix = lex.suffix
+    py_tok.length = lex.length
+    py_tok.cluster = lex.cluster
+    py_tok.pos_type = lex.pos_type
+    py_tok.prob = lex.prob
+    py_tok.sentiment = lex.sentiment
+    py_tok.morph = c_tok.morph
+    py_tok.pos = c_tok.pos
+    py_tok.fine_pos = c_tok.fine_pos
+    py_tok.idx = c_tok.idx
+    py_tok.lemma = c_tok.lemma
+    py_tok.sense = c_tok.sense
+    py_tok.dep_tag = c_tok.dep_tag
+    py_tok.head_offset = c_tok.head
+    py_tok.l_kids = c_tok.l_kids
+    py_tok.r_kids = c_tok.r_kids
+    return py_tok

 cdef class Token:
     """An individual token.
-    Internally, the Token is a tuple (i, tokens) --- it delegates to the Tokens
-    object.
     """
-    def __init__(self, Tokens tokens, int i):
-        self._seq = tokens
-        self.i = i
+    def __init__(self):
+        pass
+        #self._seq = tokens
+        #self.i = i

-    def __unicode__(self):
-        cdef const TokenC* t = &self._seq.data[self.i]
-        cdef int end_idx = t.idx + t.lex.length
-        if self.i + 1 == self._seq.length:
-            return self.string
-        if end_idx == t[1].idx:
-            return self.string
-        else:
-            return self.string + ' '
+    #def __unicode__(self):
+    #    cdef const TokenC* t = &self._seq.data[self.i]
+    #    cdef int end_idx = t.idx + t.lex.length
+    #    if self.i + 1 == self._seq.length:
+    #        return self.string
+    #    if end_idx == t[1].idx:
+    #        return self.string
+    #    else:
+    #        return self.string + ' '

     def __len__(self):
         """The number of unicode code-points in the original string.
@@ -201,87 +232,87 @@ cdef class Token:
        Returns:
            length (int):
        """
-        return self._seq.data[self.i].lex.length
+        return self.length

-    property idx:
-        """The index into the original string at which the token starts.
-        The following is supposed to always be true:
-            >>> original_string[token.idx:token.idx + len(token)] == token.string
-        """
-        def __get__(self):
-            return self._seq.data[self.i].idx
-    property cluster:
-        """The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering
-        Similar words have better-than-chance likelihood of having similar cluster
-        IDs, although the clustering is quite noisy. Cluster IDs make good features,
-        and help to make models slightly more robust to domain variation.
-        A common trick is to use only the first N bits of a cluster ID in a feature,
-        as the more general part of the hierarchical clustering is often more accurate
-        than the lower categories.
-        To assist in this, I encode the cluster IDs little-endian, to allow a simple
-        bit-mask:
-            >>> six_bits = cluster & (2**6 - 1)
-        """
-        def __get__(self):
-            return self._seq.data[self.i].lex.cluster
-    property string:
-        """The unicode string of the word, with no whitespace padding."""
-        def __get__(self):
-            cdef const TokenC* t = &self._seq.data[self.i]
-            if t.lex.sic == 0:
-                return ''
-            cdef bytes utf8string = self._seq.vocab.strings[t.lex.sic]
-            return utf8string.decode('utf8')
-    property lemma:
-        """The unicode string of the word's lemma. If no part-of-speech tag is
-        assigned, the most common part-of-speech tag of the word is used.
-        """
-        def __get__(self):
-            cdef const TokenC* t = &self._seq.data[self.i]
-            if t.lemma == 0:
-                return self.string
-            cdef bytes utf8string = self._seq.vocab.strings[t.lemma]
-            return utf8string.decode('utf8')
-    property dep_tag:
-        """The ID integer of the word's dependency label. If no parse has been
-        assigned, defaults to 0.
-        """
-        def __get__(self):
-            return self._seq.data[self.i].dep_tag
-    property pos:
-        """The ID integer of the word's part-of-speech tag, from the 13-tag
-        Google Universal Tag Set. Constants for this tag set are available in
-        spacy.typedefs.
-        """
-        def __get__(self):
-            return self._seq.data[self.i].pos
-    property fine_pos:
-        """The ID integer of the word's fine-grained part-of-speech tag, as assigned
-        by the tagger model. Fine-grained tags include morphological information,
-        and other distinctions, and allow a more accurate tagger to be trained.
-        """
-        def __get__(self):
-            return self._seq.data[self.i].fine_pos
-    property sic:
-        def __get__(self):
-            return self._seq.data[self.i].lex.sic
-    property head:
-        """The token predicted by the parser to be the head of the current token."""
-        def __get__(self):
-            cdef const TokenC* t = &self._seq.data[self.i]
-            return Token(self._seq, self.i + t.head)
+    #property idx:
+    #    """The index into the original string at which the token starts.
+    #    The following is supposed to always be true:
+    #        >>> original_string[token.idx:token.idx + len(token)] == token.string
+    #    """
+    #    def __get__(self):
+    #        return self._seq.data[self.i].idx
+    #property cluster:
+    #    """The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering
+    #    Similar words have better-than-chance likelihood of having similar cluster
+    #    IDs, although the clustering is quite noisy. Cluster IDs make good features,
+    #    and help to make models slightly more robust to domain variation.
+    #    A common trick is to use only the first N bits of a cluster ID in a feature,
+    #    as the more general part of the hierarchical clustering is often more accurate
+    #    than the lower categories.
+    #    To assist in this, I encode the cluster IDs little-endian, to allow a simple
+    #    bit-mask:
+    #        >>> six_bits = cluster & (2**6 - 1)
+    #    """
+    #    def __get__(self):
+    #        return self._seq.data[self.i].lex.cluster
+    #property string:
+    #    """The unicode string of the word, with no whitespace padding."""
+    #    def __get__(self):
+    #        cdef const TokenC* t = &self._seq.data[self.i]
+    #        if t.lex.sic == 0:
+    #            return ''
+    #        cdef bytes utf8string = self._seq.vocab.strings[t.lex.sic]
+    #        return utf8string.decode('utf8')
+    #property lemma:
+    #    """The unicode string of the word's lemma. If no part-of-speech tag is
+    #    assigned, the most common part-of-speech tag of the word is used.
+    #    """
+    #    def __get__(self):
+    #        cdef const TokenC* t = &self._seq.data[self.i]
+    #        if t.lemma == 0:
+    #            return self.string
+    #        cdef bytes utf8string = self._seq.vocab.strings[t.lemma]
+    #        return utf8string.decode('utf8')
+    #property dep_tag:
+    #    """The ID integer of the word's dependency label. If no parse has been
+    #    assigned, defaults to 0.
+    #    """
+    #    def __get__(self):
+    #        return self._seq.data[self.i].dep_tag
+    #property pos:
+    #    """The ID integer of the word's part-of-speech tag, from the 13-tag
+    #    Google Universal Tag Set. Constants for this tag set are available in
+    #    spacy.typedefs.
+    #    """
+    #    def __get__(self):
+    #        return self._seq.data[self.i].pos
+    #property fine_pos:
+    #    """The ID integer of the word's fine-grained part-of-speech tag, as assigned
+    #    by the tagger model. Fine-grained tags include morphological information,
+    #    and other distinctions, and allow a more accurate tagger to be trained.
+    #    """
+    #    def __get__(self):
+    #        return self._seq.data[self.i].fine_pos
+    #property sic:
+    #    def __get__(self):
+    #        return self._seq.data[self.i].lex.sic
+    #property head:
+    #    """The token predicted by the parser to be the head of the current token."""
+    #    def __get__(self):
+    #        cdef const TokenC* t = &self._seq.data[self.i]
+    #        return Token(self._seq, self.i + t.head)
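
After this change, indexing a Tokens object goes through cinit_token, so the returned Token carries plain copied values rather than live views into the C array; the bit-mask trick from the (now commented-out) cluster docstring still applies to the copied attribute. A hedged access sketch, assuming tokens was produced by the English pipeline above:

token = tokens[0]                        # built by cinit_token(&tokens.data[0])
six_bits = token.cluster & (2**6 - 1)    # little-endian Brown-cluster prefix, per the old docstring
print(token.idx, token.length, token.prob, token.pos)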

View File

@@ -4,16 +4,16 @@ from preshed.maps cimport PreshMap
 from cymem.cymem cimport Pool
 from murmurhash.mrmr cimport hash64
-from .structs cimport Lexeme, TokenC, UniStr
+from .structs cimport LexemeC, TokenC, UniStr
 from .typedefs cimport utf8_t, id_t, hash_t
 from .strings cimport StringStore
-cdef Lexeme EMPTY_LEXEME
+cdef LexemeC EMPTY_LEXEME
 cdef union LexemesOrTokens:
-    const Lexeme* const* lexemes
+    const LexemeC* const* lexemes
     TokenC* tokens
@@ -27,9 +27,9 @@ cdef class Vocab:
     cpdef public get_lex_props
     cdef Pool mem
     cpdef readonly StringStore strings
-    cdef vector[Lexeme*] lexemes
+    cdef vector[LexemeC*] lexemes
-    cdef const Lexeme* get(self, Pool mem, UniStr* s) except NULL
+    cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL
     cdef PreshMap _map

View File

@@ -2,20 +2,27 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
 from libc.string cimport memset
 from os import path
+import codecs
 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport init as lexeme_init
 from .strings cimport slice_unicode
-from .typedefs cimport flags_t
+from .strings cimport hash_string
 from .orth cimport word_shape

-memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
+DEF MAX_VEC_SIZE = 100000
+
+cdef float[MAX_VEC_SIZE] EMPTY_VEC
+memset(EMPTY_VEC, 0, sizeof(EMPTY_VEC))
+memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
+EMPTY_LEXEME.vec = EMPTY_VEC

-cpdef Lexeme init_lexeme(id_t i, unicode string, hash_t hashed,
+cdef LexemeC init_lexeme(id_t i, unicode string, hash_t hashed,
                          StringStore string_store, dict props) except *:
-    cdef Lexeme lex
+    cdef LexemeC lex
     lex.id = i
     lex.length = len(string)
     lex.sic = string_store[string]
@@ -28,13 +35,12 @@ cpdef Lexeme init_lexeme(id_t i, unicode string, hash_t hashed,
     lex.suffix = string_store[string[-3:]]
     lex.shape = string_store[word_shape(string)]
-    cdef object flags_val = props.get('flags', 0)
-    lex.flags = <flags_t>flags_val
+    lex.flags = props.get('flags', 0)
     return lex

 cdef class Vocab:
-    '''A map container for a language's Lexeme structs.
+    '''A map container for a language's LexemeC structs.
     '''
     def __init__(self, data_dir=None, get_lex_props=None):
         self.mem = Pool()
@@ -50,24 +56,25 @@ cdef class Vocab:
            if not path.isdir(data_dir):
                raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
            self.strings.load(path.join(data_dir, 'strings.txt'))
-            self.load(path.join(data_dir, 'lexemes.bin'))
+            self.load_lexemes(path.join(data_dir, 'lexemes.bin'))
+            #self.load_vectors(path.join(data_dir, 'deps.words'))

    def __len__(self):
        """The current number of lexemes stored."""
        return self.lexemes.size()

-    cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL:
-        '''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme
+    cdef const LexemeC* get(self, Pool mem, UniStr* string) except NULL:
+        '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
        if necessary, using memory acquired from the given pool. If the pool
        is the lexicon's own memory, the lexeme is saved in the lexicon.'''
-        cdef Lexeme* lex
-        lex = <Lexeme*>self._map.get(string.key)
+        cdef LexemeC* lex
+        lex = <LexemeC*>self._map.get(string.key)
        if lex != NULL:
            return lex
        if string.n < 3:
            mem = self.mem
        cdef unicode py_string = string.chars[:string.n]
-        lex = <Lexeme*>mem.alloc(sizeof(Lexeme), 1)
+        lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
        lex[0] = init_lexeme(self.lexemes.size(), py_string, string.key, self.strings,
                             self.get_lex_props(py_string))
        if mem is self.mem:
@@ -81,13 +88,13 @@
    def __getitem__(self, id_or_string):
        '''Retrieve a lexeme, given an int ID or a unicode string. If a previously
-        unseen unicode string is given, a new Lexeme is created and stored.
+        unseen unicode string is given, a new LexemeC is created and stored.
        This function relies on Cython's struct-to-dict conversion. Python clients
        receive a dict keyed by strings (byte or unicode, depending on Python 2/3),
-        with int values. Cython clients can instead receive a Lexeme struct value.
+        with int values. Cython clients can instead receive a LexemeC struct value.
        More efficient Cython access is provided by Lexicon.get, which returns
-        a Lexeme*.
+        a LexemeC*.
        Args:
            id_or_string (int or unicode): The integer ID of a word, or its unicode
@@ -96,24 +103,26 @@
            is raised.
        Returns:
-            lexeme (dict): A Lexeme struct instance, which Cython translates into
+            lexeme (dict): A LexemeC struct instance, which Cython translates into
                a dict if the operator is called from Python.
        '''
        if type(id_or_string) == int:
            if id_or_string >= self.lexemes.size():
                raise IndexError
-            return self.lexemes.at(id_or_string)[0]
+            return {}
+            #return self.lexemes.at(id_or_string)[0]
        cdef UniStr string
        slice_unicode(&string, id_or_string, 0, len(id_or_string))
-        cdef const Lexeme* lexeme = self.get(self.mem, &string)
-        return lexeme[0]
+        cdef const LexemeC* lexeme = self.get(self.mem, &string)
+        return {}
+        #return lexeme[0]

    def __setitem__(self, unicode uni_string, dict props):
        cdef UniStr s
        slice_unicode(&s, uni_string, 0, len(uni_string))
        # Cast through the const here, since we're allowed to change our own
-        # Lexemes.
-        lex = <Lexeme*><void*>self.get(self.mem, &s)
+        # LexemeCs.
+        lex = <LexemeC*><void*>self.get(self.mem, &s)
        lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props)

    def dump(self, loc):
@@ -128,30 +137,30 @@
            key = self._map.c_map.cells[i].key
            if key == 0:
                continue
-            lexeme = <Lexeme*>self._map.c_map.cells[i].value
+            lexeme = <LexemeC*>self._map.c_map.cells[i].value
            st = fwrite(&key, sizeof(key), 1, fp)
            assert st == 1
-            st = fwrite(lexeme, sizeof(Lexeme), 1, fp)
+            st = fwrite(lexeme, sizeof(LexemeC), 1, fp)
            assert st == 1
        st = fclose(fp)
        assert st == 0

-    def load(self, loc):
+    def load_lexemes(self, loc):
        if not path.exists(loc):
-            raise IOError('Lexemes file not found at %s' % loc)
+            raise IOError('LexemeCs file not found at %s' % loc)
        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
        cdef FILE* fp = fopen(<char*>bytes_loc, 'rb')
        assert fp != NULL
        cdef size_t st
-        cdef Lexeme* lexeme
+        cdef LexemeC* lexeme
        cdef hash_t key
        i = 0
        while True:
            st = fread(&key, sizeof(key), 1, fp)
            if st != 1:
                break
-            lexeme = <Lexeme*>self.mem.alloc(sizeof(Lexeme), 1)
-            st = fread(lexeme, sizeof(Lexeme), 1, fp)
+            lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
+            st = fread(lexeme, sizeof(LexemeC), 1, fp)
            if st != 1:
                break
            self._map.set(key, lexeme)
@@ -160,3 +169,29 @@
            self.lexemes[lexeme.id] = lexeme
            i += 1
        fclose(fp)
+
+    def load_vectors(self, loc):
+        cdef int i
+        cdef unicode line
+        cdef unicode word
+        cdef unicode val_str
+        cdef hash_t key
+        cdef LexemeC* lex
+        cdef float* vec
+        with codecs.open(loc, 'r', 'utf8') as file_:
+            for line in file_:
+                pieces = line.split()
+                word = pieces.pop(0)
+                if len(pieces) >= MAX_VEC_SIZE:
+                    sizes = (len(pieces), MAX_VEC_SIZE)
+                    msg = ("Your vector is %d elements."
+                           "The compile-time limit is %d elements." % sizes)
+                    raise ValueError(msg)
+                key = hash_string(word)
+                lex = <LexemeC*>self._map.get(key)
+                if lex is not NULL:
+                    vec = <float*>self.mem.alloc(len(pieces), sizeof(float))
+                    for i, val_str in enumerate(pieces):
+                        vec[i] = float(val_str)
+                    lex.vec = vec