From ce2edd6312ee7fcf6bcafe235cb6cb4a1406d1e1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 12 Jan 2015 10:26:22 +1100 Subject: [PATCH] * Tmp commit. Refactoring to create a Python Lexeme class. --- spacy/en/__init__.py | 17 ++-- spacy/en/download.py | 2 +- spacy/en/pos.pxd | 4 +- spacy/en/pos.pyx | 4 +- spacy/lexeme.pxd | 10 +-- spacy/lexeme.pyx | 9 +- spacy/morphology.pxd | 2 +- spacy/strings.pxd | 3 + spacy/strings.pyx | 6 ++ spacy/structs.pxd | 6 +- spacy/tokenizer.pxd | 10 +-- spacy/tokenizer.pyx | 30 +++---- spacy/tokens.pxd | 41 +++++++-- spacy/tokens.pyx | 207 +++++++++++++++++++++++++------------------ spacy/vocab.pxd | 10 +-- spacy/vocab.pyx | 93 +++++++++++++------ 16 files changed, 281 insertions(+), 173 deletions(-) diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index 94ab36291..633ba48e4 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals from os import path +from .. import orth from ..vocab import Vocab from ..tokenizer import Tokenizer from ..syntax.parser import GreedyParser @@ -10,12 +11,10 @@ from .pos import POS_TAGS from .attrs import get_flags -DATA_DIR = path.join(path.dirname(__file__), 'data') - - def get_lex_props(string): return {'flags': get_flags(string), 'dense': 1} +LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data') class English(object): """The English NLP pipeline. @@ -44,16 +43,18 @@ class English(object): parser (spacy.syntax.parser.GreedyParser): A greedy shift-reduce dependency parser. """ - def __init__(self, data_dir=None): - if data_dir is None: - data_dir = path.join(path.dirname(__file__), 'data') + def __init__(self, data_dir=LOCAL_DATA_DIR): self._data_dir = data_dir self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab'), get_lex_props=get_lex_props) tag_names = list(POS_TAGS.keys()) tag_names.sort() - self.tokenizer = Tokenizer.from_dir(self.vocab, path.join(data_dir, 'tokenizer'), - POS_TAGS, tag_names) + if data_dir is None: + self.tokenizer = Tokenizer(self.vocab, {}, None, None, None, + POS_TAGS, tag_names) + else: + self.tokenizer = Tokenizer.from_dir(self.vocab, path.join(data_dir, 'tokenizer'), + POS_TAGS, tag_names) self.strings = self.vocab.strings self._tagger = None self._parser = None diff --git a/spacy/en/download.py b/spacy/en/download.py index 9f74f0620..709fd7cb4 100644 --- a/spacy/en/download.py +++ b/spacy/en/download.py @@ -4,7 +4,7 @@ import tarfile import shutil import requests -URL = 'https://s3-us-west-1.amazonaws.com/media.spacynlp.com/en.tgz' +PARSER_URL = 'https://s3-us-west-1.amazonaws.com/media.spacynlp.com/en.tgz' DEST_DIR = path.join(path.dirname(__file__), 'data', 'deps') diff --git a/spacy/en/pos.pxd b/spacy/en/pos.pxd index 223b7aef3..d3697b97e 100644 --- a/spacy/en/pos.pxd +++ b/spacy/en/pos.pxd @@ -3,7 +3,7 @@ from cymem.cymem cimport Pool from .._ml cimport Model from ..strings cimport StringStore -from ..structs cimport TokenC, Lexeme, Morphology, PosTag +from ..structs cimport TokenC, LexemeC, Morphology, PosTag from ..typedefs cimport univ_tag_t from .lemmatizer import Lemmatizer @@ -21,5 +21,5 @@ cdef class EnPosTagger: cdef readonly int n_tags cdef int set_morph(self, const int i, TokenC* tokens) except -1 - cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1 + cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1 diff --git a/spacy/en/pos.pyx b/spacy/en/pos.pyx index b3b5b8d4b..114aea2ce 100644 --- a/spacy/en/pos.pyx +++ b/spacy/en/pos.pyx @@ -12,7 +12,7 @@ from 
..typedefs cimport univ_tag_t from ..typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB from ..typedefs cimport X, PUNCT, EOL from ..typedefs cimport id_t -from ..structs cimport TokenC, Morphology, Lexeme +from ..structs cimport TokenC, Morphology, LexemeC from ..tokens cimport Tokens from ..morphology cimport set_morph_from_dict from .._ml cimport arg_max @@ -290,7 +290,7 @@ cdef class EnPosTagger: tokens[i].lemma = cached.lemma tokens[i].morph = cached.morph - cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1: + cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1: if self.lemmatizer is None: return lex.sic cdef bytes py_string = self.strings[lex.sic] diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 35826ef55..5f26ec266 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -1,21 +1,21 @@ from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t from .typedefs cimport ID, SIC, DENSE, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, POS_TYPE -from .structs cimport Lexeme +from .structs cimport LexemeC from .strings cimport StringStore -cdef Lexeme EMPTY_LEXEME +cdef LexemeC EMPTY_LEXEME -cpdef Lexeme init(id_t i, unicode string, hash_t hashed, StringStore store, +cdef LexemeC init(id_t i, unicode string, hash_t hashed, StringStore store, dict props) except * -cdef inline bint check_flag(const Lexeme* lexeme, attr_id_t flag_id) nogil: +cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil: return lexeme.flags & (1 << flag_id) -cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil: +cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil: if feat_name < (sizeof(flags_t) * 8): return check_flag(lex, feat_name) elif feat_name == ID: diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 1423f30c9..e77c90ead 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -7,12 +7,12 @@ from libc.string cimport memset from .orth cimport word_shape -memset(&EMPTY_LEXEME, 0, sizeof(Lexeme)) +memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) -cpdef Lexeme init(id_t i, unicode string, hash_t hashed, +cdef LexemeC init(id_t i, unicode string, hash_t hashed, StringStore string_store, dict props) except *: - cdef Lexeme lex + cdef LexemeC lex lex.id = i lex.length = len(string) lex.sic = string_store[string] @@ -27,3 +27,6 @@ cpdef Lexeme init(id_t i, unicode string, hash_t hashed, lex.flags = props.get('flags', 0) return lex + + + diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index f2cb22b74..5dfee4250 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -1,4 +1,4 @@ -from .structs cimport TokenC, Lexeme, Morphology, PosTag +from .structs cimport TokenC, Morphology, PosTag cdef int set_morph_from_dict(Morphology* morph, dict props) except -1 diff --git a/spacy/strings.pxd b/spacy/strings.pxd index 9c16cfe1c..178ae51b6 100644 --- a/spacy/strings.pxd +++ b/spacy/strings.pxd @@ -3,6 +3,9 @@ from preshed.maps cimport PreshMap from murmurhash.mrmr cimport hash64 from .structs cimport Utf8Str, UniStr +from .typedefs cimport hash_t + +cpdef hash_t hash_string(unicode string) except 0 cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) nogil: diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 67d375ed7..29afde45c 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -1,6 +1,7 @@ import codecs from libc.string cimport memcpy +from murmurhash.mrmr cimport hash64 from .typedefs cimport hash_t @@ -9,6 +10,11 @@ 
from .typedefs cimport hash_t SEPARATOR = '\n|-SEP-|\n' +cpdef hash_t hash_string(unicode string) except 0: + chars = string + return hash64(chars, len(string) * sizeof(Py_UNICODE), 0) + + """ cdef class SymbolMap: def __init__(self): diff --git a/spacy/structs.pxd b/spacy/structs.pxd index ee476eed6..8ddddf4d2 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -3,7 +3,9 @@ from libc.stdint cimport uint8_t, uint32_t from .typedefs cimport flags_t, attr_t, id_t, hash_t, univ_tag_t -cdef struct Lexeme: +cdef struct LexemeC: + const float* vec + flags_t flags attr_t id @@ -38,7 +40,7 @@ cdef struct PosTag: cdef struct TokenC: - const Lexeme* lex + const LexemeC* lex Morphology morph univ_tag_t pos int fine_pos diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index add47425c..2837a4c47 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -6,14 +6,14 @@ from preshed.maps cimport PreshMap from cymem.cymem cimport Pool from .typedefs cimport hash_t -from .structs cimport Lexeme, TokenC, Morphology, UniStr +from .structs cimport LexemeC, TokenC, Morphology, UniStr from .strings cimport StringStore from .tokens cimport Tokens from .vocab cimport Vocab, _Cached cdef union LexemesOrTokens: - const Lexeme* const* lexemes + const LexemeC* const* lexemes TokenC* tokens @@ -33,10 +33,10 @@ cdef class Tokenizer: cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1 cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1 - cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes, - vector[Lexeme*] *suffixes) except NULL + cdef UniStr* _split_affixes(self, UniStr* string, vector[LexemeC*] *prefixes, + vector[LexemeC*] *suffixes) except NULL cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string, - vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except -1 + vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1 cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1 cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1 cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1 diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index bede109c7..d0494917a 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -53,7 +53,7 @@ cdef class Tokenizer: cdef int idx = 0 for i, py_string in enumerate(strings): slice_unicode(&string_struct, py_string, 0, len(py_string)) - tokens.push_back(idx, self.vocab.get(tokens.mem, &string_struct)) + tokens.push_back(idx, self.vocab.get(tokens.mem, &string_struct)) idx += len(py_string) + 1 return tokens @@ -75,7 +75,7 @@ cdef class Tokenizer: string (unicode): The string to be tokenized. Returns: - tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes. + tokens (Tokens): A Tokens object, giving access to a sequence of LexemeCs. 
""" cdef int length = len(string) cdef Tokens tokens = Tokens(self.vocab, length) @@ -121,8 +121,8 @@ cdef class Tokenizer: return True cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1: - cdef vector[Lexeme*] prefixes - cdef vector[Lexeme*] suffixes + cdef vector[LexemeC*] prefixes + cdef vector[LexemeC*] suffixes cdef hash_t orig_key cdef int orig_size orig_key = span.key @@ -131,8 +131,8 @@ cdef class Tokenizer: self._attach_tokens(tokens, start, span, &prefixes, &suffixes) self._save_cached(&tokens.data[orig_size], orig_key, tokens.length - orig_size) - cdef UniStr* _split_affixes(self, UniStr* string, vector[const Lexeme*] *prefixes, - vector[const Lexeme*] *suffixes) except NULL: + cdef UniStr* _split_affixes(self, UniStr* string, vector[const LexemeC*] *prefixes, + vector[const LexemeC*] *suffixes) except NULL: cdef size_t i cdef UniStr prefix cdef UniStr suffix @@ -174,12 +174,12 @@ cdef class Tokenizer: return string cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string, - vector[const Lexeme*] *prefixes, - vector[const Lexeme*] *suffixes) except -1: + vector[const LexemeC*] *prefixes, + vector[const LexemeC*] *suffixes) except -1: cdef bint cache_hit cdef int split - cdef const Lexeme* const* lexemes - cdef Lexeme* lexeme + cdef const LexemeC* const* lexemes + cdef LexemeC* lexeme cdef UniStr span cdef int i if prefixes.size(): @@ -200,7 +200,7 @@ cdef class Tokenizer: idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span)) slice_unicode(&span, string.chars, split + 1, string.n) idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span)) - cdef vector[const Lexeme*].reverse_iterator it = suffixes.rbegin() + cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin() while it != suffixes.rend(): idx = tokens.push_back(idx, deref(it)) preinc(it) @@ -213,10 +213,10 @@ cdef class Tokenizer: cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) cached.length = n cached.is_lex = True - lexemes = self.mem.alloc(n, sizeof(Lexeme**)) + lexemes = self.mem.alloc(n, sizeof(LexemeC**)) for i in range(n): lexemes[i] = tokens[i].lex - cached.data.lexemes = lexemes + cached.data.lexemes = lexemes self._cache.set(key, cached) cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1: @@ -243,7 +243,7 @@ cdef class Tokenizer: cdef unicode form cdef unicode lemma cdef dict props - cdef Lexeme** lexemes + cdef LexemeC** lexemes cdef hash_t hashed cdef UniStr string for chunk, substrings in sorted(rules.items()): @@ -252,7 +252,7 @@ cdef class Tokenizer: form = props['F'] lemma = props.get("L", None) slice_unicode(&string, form, 0, len(form)) - tokens[i].lex = self.vocab.get(self.vocab.mem, &string) + tokens[i].lex = self.vocab.get(self.vocab.mem, &string) if lemma: tokens[i].lemma = self.vocab.strings[lemma] if 'pos' in props: diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index e50e688ac..35a7c2b63 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -5,13 +5,13 @@ from cython.view cimport array as cvarray from cymem.cymem cimport Pool from thinc.typedefs cimport atom_t -from .typedefs cimport flags_t, attr_id_t, attr_t -from .structs cimport Morphology, TokenC, Lexeme +from .typedefs cimport flags_t, attr_id_t, attr_t, univ_tag_t +from .structs cimport Morphology, TokenC, LexemeC from .vocab cimport Vocab from .strings cimport StringStore -ctypedef const Lexeme* const_Lexeme_ptr +ctypedef const LexemeC* const_Lexeme_ptr ctypedef TokenC* TokenC_ptr ctypedef fused LexemeOrToken: @@ -19,10 +19,10 @@ ctypedef fused 
LexemeOrToken: TokenC_ptr -cdef attr_t get_lex_attr(const Lexeme* lex, attr_id_t feat_name) nogil +cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil cdef attr_t get_token_attr(const TokenC* lex, attr_id_t feat_name) nogil -cdef inline bint check_flag(const Lexeme* lexeme, attr_id_t flag_id) nogil: +cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil: return lexeme.flags & (1 << flag_id) @@ -42,5 +42,32 @@ cdef class Tokens: cdef class Token: - cdef Tokens _seq - cdef readonly int i + cdef cvarray vec + + cdef readonly flags_t flags + + cdef readonly attr_t id + cdef readonly attr_t sic + cdef readonly attr_t dense + cdef readonly attr_t shape + cdef readonly attr_t prefix + cdef readonly attr_t suffix + + cdef readonly attr_t length + cdef readonly attr_t cluster + cdef readonly attr_t pos_type + + cdef readonly float prob + cdef readonly float sentiment + + cdef readonly Morphology morph + cdef readonly univ_tag_t pos + cdef readonly int fine_pos + cdef readonly int idx + cdef readonly int lemma + cdef readonly int sense + cdef readonly int dep_tag + + cdef readonly int head_offset + cdef readonly uint32_t l_kids + cdef readonly uint32_t r_kids diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 0d8bc91b0..7e73ab4f8 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -32,7 +32,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: return get_lex_attr(token.lex, feat_name) -cdef attr_t get_lex_attr(const Lexeme* lex, attr_id_t feat_name) nogil: +cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil: if feat_name < (sizeof(flags_t) * 8): return check_flag(lex, feat_name) elif feat_name == ID: @@ -85,7 +85,7 @@ cdef class Tokens: token (Token): """ bounds_check(i, self.length, PADDING) - return Token(self, i) + return cinit_token(&self.data[i]) def __iter__(self): """Iterate over the tokens. @@ -174,26 +174,57 @@ cdef class Tokens: self.data[i].lex = &EMPTY_LEXEME -@cython.freelist(64) +cdef Token cinit_token(const TokenC* c_tok): + cdef const LexemeC* lex = c_tok.lex + cdef Token py_tok = Token.__new__(Token) + + cyarr = cvarray(shape=(300,), itemsize=sizeof(float), format="i") + py_tok.vec = cyarr + + py_tok.flags = lex.flags + py_tok.id = lex.id + py_tok.sic = lex.sic + py_tok.dense = lex.dense + py_tok.shape = lex.shape + py_tok.prefix = lex.prefix + py_tok.suffix = lex.suffix + py_tok.length = lex.length + py_tok.cluster = lex.cluster + py_tok.pos_type = lex.pos_type + + py_tok.prob = lex.prob + py_tok.sentiment = lex.sentiment + + py_tok.morph = c_tok.morph + py_tok.pos = c_tok.pos + py_tok.fine_pos = c_tok.fine_pos + py_tok.idx = c_tok.idx + py_tok.lemma = c_tok.lemma + py_tok.sense = c_tok.sense + py_tok.dep_tag = c_tok.dep_tag + py_tok.head_offset = c_tok.head + py_tok.l_kids = c_tok.l_kids + py_tok.r_kids = c_tok.r_kids + return py_tok + + cdef class Token: """An individual token. - - Internally, the Token is a tuple (i, tokens) --- it delegates to the Tokens - object. 
""" - def __init__(self, Tokens tokens, int i): - self._seq = tokens - self.i = i + def __init__(self): + pass + #self._seq = tokens + #self.i = i - def __unicode__(self): - cdef const TokenC* t = &self._seq.data[self.i] - cdef int end_idx = t.idx + t.lex.length - if self.i + 1 == self._seq.length: - return self.string - if end_idx == t[1].idx: - return self.string - else: - return self.string + ' ' + #def __unicode__(self): + # cdef const TokenC* t = &self._seq.data[self.i] + # cdef int end_idx = t.idx + t.lex.length + # if self.i + 1 == self._seq.length: + # return self.string + # if end_idx == t[1].idx: + # return self.string + # else: + # return self.string + ' ' def __len__(self): """The number of unicode code-points in the original string. @@ -201,87 +232,87 @@ cdef class Token: Returns: length (int): """ - return self._seq.data[self.i].lex.length + return self.length - property idx: - """The index into the original string at which the token starts. + #property idx: + # """The index into the original string at which the token starts. - The following is supposed to always be true: - - >>> original_string[token.idx:token.idx len(token) == token.string - """ - def __get__(self): - return self._seq.data[self.i].idx + # The following is supposed to always be true: + # + # >>> original_string[token.idx:token.idx len(token) == token.string + # """ + # def __get__(self): + # return self._seq.data[self.i].idx - property cluster: - """The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering - - Similar words have better-than-chance likelihood of having similar cluster - IDs, although the clustering is quite noisy. Cluster IDs make good features, - and help to make models slightly more robust to domain variation. + #property cluster: + # """The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering + # + # Similar words have better-than-chance likelihood of having similar cluster + # IDs, although the clustering is quite noisy. Cluster IDs make good features, + # and help to make models slightly more robust to domain variation. - A common trick is to use only the first N bits of a cluster ID in a feature, - as the more general part of the hierarchical clustering is often more accurate - than the lower categories. + # A common trick is to use only the first N bits of a cluster ID in a feature, + # as the more general part of the hierarchical clustering is often more accurate + # than the lower categories. - To assist in this, I encode the cluster IDs little-endian, to allow a simple - bit-mask: + # To assist in this, I encode the cluster IDs little-endian, to allow a simple + # bit-mask: - >>> six_bits = cluster & (2**6 - 1) - """ - def __get__(self): - return self._seq.data[self.i].lex.cluster + # >>> six_bits = cluster & (2**6 - 1) + # """ + # def __get__(self): + # return self._seq.data[self.i].lex.cluster - property string: - """The unicode string of the word, with no whitespace padding.""" - def __get__(self): - cdef const TokenC* t = &self._seq.data[self.i] - if t.lex.sic == 0: - return '' - cdef bytes utf8string = self._seq.vocab.strings[t.lex.sic] - return utf8string.decode('utf8') + #property string: + # """The unicode string of the word, with no whitespace padding.""" + # def __get__(self): + # cdef const TokenC* t = &self._seq.data[self.i] + # if t.lex.sic == 0: + # return '' + # cdef bytes utf8string = self._seq.vocab.strings[t.lex.sic] + # return utf8string.decode('utf8') - property lemma: - """The unicode string of the word's lemma. 
If no part-of-speech tag is - assigned, the most common part-of-speech tag of the word is used. - """ - def __get__(self): - cdef const TokenC* t = &self._seq.data[self.i] - if t.lemma == 0: - return self.string - cdef bytes utf8string = self._seq.vocab.strings[t.lemma] - return utf8string.decode('utf8') + #property lemma: + # """The unicode string of the word's lemma. If no part-of-speech tag is + # assigned, the most common part-of-speech tag of the word is used. + # """ + # def __get__(self): + # cdef const TokenC* t = &self._seq.data[self.i] + # if t.lemma == 0: + # return self.string + # cdef bytes utf8string = self._seq.vocab.strings[t.lemma] + # return utf8string.decode('utf8') - property dep_tag: - """The ID integer of the word's dependency label. If no parse has been - assigned, defaults to 0. - """ - def __get__(self): - return self._seq.data[self.i].dep_tag + #property dep_tag: + # """The ID integer of the word's dependency label. If no parse has been + # assigned, defaults to 0. + # """ + # def __get__(self): + # return self._seq.data[self.i].dep_tag - property pos: - """The ID integer of the word's part-of-speech tag, from the 13-tag - Google Universal Tag Set. Constants for this tag set are available in - spacy.typedefs. - """ - def __get__(self): - return self._seq.data[self.i].pos + #property pos: + # """The ID integer of the word's part-of-speech tag, from the 13-tag + # Google Universal Tag Set. Constants for this tag set are available in + # spacy.typedefs. + # """ + # def __get__(self): + # return self._seq.data[self.i].pos - property fine_pos: - """The ID integer of the word's fine-grained part-of-speech tag, as assigned - by the tagger model. Fine-grained tags include morphological information, - and other distinctions, and allow a more accurate tagger to be trained. - """ + #property fine_pos: + # """The ID integer of the word's fine-grained part-of-speech tag, as assigned + # by the tagger model. Fine-grained tags include morphological information, + # and other distinctions, and allow a more accurate tagger to be trained. 
+ # """ - def __get__(self): - return self._seq.data[self.i].fine_pos + # def __get__(self): + # return self._seq.data[self.i].fine_pos - property sic: - def __get__(self): - return self._seq.data[self.i].lex.sic + #property sic: + # def __get__(self): + # return self._seq.data[self.i].lex.sic - property head: - """The token predicted by the parser to be the head of the current token.""" - def __get__(self): - cdef const TokenC* t = &self._seq.data[self.i] - return Token(self._seq, self.i + t.head) + #property head: + # """The token predicted by the parser to be the head of the current token.""" + # def __get__(self): + # cdef const TokenC* t = &self._seq.data[self.i] + # return Token(self._seq, self.i + t.head) diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index dc3eb7aba..203d3c7a5 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -4,16 +4,16 @@ from preshed.maps cimport PreshMap from cymem.cymem cimport Pool from murmurhash.mrmr cimport hash64 -from .structs cimport Lexeme, TokenC, UniStr +from .structs cimport LexemeC, TokenC, UniStr from .typedefs cimport utf8_t, id_t, hash_t from .strings cimport StringStore -cdef Lexeme EMPTY_LEXEME +cdef LexemeC EMPTY_LEXEME cdef union LexemesOrTokens: - const Lexeme* const* lexemes + const LexemeC* const* lexemes TokenC* tokens @@ -27,9 +27,9 @@ cdef class Vocab: cpdef public get_lex_props cdef Pool mem cpdef readonly StringStore strings - cdef vector[Lexeme*] lexemes + cdef vector[LexemeC*] lexemes - cdef const Lexeme* get(self, Pool mem, UniStr* s) except NULL + cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL cdef PreshMap _map diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 6b6fee922..a63edb6b4 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -2,20 +2,27 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, FILE from libc.string cimport memset from os import path +import codecs from .lexeme cimport EMPTY_LEXEME from .lexeme cimport init as lexeme_init from .strings cimport slice_unicode -from .typedefs cimport flags_t +from .strings cimport hash_string from .orth cimport word_shape -memset(&EMPTY_LEXEME, 0, sizeof(Lexeme)) +DEF MAX_VEC_SIZE = 100000 -cpdef Lexeme init_lexeme(id_t i, unicode string, hash_t hashed, +cdef float[MAX_VEC_SIZE] EMPTY_VEC +memset(EMPTY_VEC, 0, sizeof(EMPTY_VEC)) +memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) +EMPTY_LEXEME.vec = EMPTY_VEC + + +cdef LexemeC init_lexeme(id_t i, unicode string, hash_t hashed, StringStore string_store, dict props) except *: - cdef Lexeme lex + cdef LexemeC lex lex.id = i lex.length = len(string) lex.sic = string_store[string] @@ -28,13 +35,12 @@ cpdef Lexeme init_lexeme(id_t i, unicode string, hash_t hashed, lex.suffix = string_store[string[-3:]] lex.shape = string_store[word_shape(string)] - cdef object flags_val = props.get('flags', 0) - lex.flags = flags_val + lex.flags = props.get('flags', 0) return lex cdef class Vocab: - '''A map container for a language's Lexeme structs. + '''A map container for a language's LexemeC structs. ''' def __init__(self, data_dir=None, get_lex_props=None): self.mem = Pool() @@ -50,24 +56,25 @@ cdef class Vocab: if not path.isdir(data_dir): raise IOError("Path %s is a file, not a dir -- cannot load Vocab." 
% data_dir) self.strings.load(path.join(data_dir, 'strings.txt')) - self.load(path.join(data_dir, 'lexemes.bin')) + self.load_lexemes(path.join(data_dir, 'lexemes.bin')) + #self.load_vectors(path.join(data_dir, 'deps.words')) def __len__(self): """The current number of lexemes stored.""" return self.lexemes.size() - cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL: - '''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme + cdef const LexemeC* get(self, Pool mem, UniStr* string) except NULL: + '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme if necessary, using memory acquired from the given pool. If the pool is the lexicon's own memory, the lexeme is saved in the lexicon.''' - cdef Lexeme* lex - lex = self._map.get(string.key) + cdef LexemeC* lex + lex = self._map.get(string.key) if lex != NULL: return lex if string.n < 3: mem = self.mem cdef unicode py_string = string.chars[:string.n] - lex = mem.alloc(sizeof(Lexeme), 1) + lex = mem.alloc(sizeof(LexemeC), 1) lex[0] = init_lexeme(self.lexemes.size(), py_string, string.key, self.strings, self.get_lex_props(py_string)) if mem is self.mem: @@ -81,13 +88,13 @@ cdef class Vocab: def __getitem__(self, id_or_string): '''Retrieve a lexeme, given an int ID or a unicode string. If a previously - unseen unicode string is given, a new Lexeme is created and stored. + unseen unicode string is given, a new LexemeC is created and stored. This function relies on Cython's struct-to-dict conversion. Python clients receive a dict keyed by strings (byte or unicode, depending on Python 2/3), - with int values. Cython clients can instead receive a Lexeme struct value. + with int values. Cython clients can instead receive a LexemeC struct value. More efficient Cython access is provided by Lexicon.get, which returns - a Lexeme*. + a LexemeC*. Args: id_or_string (int or unicode): The integer ID of a word, or its unicode @@ -96,24 +103,26 @@ cdef class Vocab: is raised. Returns: - lexeme (dict): A Lexeme struct instance, which Cython translates into + lexeme (dict): A LexemeC struct instance, which Cython translates into a dict if the operator is called from Python. ''' if type(id_or_string) == int: if id_or_string >= self.lexemes.size(): raise IndexError - return self.lexemes.at(id_or_string)[0] + return {} + #return self.lexemes.at(id_or_string)[0] cdef UniStr string slice_unicode(&string, id_or_string, 0, len(id_or_string)) - cdef const Lexeme* lexeme = self.get(self.mem, &string) - return lexeme[0] + cdef const LexemeC* lexeme = self.get(self.mem, &string) + return {} + #return lexeme[0] def __setitem__(self, unicode uni_string, dict props): cdef UniStr s slice_unicode(&s, uni_string, 0, len(uni_string)) # Cast through the const here, since we're allowed to change our own - # Lexemes. - lex = self.get(self.mem, &s) + # LexemeCs. 
+ lex = self.get(self.mem, &s) lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props) def dump(self, loc): @@ -128,30 +137,30 @@ cdef class Vocab: key = self._map.c_map.cells[i].key if key == 0: continue - lexeme = self._map.c_map.cells[i].value + lexeme = self._map.c_map.cells[i].value st = fwrite(&key, sizeof(key), 1, fp) assert st == 1 - st = fwrite(lexeme, sizeof(Lexeme), 1, fp) + st = fwrite(lexeme, sizeof(LexemeC), 1, fp) assert st == 1 st = fclose(fp) assert st == 0 - def load(self, loc): + def load_lexemes(self, loc): if not path.exists(loc): - raise IOError('Lexemes file not found at %s' % loc) + raise IOError('LexemeCs file not found at %s' % loc) cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc cdef FILE* fp = fopen(bytes_loc, 'rb') assert fp != NULL cdef size_t st - cdef Lexeme* lexeme + cdef LexemeC* lexeme cdef hash_t key i = 0 while True: st = fread(&key, sizeof(key), 1, fp) if st != 1: break - lexeme = self.mem.alloc(sizeof(Lexeme), 1) - st = fread(lexeme, sizeof(Lexeme), 1, fp) + lexeme = self.mem.alloc(sizeof(LexemeC), 1) + st = fread(lexeme, sizeof(LexemeC), 1, fp) if st != 1: break self._map.set(key, lexeme) @@ -160,3 +169,29 @@ cdef class Vocab: self.lexemes[lexeme.id] = lexeme i += 1 fclose(fp) + + def load_vectors(self, loc): + cdef int i + cdef unicode line + cdef unicode word + cdef unicode val_str + cdef hash_t key + cdef LexemeC* lex + cdef float* vec + + with codecs.open(loc, 'r', 'utf8') as file_: + for line in file_: + pieces = line.split() + word = pieces.pop(0) + if len(pieces) >= MAX_VEC_SIZE: + sizes = (len(pieces), MAX_VEC_SIZE) + msg = ("Your vector is %d elements." + "The compile-time limit is %d elements." % sizes) + raise ValueError(msg) + key = hash_string(word) + lex = self._map.get(key) + if lex is not NULL: + vec = self.mem.alloc(len(pieces), sizeof(float)) + for i, val_str in enumerate(pieces): + vec[i] = float(val_str) + lex.vec = vec
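

A minimal sketch of the plain-text vector format that the new Vocab.load_vectors parses, assuming only what the parsing loop above shows: one entry per line, the word first, then its float components, all whitespace-separated, with fewer than MAX_VEC_SIZE (100000) values per row. The write_vectors helper, the example words, and the file location are illustrative assumptions; only the loading code in the patch is authoritative.

    import codecs
    from os import path

    def write_vectors(loc, vectors):
        # vectors: dict mapping unicode word -> sequence of floats.
        # Hypothetical helper; mirrors what load_vectors expects to read back.
        with codecs.open(loc, 'w', 'utf8') as file_:
            for word, values in vectors.items():
                if len(values) >= 100000:  # load_vectors' compile-time MAX_VEC_SIZE
                    raise ValueError("Vector row exceeds the compile-time limit")
                file_.write(word + u' ' + u' '.join(str(v) for v in values) + u'\n')

    # e.g., assuming the location hinted at by the commented-out call above:
    # write_vectors(path.join(data_dir, 'vocab', 'deps.words'),
    #               {u'apple': [0.1, -0.2, 0.3]})
    # load_vectors only fills LexemeC.vec for words already present in the
    # vocab's map; unknown words keep EMPTY_VEC.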