diff --git a/spacy/de.pxd b/spacy/de.pxd deleted file mode 100644 index b4c8bf0c8..000000000 --- a/spacy/de.pxd +++ /dev/null @@ -1,42 +0,0 @@ -from spacy.spacy cimport Language -from spacy.word cimport Lexeme -cimport cython - - -cpdef size_t ALPHA -cpdef size_t DIGIT -cpdef size_t PUNCT -cpdef size_t SPACE -cpdef size_t LOWER -cpdef size_t UPPER -cpdef size_t TITLE -cpdef size_t ASCII - -cpdef size_t OFT_LOWER -cpdef size_t OFT_TITLE -cpdef size_t OFT_UPPER - -cpdef size_t PUNCT -cpdef size_t CONJ -cpdef size_t NUM -cpdef size_t N -cpdef size_t DET -cpdef size_t ADP -cpdef size_t ADJ -cpdef size_t ADV -cpdef size_t VERB -cpdef size_t NOUN -cpdef size_t PDT -cpdef size_t POS -cpdef size_t PRON -cpdef size_t PRT - -cdef class English(spacy.Language): - cdef int find_split(self, unicode word) - - -cdef English EN - - -cpdef Word lookup(unicode word) -cpdef list tokenize(unicode string) diff --git a/spacy/de.pyx b/spacy/de.pyx deleted file mode 100644 index 90c64f163..000000000 --- a/spacy/de.pyx +++ /dev/null @@ -1,126 +0,0 @@ -# cython: profile=True -# cython: embedsignature=True -'''Tokenize German text, using a scheme based on the Negra corpus. - -Tokenization is generally similar to English text, and the same set of orthographic -flags are used. - -An abbreviation list is used to handle common abbreviations. Hyphenated words -are not split, following the Treebank usage. -''' -from __future__ import unicode_literals - -from libc.stdint cimport uint64_t - -cimport spacy - -from spacy.orth import is_alpha, is_digit, is_punct, is_space, is_lower, is_ascii -from spacy.orth import canonicalize_case, get_string_shape, asciify, get_non_sparse -from spacy.common cimport check_punct - -# Python-readable flag constants --- can't read an enum from Python - -# Don't want to manually assign these numbers, or we'll insert one and have to -# change them all. -# Don't use "i", as we don't want it in the global scope! 
-cdef size_t __i = 0 - -ALPHA = __i; i += 1 -DIGIT = __i; __i += 1 -PUNCT = __i; __i += 1 -SPACE = __i; __i += 1 -LOWER = __i; __i += 1 -UPPER = __i; __i += 1 -TITLE = __i; __i += 1 -ASCII = __i; __i += 1 - -OFT_LOWER = __i; __i += 1 -OFT_UPPER = __i; __i += 1 -OFT_TITLE = __i; __i += 1 - -PUNCT = __i; __i += 1 -CONJ = __i; __i += 1 -NUM = __i; __i += 1 -X = __i; __i += 1 -DET = __i; __i += 1 -ADP = __i; __i += 1 -ADJ = __i; __i += 1 -ADV = __i; __i += 1 -VERB = __i; __i += 1 -NOUN = __i; __i += 1 -PDT = __i; __i += 1 -POS = __i; __i += 1 -PRON = __i; __i += 1 -PRT = __i; __i += 1 - - -# These are for the string views -__i = 0 -SIC = __i; __i += 1 -CANON_CASED = __i; __i += 1 -NON_SPARSE = __i; __i += 1 -SHAPE = __i; __i += 1 -NR_STRING_VIEWS = __i - - -def get_string_views(unicode string, lexeme): - views = ['' for _ in range(NR_STRING_VIEWS)] - views[SIC] = string - views[CANON_CASED] = canonicalize_case(string, lexeme) - views[SHAPE] = get_string_shape(string) - views[ASCIIFIED] = get_asciified(string) - views[FIXED_VOCAB] = get_non_sparse(string, views[ASCIIFIED], views[CANON_CASED], - views[SHAPE], lexeme) - return views - - -def set_orth_flags(unicode string, flags_t flags) - setters = [ - (ALPHA, is_alpha), - (DIGIT, is_digit), - (PUNCT, is_punct), - (SPACE, is_space), - (LOWER, is_lower), - (UPPER, is_upper), - (SPACE, is_space) - ] - - for bit, setter in setters: - if setter(string): - flags |= 1 << bit - return flags - - -cdef class German(spacy.Language): - cdef Lexeme new_lexeme(self, unicode string, cluster=0, case_stats=None, - tag_freqs=None): - return Lexeme(s, length, views, prob=prob, cluster=cluster, - flags=self.get_flags(string) - - cdef int find_split(self, unicode word): - cdef size_t length = len(word) - cdef int i = 0 - if word.startswith("'s") or word.startswith("'S"): - return 2 - # Contractions - if word.endswith("'s") and length >= 3: - return length - 2 - # Leading punctuation - if check_punct(word, 0, length): - return 1 - elif length >= 1: - # Split off all trailing punctuation characters - i = 0 - while i < length and not check_punct(word, i, length): - i += 1 - return i - - -DE = German('de') - -lookup = DE.lookup -tokenize = DE.tokenize -load_clusters = DE.load_clusters -load_unigram_probs = DE.load_unigram_probs -load_case_stats = DE.load_case_stats -load_tag_stats = DE.load_tag_stats diff --git a/spacy/en.pxd b/spacy/en.pxd index 5160a1177..a7c643eba 100644 --- a/spacy/en.pxd +++ b/spacy/en.pxd @@ -1,5 +1,4 @@ from spacy.lang cimport Language -from spacy.word cimport Lexeme from spacy.tokens cimport Tokens diff --git a/spacy/lang.pxd b/spacy/lang.pxd index 82078ff12..b03024847 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -1,14 +1,12 @@ -from libc.stdint cimport uint32_t -from libc.stdint cimport uint64_t from libcpp.vector cimport vector -from libc.stdint cimport uint64_t, int64_t from preshed.maps cimport PreshMap from cymem.cymem cimport Pool -from .word cimport Lexeme +from .typedefs cimport hash_t from .tokens cimport Tokens -from .lexeme cimport LexemeC +from .lexeme cimport Lexeme +from .utf8string cimport StringStore cdef extern from "Python.h": @@ -21,23 +19,25 @@ cdef extern from "Python.h": cdef struct String: Py_UNICODE* chars size_t n - uint64_t key + hash_t key cdef class Lexicon: cdef Pool mem cpdef readonly size_t size + cpdef readonly StringStore strings - cdef vector[LexemeC*] lexemes + cdef vector[Lexeme*] lexemes cpdef Lexeme lookup(self, unicode string) - cdef LexemeC* get(self, String* s) except NULL + cdef Lexeme* get(self, 
String* s) except NULL cdef PreshMap _dict cdef list _string_features cdef list _flag_features + cdef class Language: cdef Pool _mem cdef unicode name @@ -52,12 +52,12 @@ cdef class Language: cpdef Tokens tokenize(self, unicode text) cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1 - cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes, - vector[LexemeC*] *suffixes) except NULL + cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes, + vector[Lexeme*] *suffixes) except NULL cdef int _attach_tokens(self, Tokens tokens, int idx, String* string, - vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1 + vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except -1 cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1 cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1 cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1 - cdef int _save_cached(self, LexemeC** tokens, uint64_t key, int n) except -1 + cdef int _save_cached(self, Lexeme** tokens, hash_t key, int n) except -1 diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 0241413d0..0f8d21a24 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -13,22 +13,21 @@ import random from os import path import re -from .util import read_lang_data -from .tokens import Tokens -from .lexeme cimport LexemeC, get_lexeme_dict, lexeme_pack, lexeme_unpack -from .lexeme cimport LexStr_orig -from murmurhash.mrmr cimport hash64 - -from cpython.ref cimport Py_INCREF - from cymem.cymem cimport Pool - from cython.operator cimport preincrement as preinc from cython.operator cimport dereference as deref +from murmurhash.mrmr cimport hash64 from preshed.maps cimport PreshMap -from spacy import orth -from spacy import util + +from .lexeme cimport Lexeme +from .lexeme cimport from_dict as lexeme_from_dict +from .lexeme cimport from_string as lexeme_from_string + +from . import orth +from . import util +from .util import read_lang_data +from .tokens import Tokens cdef class Language: @@ -64,7 +63,7 @@ cdef class Language: tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes. 
""" cdef int length = len(string) - cdef Tokens tokens = Tokens(length) + cdef Tokens tokens = Tokens(self.lexicon.strings, length) if length == 0: return tokens cdef int i = 0 @@ -76,7 +75,7 @@ cdef class Language: if Py_UNICODE_ISSPACE(chars[i]) != in_ws: if start < i: string_slice(&span, chars, start, i) - lexemes = self.cache.get(span.key) + lexemes = self.cache.get(span.key) if lexemes != NULL: tokens.extend(start, lexemes, 0) else: @@ -88,7 +87,7 @@ cdef class Language: i += 1 if start < i: string_slice(&span, chars, start, i) - lexemes = self.cache.get(span.key) + lexemes = self.cache.get(span.key) if lexemes != NULL: tokens.extend(start, lexemes, 0) else: @@ -96,9 +95,9 @@ cdef class Language: return tokens cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1: - cdef vector[LexemeC*] prefixes - cdef vector[LexemeC*] suffixes - cdef uint64_t orig_key + cdef vector[Lexeme*] prefixes + cdef vector[Lexeme*] suffixes + cdef hash_t orig_key cdef int orig_size orig_key = span.key orig_size = tokens.length @@ -106,8 +105,8 @@ cdef class Language: self._attach_tokens(tokens, start, span, &prefixes, &suffixes) self._save_cached(&tokens.lex[orig_size], orig_key, tokens.length - orig_size) - cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes, - vector[LexemeC*] *suffixes) except NULL: + cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes, + vector[Lexeme*] *suffixes) except NULL: cdef size_t i cdef String prefix cdef String suffix @@ -150,15 +149,15 @@ cdef class Language: cdef int _attach_tokens(self, Tokens tokens, int idx, String* string, - vector[LexemeC*] *prefixes, - vector[LexemeC*] *suffixes) except -1: + vector[Lexeme*] *prefixes, + vector[Lexeme*] *suffixes) except -1: cdef int split - cdef LexemeC** lexemes - cdef LexemeC* lexeme + cdef Lexeme** lexemes + cdef Lexeme* lexeme cdef String span idx = tokens.extend(idx, prefixes.data(), prefixes.size()) if string.n != 0: - lexemes = self.cache.get(string.key) + lexemes = self.cache.get(string.key) if lexemes != NULL: idx = tokens.extend(idx, lexemes, 0) else: @@ -172,13 +171,13 @@ cdef class Language: idx = tokens.push_back(idx, self.lexicon.get(&span)) string_slice(&span, string.chars, split + 1, string.n) idx = tokens.push_back(idx, self.lexicon.get(&span)) - cdef vector[LexemeC*].reverse_iterator it = suffixes.rbegin() + cdef vector[Lexeme*].reverse_iterator it = suffixes.rbegin() while it != suffixes.rend(): idx = tokens.push_back(idx, deref(it)) preinc(it) - cdef int _save_cached(self, LexemeC** tokens, uint64_t key, int n) except -1: - lexemes = self._mem.alloc(n + 1, sizeof(LexemeC**)) + cdef int _save_cached(self, Lexeme** tokens, hash_t key, int n) except -1: + lexemes = self._mem.alloc(n + 1, sizeof(Lexeme**)) cdef int i for i in range(n): lexemes[i] = tokens[i] @@ -212,14 +211,14 @@ cdef class Language: token_rules (list): A list of (chunk, tokens) pairs, where chunk is a string and tokens is a list of strings. 
''' - cdef LexemeC** lexemes - cdef uint64_t hashed + cdef Lexeme** lexemes + cdef hash_t hashed cdef String string for uni_string, substrings in token_rules: - lexemes = self._mem.alloc(len(substrings) + 1, sizeof(LexemeC*)) + lexemes = self._mem.alloc(len(substrings) + 1, sizeof(Lexeme*)) for i, substring in enumerate(substrings): string_from_unicode(&string, substring) - lexemes[i] = self.lexicon.get(&string) + lexemes[i] = self.lexicon.get(&string) lexemes[i + 1] = NULL string_from_unicode(&string, uni_string) self.specials.set(string.key, lexemes) @@ -227,33 +226,29 @@ cdef class Language: cdef class Lexicon: - def __cinit__(self, lexemes): + def __init__(self, lexemes): self.mem = Pool() self._dict = PreshMap(2 ** 20) + self.strings = StringStore() self.size = 0 cdef String string - cdef dict lexeme_dict - cdef LexemeC* lexeme - for py_string, lexeme_dict in lexemes.iteritems(): - string_from_unicode(&string, py_string) - lexeme = self.mem.alloc(1, sizeof(LexemeC)) - lexeme_unpack(lexeme, lexeme_dict) - self._dict.set(string.key, lexeme) - self.lexemes.push_back(lexeme) - self.size += 1 + cdef Lexeme* lexeme + #for py_string, lexeme_dict in lexemes.iteritems(): + # string_from_unicode(&string, py_string) + # lexeme = self.mem.alloc(1, sizeof(Lexeme)) + # lexeme_from_dict(lexeme, lexeme_dict, self.strings) + # self._dict.set(string.key, lexeme) + # self.lexemes.push_back(lexeme) + # self.size += 1 - def __getitem__(self, size_t i): - return Lexeme(self.lexemes.at(i)) - - cdef LexemeC* get(self, String* string) except NULL: - cdef LexemeC* lex - lex = self._dict.get(string.key) + cdef Lexeme* get(self, String* string) except NULL: + cdef Lexeme* lex + lex = self._dict.get(string.key) if lex != NULL: return lex - lex = self.mem.alloc(1, sizeof(LexemeC)) - cdef unicode unicode_string = string.chars[:string.n] - lexeme_unpack(lex, get_lexeme_dict(self.size, unicode_string)) + lex = self.mem.alloc(1, sizeof(Lexeme)) + lexeme_from_string(lex, string.chars[:string.n], self.strings) self._dict.set(string.key, lex) self.lexemes.push_back(lex) self.size += 1 @@ -270,8 +265,8 @@ cdef class Lexicon: """ cdef String string string_from_unicode(&string, uni_string) - cdef LexemeC* lexeme = self.get(&string) - return Lexeme(lexeme) + cdef Lexeme* lexeme = self.get(&string) + return lexeme[0] cdef void string_from_unicode(String* s, unicode uni): diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index fe68a96ea..3cd65c995 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -1,94 +1,55 @@ from .typedefs cimport hash_t, utf8_t, flag_t, id_t -from cymem.cymem cimport Pool +from thinc.typedefs cimport atom_t + +from .utf8string cimport StringStore cpdef flag_t OOV_DIST_FLAGS +# Flags +cpdef enum: + IS_ALPHA + IS_ASCII + IS_DIGIT + IS_LOWER + IS_PUNCT + IS_SPACE + IS_TITLE + IS_UPPER -cpdef enum LexInts: - LexInt_id - LexInt_length - LexInt_cluster - LexInt_pos - LexInt_supersense - LexInt_N + OFT_LOWER + OFT_TITLE + OFT_UPPER -cpdef enum LexFloats: - LexFloat_prob - LexFloat_sentiment - LexFloat_N +cdef struct Lexeme: + atom_t id + atom_t length + + atom_t norm + atom_t shape + atom_t vocab10k + atom_t asciied + atom_t prefix + atom_t suffix + + atom_t cluster + atom_t pos + atom_t supersense + + float prob + + flag_t flags -cpdef enum LexStrs: - LexStr_orig - LexStr_norm - LexStr_shape - LexStr_unsparse - LexStr_asciied - LexStr_pre - LexStr_suff - LexStr_N +cdef Lexeme EMPTY_LEXEME -cpdef enum LexOrthFlags: - LexOrth_alpha - LexOrth_ascii - LexOrth_digit - LexOrth_lower - LexOrth_punct - 
LexOrth_space - LexOrth_title - LexOrth_upper - LexOrth_N +cdef int from_string(Lexeme* lex, unicode string, StringStore store) except -1 -cpdef enum LexDistFlags: - LexDist_adj - LexDist_adp - LexDist_adv - LexDist_conj - LexDist_det - LexDist_noun - LexDist_num - LexDist_pdt - LexDist_pos - LexDist_pron - LexDist_prt - LexDist_punct - LexDist_verb - - LexDist_lower - LexDist_title - LexDist_upper - - LexDist_N +cdef int from_dict(Lexeme* lex, dict props, StringStore store) except -1 -cdef struct LexemeC: - int[LexInt_N] ints - float[LexFloat_N] floats - utf8_t[LexStr_N] strings - flag_t orth_flags - flag_t dist_flags - - -cdef LexemeC EMPTY_LEXEME - - -cpdef dict get_lexeme_dict(size_t i, unicode string) - -cdef char* intern_and_encode(unicode string, size_t* length) except NULL - -cdef int lexeme_get_int(LexemeC* lexeme, size_t i) except * - -cdef float lexeme_get_float(LexemeC* lexeme, size_t i) except * - -cdef unicode lexeme_get_string(LexemeC* lexeme, size_t i) - -cdef bint lexeme_check_orth_flag(LexemeC* lexeme, size_t flag_id) except * - -cdef bint lexeme_check_dist_flag(LexemeC* lexeme, size_t flag_id) except * - -cdef dict lexeme_pack(LexemeC* lexeme) -cdef int lexeme_unpack(LexemeC* lexeme, dict p) except -1 +cdef inline bint check_flag(Lexeme* lexeme, size_t flag_id) nogil: + return lexeme.flags & (1 << flag_id) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 1f1d793ad..d442a262e 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -5,106 +5,40 @@ from libc.string cimport memset import orth +from .utf8string cimport Utf8Str + OOV_DIST_FLAGS = 0 -memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) +memset(&EMPTY_LEXEME, 0, sizeof(Lexeme)) -cpdef dict get_lexeme_dict(size_t i, unicode string): - ints = [None for _ in range(LexInt_N)] - ints[LexInt_id] = i - ints[LexInt_length] = len(string) - ints[LexInt_cluster] = 0 - ints[LexInt_pos] = 0 - ints[LexInt_supersense] = 0 - - floats = [None for _ in range(LexFloat_N)] - floats[LexFloat_prob] = 0 - floats[LexFloat_sentiment] = 0 - strings = [None for _ in range(LexStr_N)] - strings[LexStr_orig] = string - strings[LexStr_norm] = strings[LexStr_orig] - strings[LexStr_shape] = orth.word_shape(string) - strings[LexStr_unsparse] = strings[LexStr_shape] - strings[LexStr_asciied] = orth.asciied(string) - strings[LexStr_pre] = string[0] - strings[LexStr_suff] = string[-3:] - - orth_flags = get_orth_flags(string) - dist_flags = OOV_DIST_FLAGS - - return {'ints': ints, 'floats': floats, 'strings': strings, - 'orth_flags': orth_flags, 'dist_flags': dist_flags} - -def get_orth_flags(unicode string): +def get_flags(unicode string): cdef flag_t flags = 0 - - flags |= orth.is_ascii(string) << LexOrth_ascii - flags |= orth.is_alpha(string) << LexOrth_alpha - flags |= orth.is_digit(string) << LexOrth_digit - flags |= orth.is_lower(string) << LexOrth_lower - flags |= orth.is_punct(string) << LexOrth_punct - flags |= orth.is_space(string) << LexOrth_space - flags |= orth.is_title(string) << LexOrth_title - flags |= orth.is_upper(string) << LexOrth_upper + flags |= orth.is_alpha(string) << IS_ALPHA + flags |= orth.is_ascii(string) << IS_ASCII + flags |= orth.is_digit(string) << IS_DIGIT + flags |= orth.is_lower(string) << IS_LOWER + flags |= orth.is_punct(string) << IS_PUNCT + flags |= orth.is_space(string) << IS_SPACE + flags |= orth.is_title(string) << IS_TITLE + flags |= orth.is_upper(string) << IS_UPPER return flags -def get_dist_flags(unicode string): - return 0 - - -cdef char* intern_and_encode(unicode string, size_t* length) except NULL: +cdef 
int from_string(Lexeme* lex, unicode string, StringStore store) except -1: cdef bytes byte_string = string.encode('utf8') - cdef bytes utf8_string = intern(byte_string) - Py_INCREF(utf8_string) - length[0] = len(utf8_string) - return utf8_string + cdef Utf8Str* orig_str = store.intern(byte_string, len(byte_string)) + lex.id = orig_str.i + lex.cluster = 0 + lex.length = len(string) + lex.flags = get_flags(string) + # TODO: Hook this up + #lex.norm = norm_str.i + #lex.shape = norm_str.i + #lex.asciied = asciied_str.i + #lex.prefix = prefix_str.i + #lex.suffix = suffix_str.i -cdef int lexeme_get_int(LexemeC* lexeme, size_t i) except *: - return lexeme.ints[i] - - -cdef float lexeme_get_float(LexemeC* lexeme, size_t i) except *: - return lexeme.floats[i] - - -cdef unicode lexeme_get_string(LexemeC* lexeme, size_t i): - cdef bytes byte_string = lexeme.strings[i] - return byte_string.decode('utf8') - - -cdef bint lexeme_check_orth_flag(LexemeC* lexeme, size_t flag_id) except *: - return lexeme.orth_flags & (1 << flag_id) - - -cdef bint lexeme_check_dist_flag(LexemeC* lexeme, size_t flag_id) except *: - return lexeme.dist_flags & (1 << flag_id) - - -cdef dict lexeme_pack(LexemeC* lex): - cdef dict packed = {} - packed['ints'] = [lex.ints[i] for i in range(LexInt_N)] - packed['floats'] = [lex.floats[i] for i in range(LexFloat_N)] - packed['strings'] = [lex.strings[i].decode('utf8') for i in range(LexStr_N)] - packed['orth_flags'] = lex.orth_flags - packed['dist_flags'] = lex.orth_flags - return packed - - -cdef int lexeme_unpack(LexemeC* lex, dict p) except -1: - cdef size_t i - cdef int lex_int - cdef float lex_float - cdef unicode string - for i, lex_int in enumerate(p['ints']): - lex.ints[i] = lex_int - for i, lex_float in enumerate(p['floats']): - lex.floats[i] = lex_float - cdef size_t _ - for i in range(LexStr_N): - lex_string = p['strings'][i] - lex.strings[i] = intern_and_encode(lex_string, &_) - lex.orth_flags = p['orth_flags'] - lex.dist_flags = p['dist_flags'] +cdef int from_dict(Lexeme* lex, dict props, StringStore stroe) except -1: + pass diff --git a/spacy/pos.pyx b/spacy/pos.pyx index 3fce25bc5..0e79cddd7 100644 --- a/spacy/pos.pyx +++ b/spacy/pos.pyx @@ -113,8 +113,8 @@ cpdef enum: CONTEXT_SIZE -cdef int get_atoms(atom_t* atoms, LexemeC* p2, LexemeC* p1, LexemeC* n0, LexemeC* n1, - LexemeC* n2, class_t prev_tag, class_t prev_prev_tag) except -1: +cdef int get_atoms(atom_t* atoms, Lexeme* p2, Lexeme* p1, Lexeme* n0, Lexeme* n1, + Lexeme* n2, class_t prev_tag, class_t prev_prev_tag) except -1: _fill_token(&atoms[P2i], p2) _fill_token(&atoms[P1i], p1) _fill_token(&atoms[N0i], n0) @@ -124,16 +124,16 @@ cdef int get_atoms(atom_t* atoms, LexemeC* p2, LexemeC* p1, LexemeC* n0, LexemeC atoms[P2t] = prev_prev_tag -cdef inline void _fill_token(atom_t* atoms, LexemeC* lex) nogil: - atoms[0] = lex.ints[LexInt_id] - atoms[1] = lex.ints[LexInt_cluster] - atoms[2] = lex.strings[LexStr_norm] - atoms[3] = lex.strings[LexStr_shape] - atoms[4] = lex.strings[LexStr_pre] - atoms[5] = lex.strings[LexStr_suff] +cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil: + atoms[0] = lex.id + atoms[1] = lex.cluster + atoms[2] = lex.norm + atoms[3] = lex.shape + atoms[4] = lex.prefix + atoms[5] = lex.suffix - atoms[6] = lex.dist_flags & (1 << LexDist_title) - atoms[7] = lex.dist_flags & (1 << LexDist_upper) + atoms[6] = lex.flags & (1 << OFT_TITLE) + atoms[7] = lex.flags & (1 << OFT_UPPER) TEMPLATES = ( diff --git a/spacy/pos_util.py b/spacy/pos_util.py index 5acb1fc64..5b020eaed 100644 --- 
a/spacy/pos_util.py +++ b/spacy/pos_util.py @@ -20,6 +20,8 @@ def realign_tagged(token_rules, tagged_line, sep='/'): def read_tagged(detoken_rules, file_, sep='/'): sentences = [] for line in file_: + if not line.strip(): + continue line = realign_tagged(detoken_rules, line, sep=sep) tokens, tags = _parse_line(line, sep) assert len(tokens) == len(tags) @@ -39,7 +41,7 @@ def _parse_line(line, sep): subtags.append('NULL') assert len(subtags) == len(subtokens), [t.string for t in subtokens] words.append(word) - tags.extend([Tagger.encode_pos(pos) for pos in subtags]) + tags.extend([Tagger.encode_pos(ptb_to_univ(pos)) for pos in subtags]) return EN.tokenize(' '.join(words)), tags @@ -53,3 +55,86 @@ def get_tagdict(train_sents): tagdict.setdefault(word, {}).setdefault(tag, 0) tagdict[word][tag] += 1 return tagdict + + +def ptb_to_univ(tag): + mapping = dict(tuple(line.split()) for line in """ +NULL NULL +HYPH . +ADD X +NFP . +AFX X +XX X +BES VERB +HVS VERB +GW X +! . +# . +$ . +'' . +( . +) . +, . +-LRB- . +-RRB- . +. . +: . +? . +CC CONJ +CD NUM +CD|RB X +DT DET +EX DET +FW X +IN ADP +IN|RP ADP +JJ ADJ +JJR ADJ +JJRJR ADJ +JJS ADJ +JJ|RB ADJ +JJ|VBG ADJ +LS X +MD VERB +NN NOUN +NNP NOUN +NNPS NOUN +NNS NOUN +NN|NNS NOUN +NN|SYM NOUN +NN|VBG NOUN +NP NOUN +PDT DET +POS PRT +PRP PRON +PRP$ PRON +PRP|VBP PRON +PRT PRT +RB ADV +RBR ADV +RBS ADV +RB|RP ADV +RB|VBG ADV +RN X +RP PRT +SYM X +TO PRT +UH X +VB VERB +VBD VERB +VBD|VBN VERB +VBG VERB +VBG|NN VERB +VBN VERB +VBP VERB +VBP|TO VERB +VBZ VERB +VP VERB +WDT DET +WH X +WP PRON +WP$ PRON +WRB ADV +`` .""".strip().split('\n')) + return mapping[tag] + diff --git a/spacy/ptb3.pxd b/spacy/ptb3.pxd deleted file mode 100644 index f39c18c81..000000000 --- a/spacy/ptb3.pxd +++ /dev/null @@ -1,5 +0,0 @@ -from spacy.lang cimport Language - - -cdef class PennTreebank3(Language): - cdef list _split(self, unicode split) diff --git a/spacy/ptb3.pyx b/spacy/ptb3.pyx deleted file mode 100644 index cd60e062a..000000000 --- a/spacy/ptb3.pyx +++ /dev/null @@ -1,161 +0,0 @@ -'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index, -so that strings can be retrieved from hashes. Use 64-bit hash values and -boldly assume no collisions. -''' -from __future__ import unicode_literals - - -from libc.stdint cimport uint64_t - - -cimport spacy - -import re - -from spacy import orth - -TAG_THRESH = 0.5 -UPPER_THRESH = 0.2 -LOWER_THRESH = 0.5 -TITLE_THRESH = 0.7 - -NR_FLAGS = 0 - -OFT_UPPER = NR_FLAGS; NR_FLAGS += 1 -OFT_LOWER = NR_FLAGS; NR_FLAGS += 1 -OFT_TITLE = NR_FLAGS; NR_FLAGS += 1 - -IS_ALPHA = NR_FLAGS; NR_FLAGS += 1 -IS_DIGIT = NR_FLAGS; NR_FLAGS += 1 -IS_PUNCT = NR_FLAGS; NR_FLAGS += 1 -IS_SPACE = NR_FLAGS; NR_FLAGS += 1 -IS_ASCII = NR_FLAGS; NR_FLAGS += 1 -IS_TITLE = NR_FLAGS; NR_FLAGS += 1 -IS_LOWER = NR_FLAGS; NR_FLAGS += 1 -IS_UPPER = NR_FLAGS; NR_FLAGS += 1 - -CAN_PUNCT = NR_FLAGS; NR_FLAGS += 1 -CAN_CONJ = NR_FLAGS; NR_FLAGS += 1 -CAN_NUM = NR_FLAGS; NR_FLAGS += 1 -CAN_DET = NR_FLAGS; NR_FLAGS += 1 -CAN_ADP = NR_FLAGS; NR_FLAGS += 1 -CAN_ADJ = NR_FLAGS; NR_FLAGS += 1 -CAN_ADV = NR_FLAGS; NR_FLAGS += 1 -CAN_VERB = NR_FLAGS; NR_FLAGS += 1 -CAN_NOUN = NR_FLAGS; NR_FLAGS += 1 -CAN_PDT = NR_FLAGS; NR_FLAGS += 1 -CAN_POS = NR_FLAGS; NR_FLAGS += 1 -CAN_PRON = NR_FLAGS; NR_FLAGS += 1 -CAN_PRT = NR_FLAGS; NR_FLAGS += 1 - - -# List of contractions adapted from Robert MacIntyre's tokenizer. 
-CONTRACTIONS2 = [re.compile(r"(?i)\b(can)(not)\b"), - re.compile(r"(?i)\b(d)('ye)\b"), - re.compile(r"(?i)\b(gim)(me)\b"), - re.compile(r"(?i)\b(gon)(na)\b"), - re.compile(r"(?i)\b(got)(ta)\b"), - re.compile(r"(?i)\b(lem)(me)\b"), - re.compile(r"(?i)\b(mor)('n)\b"), - re.compile(r"(?i)\b(wan)(na) ")] - -CONTRACTIONS3 = [re.compile(r"(?i) ('t)(is)\b"), - re.compile(r"(?i) ('t)(was)\b")] - -CONTRACTIONS4 = [re.compile(r"(?i)\b(whad)(dd)(ya)\b"), - re.compile(r"(?i)\b(wha)(t)(cha)\b")] - -def nltk_regex_tokenize(text): - # Implementation taken from NLTK 3.0, based on tokenizer.sed - - #starting quotes - text = re.sub(r'^\"', r'``', text) - text = re.sub(r'(``)', r' \1 ', text) - text = re.sub(r'([ (\[{<])"', r'\1 `` ', text) - - #punctuation - text = re.sub(r'([:,])([^\d])', r' \1 \2', text) - text = re.sub(r'\.\.\.', r' ... ', text) - text = re.sub(r'[;@#$%&]', r' \g<0> ', text) - text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1 \2\3 ', text) - text = re.sub(r'[?!]', r' \g<0> ', text) - - text = re.sub(r"([^'])' ", r"\1 ' ", text) - - #parens, brackets, etc. - text = re.sub(r'[\]\[\(\)\{\}\<\>]', r' \g<0> ', text) - text = re.sub(r'--', r' -- ', text) - - #add extra space to make things easier - text = " " + text + " " - - #ending quotes - text = re.sub(r'"', " '' ", text) - text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text) - - text = re.sub(r"([^' ])('[sS]|'[mM]|'[dD]|') ", r"\1 \2 ", text) - text = re.sub(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 ", - text) - - for regexp in CONTRACTIONS2: - text = regexp.sub(r' \1 \2 ', text) - for regexp in CONTRACTIONS3: - text = regexp.sub(r' \1 \2 ', text) - - # We are not using CONTRACTIONS4 since - # they are also commented out in the SED scripts - # for regexp in self.CONTRACTIONS4: - # text = regexp.sub(r' \1 \2 \3 ', text) - - return text.split() - - -cdef class PennTreebank3(Language): - """Fully PTB compatible English tokenizer, tightly coupled to lexicon. - - Attributes: - name (unicode): The two letter code used by Wikipedia for the language. - lexicon (Lexicon): The lexicon. Exposes the lookup method. - """ - - - def __cinit__(self, name): - flag_funcs = [0 for _ in range(NR_FLAGS)] - - flag_funcs[OFT_UPPER] = orth.oft_case('upper', UPPER_THRESH) - flag_funcs[OFT_LOWER] = orth.oft_case('lower', LOWER_THRESH) - flag_funcs[OFT_TITLE] = orth.oft_case('title', TITLE_THRESH) - - flag_funcs[IS_ALPHA] = orth.is_alpha - flag_funcs[IS_DIGIT] = orth.is_digit - flag_funcs[IS_PUNCT] = orth.is_punct - flag_funcs[IS_SPACE] = orth.is_space - flag_funcs[IS_TITLE] = orth.is_title - flag_funcs[IS_LOWER] = orth.is_lower - flag_funcs[IS_UPPER] = orth.is_upper - - flag_funcs[CAN_PUNCT] = orth.can_tag('PUNCT', TAG_THRESH) - flag_funcs[CAN_CONJ] = orth.can_tag('CONJ', TAG_THRESH) - flag_funcs[CAN_NUM] = orth.can_tag('NUM', TAG_THRESH) - flag_funcs[CAN_DET] = orth.can_tag('DET', TAG_THRESH) - flag_funcs[CAN_ADP] = orth.can_tag('ADP', TAG_THRESH) - flag_funcs[CAN_ADJ] = orth.can_tag('ADJ', TAG_THRESH) - flag_funcs[CAN_VERB] = orth.can_tag('VERB', TAG_THRESH) - flag_funcs[CAN_NOUN] = orth.can_tag('NOUN', TAG_THRESH) - flag_funcs[CAN_PDT] = orth.can_tag('PDT', TAG_THRESH) - flag_funcs[CAN_POS] = orth.can_tag('POS', TAG_THRESH) - flag_funcs[CAN_PRT] = orth.can_tag('PRT', TAG_THRESH) - - Language.__init__(self, name, flag_funcs) - - - cdef list _split(self, unicode chunk): - strings = nltk_regex_tokenize(chunk) - if strings[-1] == '.': - strings.pop() - strings[-1] += '.' 
- assert strings - return strings - - -PTB3 = PennTreebank3('ptb3') diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index b39324fe2..9847cdc3c 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -1,59 +1,49 @@ from cymem.cymem cimport Pool -from spacy.lexeme cimport LexemeC +from .lexeme cimport Lexeme +from .typedefs cimport flag_t +from .utf8string cimport StringStore + from thinc.typedefs cimport atom_t cdef class Tokens: cdef Pool mem + cdef StringStore _string_store - cdef LexemeC** _lex_ptr + cdef Lexeme** _lex_ptr cdef int* _idx_ptr cdef int* _pos_ptr - cdef LexemeC** lex + cdef Lexeme** lex cdef int* idx cdef int* pos cdef int length cdef int max_length - cdef int extend(self, int i, LexemeC** lexemes, int n) except -1 - cdef int push_back(self, int i, LexemeC* lexeme) except -1 + cdef int extend(self, int i, Lexeme** lexemes, int n) except -1 + cdef int push_back(self, int i, Lexeme* lexeme) except -1 - cpdef int id(self, size_t i) except -1 - cpdef float prob(self, size_t i) except 1 - cpdef int cluster(self, size_t i) except * - cpdef bint check_orth_flag(self, size_t i, size_t flag_id) except * - cpdef bint check_dist_flag(self, size_t i, size_t flag_id) except * - cpdef unicode string_view(self, size_t i, size_t view_id) - cpdef unicode string(self, size_t i) - cpdef unicode orig(self, size_t i) - cpdef unicode norm(self, size_t i) - cpdef unicode shape(self, size_t i) - cpdef unicode unsparse(self, size_t i) - cpdef unicode asciied(self, size_t i) - cpdef bint is_alpha(self, size_t i) except * - cpdef bint is_ascii(self, size_t i) except * - cpdef bint is_digit(self, size_t i) except * - cpdef bint is_lower(self, size_t i) except * - cpdef bint is_punct(self, size_t i) except * - cpdef bint is_space(self, size_t i) except * - cpdef bint is_title(self, size_t i) except * - cpdef bint is_upper(self, size_t i) except * - cpdef bint can_adj(self, size_t i) except * - cpdef bint can_adp(self, size_t i) except * - cpdef bint can_adv(self, size_t i) except * - cpdef bint can_conj(self, size_t i) except * - cpdef bint can_det(self, size_t i) except * - cpdef bint can_noun(self, size_t i) except * - cpdef bint can_num(self, size_t i) except * - cpdef bint can_pdt(self, size_t i) except * - cpdef bint can_pos(self, size_t i) except * - cpdef bint can_pron(self, size_t i) except * - cpdef bint can_prt(self, size_t i) except * - cpdef bint can_punct(self, size_t i) except * - cpdef bint can_verb(self, size_t i) except * - cpdef bint oft_lower(self, size_t i) except * - cpdef bint oft_title(self, size_t i) except * - cpdef bint oft_upper(self, size_t i) except * +cdef class Token: + cdef StringStore _string_store + cdef public int i + cdef public int idx + cdef public int pos + + cdef public atom_t id + cdef public atom_t cluster + cdef public atom_t length + cdef public atom_t lex_pos + cdef public atom_t lex_supersense + + cdef public atom_t norm + cdef public atom_t shape + cdef public atom_t vocab10k + cdef public atom_t asciied + cdef public atom_t prefix + cdef public atom_t suffix + + cdef public float prob + + cdef public flag_t flags diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index d20e6c498..56ffc343f 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -1,10 +1,6 @@ # cython: profile=True -from .word cimport Lexeme - from .lexeme cimport * -cimport numpy cimport cython -import numpy DEF PADDING = 5 @@ -34,7 +30,8 @@ cdef class Tokens: >>> tokens.can_noun(1) True """ - def __init__(self, string_length=0): + def __init__(self, StringStore string_store, 
string_length=0): + self._string_store = string_store if string_length >= 3: size = int(string_length / 3.0) else: @@ -43,7 +40,7 @@ cdef class Tokens: # Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds # However, we need to remember the true starting places, so that we can # realloc. - self._lex_ptr = self.mem.alloc(size + (PADDING*2), sizeof(LexemeC*)) + self._lex_ptr = self.mem.alloc(size + (PADDING*2), sizeof(Lexeme*)) self._idx_ptr = self.mem.alloc(size + (PADDING*2), sizeof(int)) self._pos_ptr = self.mem.alloc(size + (PADDING*2), sizeof(int)) self.lex = self._lex_ptr @@ -55,39 +52,26 @@ cdef class Tokens: self.lex += PADDING self.idx += PADDING self.pos += PADDING - self.max_length = size self.length = 0 def __getitem__(self, i): bounds_check(i, self.length, PADDING) - return Lexeme(self.lex[i]) + return Token(self._string_store, i, self.idx[i], self.pos[i], self.lex[i][0]) def __len__(self): return self.length - cdef int push_back(self, int idx, LexemeC* lexeme) except -1: + cdef int push_back(self, int idx, Lexeme* lexeme) except -1: if self.length == self.max_length: self._realloc(self.length * 2) self.lex[self.length] = lexeme self.idx[self.length] = idx self.pos[self.length] = 0 self.length += 1 - return idx + lexeme.ints[LexInt_length] + return idx + lexeme.length - def _realloc(self, new_size): - self.max_length = new_size - n = new_size + (PADDING * 2) - self._lex_ptr = self.mem.realloc(self._lex_ptr, n * sizeof(LexemeC*)) - self._idx_ptr = self.mem.realloc(self._idx_ptr, n * sizeof(int)) - self._pos_ptr = self.mem.realloc(self._pos_ptr, n * sizeof(int)) - self.lex = self._lex_ptr + PADDING - self.idx = self._idx_ptr + PADDING - self.pos = self._pos_ptr + PADDING - for i in range(self.length, self.max_length + PADDING): - self.lex[i] = &EMPTY_LEXEME - - cdef int extend(self, int idx, LexemeC** lexemes, int n) except -1: + cdef int extend(self, int idx, Lexeme** lexemes, int n) except -1: cdef int i if lexemes == NULL: return idx @@ -101,154 +85,43 @@ cdef class Tokens: idx = self.push_back(idx, lexemes[i]) return idx - cpdef int id(self, size_t i) except -1: - bounds_check(i, self.length, PADDING) - return self.lex[i].ints[LexInt_id] + def _realloc(self, new_size): + self.max_length = new_size + n = new_size + (PADDING * 2) + self._lex_ptr = self.mem.realloc(self._lex_ptr, n * sizeof(Lexeme*)) + self._idx_ptr = self.mem.realloc(self._idx_ptr, n * sizeof(int)) + self._pos_ptr = self.mem.realloc(self._pos_ptr, n * sizeof(int)) + self.lex = self._lex_ptr + PADDING + self.idx = self._idx_ptr + PADDING + self.pos = self._pos_ptr + PADDING + for i in range(self.length, self.max_length + PADDING): + self.lex[i] = &EMPTY_LEXEME - cpdef float prob(self, size_t i) except 1: - bounds_check(i, self.length, PADDING) - return self.lex[i].floats[LexFloat_prob] - cpdef int cluster(self, size_t i) except *: - bounds_check(i, self.length, PADDING) - return self.lex[i].ints[LexInt_cluster] +@cython.freelist(64) +cdef class Token: + def __init__(self, StringStore string_store, int i, int idx, int pos, dict lex): + self._string_store = string_store + self.i = i + self.idx = idx + self.pos = pos + + self.id = lex['id'] + self.cluster = lex['cluster'] + self.length = lex['length'] + self.lex_pos = lex['pos'] + self.lex_supersense = lex['supersense'] + self.norm = lex['norm'] + self.shape = lex['shape'] + self.vocab10k = lex['vocab10k'] + self.suffix = lex['asciied'] + self.prefix = lex['prefix'] - cpdef bint check_orth_flag(self, size_t i, size_t flag_id) except *: - 
bounds_check(i, self.length, PADDING) - return lexeme_check_orth_flag(self.lex[i], flag_id) + self.prob = lex['prob'] + self.flags = lex['flags'] - cpdef bint check_dist_flag(self, size_t i, size_t flag_id) except *: - bounds_check(i, self.length, PADDING) - return lexeme_check_dist_flag(self.lex[i], flag_id) + property string: + def __get__(self): + cdef bytes utf8string = self._string_store[self.id] + return utf8string.decode('utf8') - cpdef unicode string_view(self, size_t i, size_t view_id): - bounds_check(i, self.length, PADDING) - return lexeme_get_string(self.lex[i], view_id) - - # Provide accessor methods for the features supported by the language. - # Without these, clients have to use the underlying string_view and check_flag - # methods, which requires them to know the IDs. - - cpdef unicode string(self, size_t i): - bounds_check(i, self.length, PADDING) - return self.orig(i) - - cpdef unicode orig(self, size_t i): - bounds_check(i, self.length, PADDING) - cdef bytes utf8_string = self.lex[i].strings[LexStr_orig] - cdef unicode string = utf8_string.decode('utf8') - return string - - cpdef unicode norm(self, size_t i): - bounds_check(i, self.length, PADDING) - cdef bytes utf8_string = self.lex[i].strings[LexStr_norm] - cdef unicode string = utf8_string.decode('utf8') - return string - - cpdef unicode shape(self, size_t i): - bounds_check(i, self.length, PADDING) - return lexeme_get_string(self.lex[i], LexStr_shape) - - cpdef unicode unsparse(self, size_t i): - bounds_check(i, self.length, PADDING) - return lexeme_get_string(self.lex[i], LexStr_unsparse) - - cpdef unicode asciied(self, size_t i): - bounds_check(i, self.length, PADDING) - return lexeme_get_string(self.lex[i], LexStr_asciied) - - cpdef bint is_alpha(self, size_t i) except *: - bounds_check(i, self.length, PADDING) - return lexeme_check_orth_flag(self.lex[i], LexOrth_alpha) - - cpdef bint is_ascii(self, size_t i) except *: - bounds_check(i, self.length, PADDING) - return lexeme_check_orth_flag(self.lex[i], LexOrth_ascii) - - cpdef bint is_digit(self, size_t i) except *: - bounds_check(i, self.length, PADDING) - return lexeme_check_orth_flag(self.lex[i], LexOrth_digit) - - cpdef bint is_lower(self, size_t i) except *: - bounds_check(i, self.length, PADDING) - return lexeme_check_orth_flag(self.lex[i], LexOrth_lower) - - cpdef bint is_punct(self, size_t i) except *: - bounds_check(i, self.length, PADDING) - return lexeme_check_orth_flag(self.lex[i], LexOrth_punct) - - cpdef bint is_space(self, size_t i) except *: - bounds_check(i, self.length, PADDING) - return lexeme_check_orth_flag(self.lex[i], LexOrth_space) - - cpdef bint is_title(self, size_t i) except *: - bounds_check(i, self.length, PADDING) - return lexeme_check_orth_flag(self.lex[i], LexOrth_title) - - cpdef bint is_upper(self, size_t i) except *: - bounds_check(i, self.length, PADDING) - return lexeme_check_orth_flag(self.lex[i], LexOrth_upper) - - cpdef bint can_adj(self, size_t i) except *: - bounds_check(i, self.length, PADDING) - return lexeme_check_dist_flag(self.lex[i], LexDist_adj) - - cpdef bint can_adp(self, size_t i) except *: - bounds_check(i, self.length, PADDING) - return lexeme_check_dist_flag(self.lex[i], LexDist_adp) - - cpdef bint can_adv(self, size_t i) except *: - bounds_check(i, self.length, PADDING) - return lexeme_check_dist_flag(self.lex[i], LexDist_adv) - - cpdef bint can_conj(self, size_t i) except *: - bounds_check(i, self.length, PADDING) - return lexeme_check_dist_flag(self.lex[i], LexDist_conj) - - cpdef bint can_det(self, size_t 
i) except *: - bounds_check(i, self.length, PADDING) - return lexeme_check_dist_flag(self.lex[i], LexDist_det) - - cpdef bint can_noun(self, size_t i) except *: - bounds_check(i, self.length, PADDING) - return lexeme_check_dist_flag(self.lex[i], LexDist_noun) - - cpdef bint can_num(self, size_t i) except *: - bounds_check(i, self.length, PADDING) - return lexeme_check_dist_flag(self.lex[i], LexDist_num) - - cpdef bint can_pdt(self, size_t i) except *: - bounds_check(i, self.length, PADDING) - return lexeme_check_dist_flag(self.lex[i], LexDist_pdt) - - cpdef bint can_pos(self, size_t i) except *: - bounds_check(i, self.length, PADDING) - return lexeme_check_dist_flag(self.lex[i], LexDist_pos) - - cpdef bint can_pron(self, size_t i) except *: - bounds_check(i, self.length, PADDING) - return lexeme_check_dist_flag(self.lex[i], LexDist_pron) - - cpdef bint can_prt(self, size_t i) except *: - bounds_check(i, self.length, PADDING) - return lexeme_check_dist_flag(self.lex[i], LexDist_prt) - - cpdef bint can_punct(self, size_t i) except *: - bounds_check(i, self.length, PADDING) - return lexeme_check_dist_flag(self.lex[i], LexDist_punct) - - cpdef bint can_verb(self, size_t i) except *: - bounds_check(i, self.length, PADDING) - return lexeme_check_dist_flag(self.lex[i], LexDist_verb) - - cpdef bint oft_lower(self, size_t i) except *: - bounds_check(i, self.length, PADDING) - return lexeme_check_dist_flag(self.lex[i], LexDist_lower) - - cpdef bint oft_title(self, size_t i) except *: - bounds_check(i, self.length, PADDING) - return lexeme_check_dist_flag(self.lex[i], LexDist_title) - - cpdef bint oft_upper(self, size_t i) except *: - bounds_check(i, self.length, PADDING) - return lexeme_check_dist_flag(self.lex[i], LexDist_upper) diff --git a/spacy/word.pxd b/spacy/word.pxd deleted file mode 100644 index ac9c7bb0e..000000000 --- a/spacy/word.pxd +++ /dev/null @@ -1,12 +0,0 @@ -from .typedefs cimport hash_t, utf8_t, flag_t, id_t -from spacy.lexeme cimport LexemeC - -DEF MAX_FLAG = 64 - - -cdef class Lexeme: - cdef LexemeC* _c - - cpdef bint check_orth_flag(self, size_t flag_id) except * - cpdef bint check_dist_flag(self, size_t flag_id) except * - cpdef unicode string_view(self, size_t view_id) diff --git a/spacy/word.pyx b/spacy/word.pyx deleted file mode 100644 index ab4ee6b68..000000000 --- a/spacy/word.pyx +++ /dev/null @@ -1,80 +0,0 @@ -# cython: profile=True -# cython: embedsignature=True - -from .lexeme cimport lexeme_get_string -from .lexeme cimport lexeme_check_orth_flag, lexeme_check_dist_flag - -from .lexeme cimport * - - -cdef class Lexeme: - """A lexical type --- a word, punctuation symbol, whitespace sequence, etc - keyed by a case-sensitive unicode string. All tokens with the same string, - e.g. all instances of "dog", ",", "NASA" etc should be mapped to the same - Lexeme. - - You should avoid instantiating Lexemes directly, and instead use the - :py:meth:`space.lang.Language.tokenize` and :py:meth:`spacy.lang.Language.lookup` - methods on the global object exposed by the language you're working with, - e.g. :py:data:`spacy.en.EN`. - - Attributes: - string (unicode): - The unicode string. - - Implemented as a property; relatively expensive. - - length (size_t): - The number of unicode code-points in the string. - - prob (double): - An estimate of the word's unigram log probability. - - Probabilities are calculated from a large text corpus, and smoothed using - simple Good-Turing. Estimates are read from data/en/probabilities, and - can be replaced using spacy.en.load_probabilities. 
- - cluster (size_t): - An integer representation of the word's Brown cluster. - - A Brown cluster is an address into a binary tree, which gives some (noisy) - information about the word's distributional context. - - >>> strings = (u'pineapple', u'apple', u'dapple', u'scalable') - >>> print ["{0:b"} % lookup(s).cluster for s in strings] - ["100111110110", "100111100100", "01010111011001", "100111110110"] - - The clusterings are unideal, but often slightly useful. - "pineapple" and "apple" share a long prefix, indicating a similar meaning, - while "dapple" is totally different. On the other hand, "scalable" receives - the same cluster ID as "pineapple", which is not what we'd like. - """ - def __cinit__(self, size_t lexeme_addr): - self._c = lexeme_addr - - property string: - def __get__(self): - cdef bytes utf8_string = self._c.strings[LexStr_orig] - cdef unicode string = utf8_string.decode('utf8') - return string - - property prob: - def __get__(self): - return self._c.floats[LexFloat_prob] - - property cluster: - def __get__(self): - return self._c.ints[LexInt_cluster] - - property length: - def __get__(self): - return self._c.ints[LexInt_length] - - cpdef bint check_orth_flag(self, size_t flag_id) except *: - return lexeme_check_orth_flag(self._c, flag_id) - - cpdef bint check_dist_flag(self, size_t flag_id) except *: - return lexeme_check_dist_flag(self._c, flag_id) - - cpdef unicode string_view(self, size_t view_id): - return lexeme_get_string(self._c, view_id) diff --git a/tests/test_contractions.py b/tests/test_contractions.py index a3e89cb67..5a2eaf3a9 100644 --- a/tests/test_contractions.py +++ b/tests/test_contractions.py @@ -5,8 +5,8 @@ from spacy.en import EN def test_possess(): tokens = EN.tokenize("Mike's") - assert tokens[0].string == "Mike" - assert tokens[1].string == "'s" + assert EN.lexicon.strings[tokens[0].id] == "Mike" + assert EN.lexicon.strings[tokens[1].id] == "'s" assert len(tokens) == 2 diff --git a/tests/test_lexeme_flags.py b/tests/test_lexeme_flags.py index f6c77dc43..6b9c20d11 100644 --- a/tests/test_lexeme_flags.py +++ b/tests/test_lexeme_flags.py @@ -8,19 +8,17 @@ from spacy.lexeme import * def test_is_alpha(): the = EN.lexicon.lookup('the') - assert the.check_orth_flag(LexOrth_alpha) + assert the['flags'] & (1 << IS_ALPHA) year = EN.lexicon.lookup('1999') - assert not year.check_orth_flag(LexOrth_alpha) + assert not year['flags'] & (1 << IS_ALPHA) mixed = EN.lexicon.lookup('hello1') - assert not mixed.check_orth_flag(LexOrth_alpha) + assert not mixed['flags'] & (1 << IS_ALPHA) def test_is_digit(): the = EN.lexicon.lookup('the') - assert not the.check_orth_flag(LexOrth_digit) + assert not the['flags'] & (1 << IS_DIGIT) year = EN.lexicon.lookup('1999') - assert year.check_orth_flag(LexOrth_digit) + assert year['flags'] & (1 << IS_DIGIT) mixed = EN.lexicon.lookup('hello1') - assert not mixed.check_orth_flag(LexOrth_digit) - - + assert not mixed['flags'] & (1 << IS_DIGIT) diff --git a/tests/test_orth.py b/tests/test_orth.py deleted file mode 100644 index fb6f56b94..000000000 --- a/tests/test_orth.py +++ /dev/null @@ -1,27 +0,0 @@ -from __future__ import unicode_literals - -import pytest - -import spacy.word -from spacy.en import EN -from spacy.lexeme import * - - -@pytest.fixture -def C3P0(): - return EN.lexicon.lookup("C3P0") - - -def test_shape(C3P0): - assert C3P0.string_view(LexStr_shape) == "XdXd" - - -def test_length(): - t = EN.lexicon.lookup('the') - assert t.length == 3 - t = EN.lexicon.lookup("n't") - assert t.length == 3 - t = 
EN.lexicon.lookup("'s") - assert t.length == 2 - t = EN.lexicon.lookup('Xxxx') - assert t.length == 4 diff --git a/tests/test_string_loading.py b/tests/test_string_loading.py index 5efcf7f9b..e2fa2429a 100644 --- a/tests/test_string_loading.py +++ b/tests/test_string_loading.py @@ -8,9 +8,9 @@ from spacy.en import EN def test_one(): tokens = EN.tokenize('Betty Botter bought a pound of butter.') - assert tokens.string(0) == 'Betty' + assert tokens[0].string == 'Betty' tokens2 = EN.tokenize('Betty also bought a pound of butter.') - assert tokens2.string(0) == 'Betty' + assert tokens2[0].string == 'Betty' diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 59a7fe524..73ac91261 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -5,41 +5,39 @@ from spacy.en import EN def test_single_word(): - lex_ids = EN.tokenize(u'hello') - assert lex_ids[0].string == EN.lexicon.lookup(u'hello').string + tokens = EN.tokenize(u'hello') + assert tokens[0].string == 'hello' def test_two_words(): - words = EN.tokenize('hello possums') - assert len(words) == 2 - assert words[0].string == EN.lexicon.lookup('hello').string - assert words[0].string != words[1].string + tokens = EN.tokenize('hello possums') + assert len(tokens) == 2 + assert tokens[0].string != tokens[1].string def test_punct(): tokens = EN.tokenize('hello, possums.') assert len(tokens) == 4 - assert tokens[0].string == EN.lexicon.lookup('hello').string - assert tokens[1].string == EN.lexicon.lookup(',').string - assert tokens[2].string == EN.lexicon.lookup('possums').string - assert tokens[1].string != EN.lexicon.lookup('hello').string + assert tokens[0].string == 'hello' + assert tokens[1].string == ',' + assert tokens[2].string == 'possums' + assert tokens[1].string != 'hello' def test_digits(): - lex_ids = EN.tokenize('The year: 1984.') - assert lex_ids.orig(3) == "1984" - assert len(lex_ids) == 5 - assert lex_ids[0].string == EN.lexicon.lookup('The').string - assert lex_ids[3].string == EN.lexicon.lookup('1984').string + tokens = EN.tokenize('The year: 1984.') + assert len(tokens) == 5 + assert tokens[0].id == EN.lexicon.lookup('The')['id'] + assert tokens[3].id == EN.lexicon.lookup('1984')['id'] def test_contraction(): - lex_ids = EN.tokenize("don't giggle") - assert len(lex_ids) == 3 - assert lex_ids[1].string == EN.lexicon.lookup("not").string - lex_ids = EN.tokenize("i said don't!") - assert len(lex_ids) == 5 - assert lex_ids[4].string == EN.lexicon.lookup('!').string + tokens = EN.tokenize("don't giggle") + assert len(tokens) == 3 + assert tokens[1].id == EN.lexicon.lookup("not")['id'] + tokens = EN.tokenize("i said don't!") + assert len(tokens) == 5 + assert tokens[4].id == EN.lexicon.lookup('!')['id'] def test_contraction_punct(): diff --git a/tests/test_vocab.py b/tests/test_vocab.py index 047df07b3..640fa5041 100644 --- a/tests/test_vocab.py +++ b/tests/test_vocab.py @@ -5,30 +5,19 @@ from spacy.en import EN def test_neq(): addr = EN.lexicon.lookup('Hello') - assert EN.lexicon.lookup('bye').string != addr.string + assert EN.lexicon.lookup('bye')['id'] != addr['id'] def test_eq(): addr = EN.lexicon.lookup('Hello') - assert EN.lexicon.lookup('Hello').string == addr.string - - -def test_round_trip(): - hello = EN.lexicon.lookup('Hello') - assert hello.string == 'Hello' + assert EN.lexicon.lookup('Hello')['id'] == addr['id'] def test_case_neq(): addr = EN.lexicon.lookup('Hello') - assert EN.lexicon.lookup('hello').string != addr.string + assert EN.lexicon.lookup('hello')['id'] != addr['id'] def 
test_punct_neq(): addr = EN.lexicon.lookup('Hello') - assert EN.lexicon.lookup('Hello,').string != addr.string - - -def test_short(): - addr = EN.lexicon.lookup('I') - assert addr.string == 'I' - assert addr.string != 'not' + assert EN.lexicon.lookup('Hello,')['id'] != addr['id']
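
A short usage sketch of the refactored API follows (plain Python, not part of the patch). It simply mirrors the updated tests above: Tokens now takes the lexicon's StringStore, Token.string resolves text through that store, and orthographic properties are read by testing bit positions from the cpdef enum in lexeme.pxd. The specific strings used and the behaviour of lookup() on unseen words are assumptions drawn from those tests, so treat this as an illustration rather than a guaranteed interface.

    from spacy.en import EN
    from spacy.lexeme import IS_ALPHA, IS_DIGIT

    # Tokenize; each Token resolves its text through the shared StringStore.
    tokens = EN.tokenize(u'The year: 1984.')
    assert tokens[0].string == u'The'

    # Lexicon.lookup returns the Lexeme struct's fields (id, length, cluster,
    # flags, ...); orthographic checks are now plain bit tests on 'flags'
    # rather than the old check_orth_flag/check_dist_flag accessors.
    year = EN.lexicon.lookup(u'1984')
    assert year['flags'] & (1 << IS_DIGIT)
    assert not year['flags'] & (1 << IS_ALPHA)

    # Token ids index the same string store, so the original text can also be
    # recovered from the lexicon's strings table, and tokens can be compared
    # to lexicon entries by id instead of by string.
    assert EN.lexicon.strings[tokens[0].id] == 'The'
    assert tokens[3].id == EN.lexicon.lookup(u'1984')['id']

The design point the sketch illustrates is the one the patch makes throughout: per-token string views and flag accessors on Tokens are gone, and callers instead work with the Lexeme struct's atom ids plus a single flags bitfield, decoding text lazily through the StringStore only when a unicode string is actually needed.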