Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-25 17:36:30 +03:00

Commit 08ce602243 (parent 168b2b8cb2)
* Large refactor, particularly to Python API

spacy/de.pxd (42 lines removed)
@@ -1,42 +0,0 @@
from spacy.spacy cimport Language
from spacy.word cimport Lexeme
cimport cython


cpdef size_t ALPHA
cpdef size_t DIGIT
cpdef size_t PUNCT
cpdef size_t SPACE
cpdef size_t LOWER
cpdef size_t UPPER
cpdef size_t TITLE
cpdef size_t ASCII

cpdef size_t OFT_LOWER
cpdef size_t OFT_TITLE
cpdef size_t OFT_UPPER

cpdef size_t PUNCT
cpdef size_t CONJ
cpdef size_t NUM
cpdef size_t N
cpdef size_t DET
cpdef size_t ADP
cpdef size_t ADJ
cpdef size_t ADV
cpdef size_t VERB
cpdef size_t NOUN
cpdef size_t PDT
cpdef size_t POS
cpdef size_t PRON
cpdef size_t PRT


cdef class English(spacy.Language):
    cdef int find_split(self, unicode word)


cdef English EN


cpdef Word lookup(unicode word)
cpdef list tokenize(unicode string)

spacy/de.pyx (126 lines removed)

@@ -1,126 +0,0 @@
# cython: profile=True
# cython: embedsignature=True
'''Tokenize German text, using a scheme based on the Negra corpus.

Tokenization is generally similar to English text, and the same set of orthographic
flags are used.

An abbreviation list is used to handle common abbreviations. Hyphenated words
are not split, following the Treebank usage.
'''
from __future__ import unicode_literals

from libc.stdint cimport uint64_t

cimport spacy

from spacy.orth import is_alpha, is_digit, is_punct, is_space, is_lower, is_ascii
from spacy.orth import canonicalize_case, get_string_shape, asciify, get_non_sparse
from spacy.common cimport check_punct

# Python-readable flag constants --- can't read an enum from Python

# Don't want to manually assign these numbers, or we'll insert one and have to
# change them all.
# Don't use "i", as we don't want it in the global scope!
cdef size_t __i = 0

ALPHA = __i; i += 1
DIGIT = __i; __i += 1
PUNCT = __i; __i += 1
SPACE = __i; __i += 1
LOWER = __i; __i += 1
UPPER = __i; __i += 1
TITLE = __i; __i += 1
ASCII = __i; __i += 1

OFT_LOWER = __i; __i += 1
OFT_UPPER = __i; __i += 1
OFT_TITLE = __i; __i += 1

PUNCT = __i; __i += 1
CONJ = __i; __i += 1
NUM = __i; __i += 1
X = __i; __i += 1
DET = __i; __i += 1
ADP = __i; __i += 1
ADJ = __i; __i += 1
ADV = __i; __i += 1
VERB = __i; __i += 1
NOUN = __i; __i += 1
PDT = __i; __i += 1
POS = __i; __i += 1
PRON = __i; __i += 1
PRT = __i; __i += 1


# These are for the string views
__i = 0
SIC = __i; __i += 1
CANON_CASED = __i; __i += 1
NON_SPARSE = __i; __i += 1
SHAPE = __i; __i += 1
NR_STRING_VIEWS = __i


def get_string_views(unicode string, lexeme):
    views = ['' for _ in range(NR_STRING_VIEWS)]
    views[SIC] = string
    views[CANON_CASED] = canonicalize_case(string, lexeme)
    views[SHAPE] = get_string_shape(string)
    views[ASCIIFIED] = get_asciified(string)
    views[FIXED_VOCAB] = get_non_sparse(string, views[ASCIIFIED], views[CANON_CASED],
                                        views[SHAPE], lexeme)
    return views


def set_orth_flags(unicode string, flags_t flags)
    setters = [
        (ALPHA, is_alpha),
        (DIGIT, is_digit),
        (PUNCT, is_punct),
        (SPACE, is_space),
        (LOWER, is_lower),
        (UPPER, is_upper),
        (SPACE, is_space)
    ]

    for bit, setter in setters:
        if setter(string):
            flags |= 1 << bit
    return flags


cdef class German(spacy.Language):
    cdef Lexeme new_lexeme(self, unicode string, cluster=0, case_stats=None,
                           tag_freqs=None):
        return Lexeme(s, length, views, prob=prob, cluster=cluster,
                      flags=self.get_flags(string)

    cdef int find_split(self, unicode word):
        cdef size_t length = len(word)
        cdef int i = 0
        if word.startswith("'s") or word.startswith("'S"):
            return 2
        # Contractions
        if word.endswith("'s") and length >= 3:
            return length - 2
        # Leading punctuation
        if check_punct(word, 0, length):
            return 1
        elif length >= 1:
            # Split off all trailing punctuation characters
            i = 0
            while i < length and not check_punct(word, i, length):
                i += 1
        return i


DE = German('de')

lookup = DE.lookup
tokenize = DE.tokenize
load_clusters = DE.load_clusters
load_unigram_probs = DE.load_unigram_probs
load_case_stats = DE.load_case_stats
load_tag_stats = DE.load_tag_stats
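
The deleted de.pyx above packs each boolean orthographic feature into one bit of an integer flags field. A minimal, self-contained Python sketch of that bit-packing idea (plain str methods stand in for the spacy.orth predicates; this is an illustration, not spaCy's API):

# Each orthographic property gets a bit position; the flags value is one integer.
ALPHA, DIGIT, PUNCT, SPACE, LOWER, UPPER, TITLE = range(7)

def set_orth_flags(string, flags=0):
    setters = [
        (ALPHA, str.isalpha),
        (DIGIT, str.isdigit),
        (LOWER, str.islower),
        (UPPER, str.isupper),
        (TITLE, str.istitle),
        (SPACE, str.isspace),
    ]
    for bit, setter in setters:
        if setter(string):
            flags |= 1 << bit      # set one bit per property that holds
    return flags

def check_flag(flags, bit):
    return bool(flags & (1 << bit))

flags = set_orth_flags(u"Hello")
assert check_flag(flags, ALPHA) and check_flag(flags, TITLE)
assert not check_flag(flags, DIGIT)
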
@@ -1,5 +1,4 @@
from spacy.lang cimport Language
from spacy.word cimport Lexeme
from spacy.tokens cimport Tokens

@@ -1,14 +1,12 @@
from libc.stdint cimport uint32_t
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector
from libc.stdint cimport uint64_t, int64_t

from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool

from .word cimport Lexeme
from .typedefs cimport hash_t
from .tokens cimport Tokens
from .lexeme cimport LexemeC
from .lexeme cimport Lexeme
from .utf8string cimport StringStore


cdef extern from "Python.h":

@@ -21,23 +19,25 @@ cdef extern from "Python.h":
cdef struct String:
    Py_UNICODE* chars
    size_t n
    uint64_t key
    hash_t key


cdef class Lexicon:
    cdef Pool mem
    cpdef readonly size_t size
    cpdef readonly StringStore strings

    cdef vector[LexemeC*] lexemes
    cdef vector[Lexeme*] lexemes

    cpdef Lexeme lookup(self, unicode string)
    cdef LexemeC* get(self, String* s) except NULL
    cdef Lexeme* get(self, String* s) except NULL

    cdef PreshMap _dict

    cdef list _string_features
    cdef list _flag_features


cdef class Language:
    cdef Pool _mem
    cdef unicode name

@@ -52,12 +52,12 @@ cdef class Language:
    cpdef Tokens tokenize(self, unicode text)

    cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1
    cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
                                vector[LexemeC*] *suffixes) except NULL
    cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
                                vector[Lexeme*] *suffixes) except NULL
    cdef int _attach_tokens(self, Tokens tokens, int idx, String* string,
                            vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
                            vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except -1
    cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
    cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1
    cdef int _save_cached(self, LexemeC** tokens, uint64_t key, int n) except -1
    cdef int _save_cached(self, Lexeme** tokens, hash_t key, int n) except -1
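
The String struct and cache declared above key a span of characters by a 64-bit hash, so a repeated chunk of text is only analysed once. A rough Python stand-in for that pattern (hashlib is used here purely for illustration; the real code uses murmurhash's hash64 and a PreshMap, and the "analysis" below is a placeholder):

import hashlib

def span_key(chars, start, end):
    # Reduce a substring to a fixed-width integer key.
    data = chars[start:end].encode('utf8')
    return int.from_bytes(hashlib.blake2b(data, digest_size=8).digest(), 'little')

cache = {}

def tokenize_span(chars, start, end):
    key = span_key(chars, start, end)
    if key in cache:                    # hit: reuse the cached analysis
        return cache[key]
    tokens = [chars[start:end]]         # placeholder analysis for the sketch
    cache[key] = tokens
    return tokens

assert tokenize_span(u"hello world", 0, 5) == [u"hello"]
assert span_key(u"hello world", 0, 5) in cache
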

@@ -13,22 +13,21 @@ import random
from os import path
import re

from .util import read_lang_data
from .tokens import Tokens
from .lexeme cimport LexemeC, get_lexeme_dict, lexeme_pack, lexeme_unpack
from .lexeme cimport LexStr_orig
from murmurhash.mrmr cimport hash64

from cpython.ref cimport Py_INCREF

from cymem.cymem cimport Pool

from cython.operator cimport preincrement as preinc
from cython.operator cimport dereference as deref

from murmurhash.mrmr cimport hash64
from preshed.maps cimport PreshMap
from spacy import orth
from spacy import util

from .lexeme cimport Lexeme
from .lexeme cimport from_dict as lexeme_from_dict
from .lexeme cimport from_string as lexeme_from_string

from . import orth
from . import util
from .util import read_lang_data
from .tokens import Tokens


cdef class Language:

@@ -64,7 +63,7 @@ cdef class Language:
        tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
        """
        cdef int length = len(string)
        cdef Tokens tokens = Tokens(length)
        cdef Tokens tokens = Tokens(self.lexicon.strings, length)
        if length == 0:
            return tokens
        cdef int i = 0

@@ -76,7 +75,7 @@ cdef class Language:
            if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
                if start < i:
                    string_slice(&span, chars, start, i)
                    lexemes = <LexemeC**>self.cache.get(span.key)
                    lexemes = <Lexeme**>self.cache.get(span.key)
                    if lexemes != NULL:
                        tokens.extend(start, lexemes, 0)
                    else:

@@ -88,7 +87,7 @@ cdef class Language:
            i += 1
        if start < i:
            string_slice(&span, chars, start, i)
            lexemes = <LexemeC**>self.cache.get(span.key)
            lexemes = <Lexeme**>self.cache.get(span.key)
            if lexemes != NULL:
                tokens.extend(start, lexemes, 0)
            else:

@@ -96,9 +95,9 @@ cdef class Language:
        return tokens

    cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1:
        cdef vector[LexemeC*] prefixes
        cdef vector[LexemeC*] suffixes
        cdef uint64_t orig_key
        cdef vector[Lexeme*] prefixes
        cdef vector[Lexeme*] suffixes
        cdef hash_t orig_key
        cdef int orig_size
        orig_key = span.key
        orig_size = tokens.length

@@ -106,8 +105,8 @@ cdef class Language:
        self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
        self._save_cached(&tokens.lex[orig_size], orig_key, tokens.length - orig_size)

    cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
                                vector[LexemeC*] *suffixes) except NULL:
    cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
                                vector[Lexeme*] *suffixes) except NULL:
        cdef size_t i
        cdef String prefix
        cdef String suffix

@@ -150,15 +149,15 @@ cdef class Language:

    cdef int _attach_tokens(self, Tokens tokens,
                            int idx, String* string,
                            vector[LexemeC*] *prefixes,
                            vector[LexemeC*] *suffixes) except -1:
                            vector[Lexeme*] *prefixes,
                            vector[Lexeme*] *suffixes) except -1:
        cdef int split
        cdef LexemeC** lexemes
        cdef LexemeC* lexeme
        cdef Lexeme** lexemes
        cdef Lexeme* lexeme
        cdef String span
        idx = tokens.extend(idx, prefixes.data(), prefixes.size())
        if string.n != 0:
            lexemes = <LexemeC**>self.cache.get(string.key)
            lexemes = <Lexeme**>self.cache.get(string.key)
            if lexemes != NULL:
                idx = tokens.extend(idx, lexemes, 0)
            else:

@@ -172,13 +171,13 @@ cdef class Language:
            idx = tokens.push_back(idx, self.lexicon.get(&span))
            string_slice(&span, string.chars, split + 1, string.n)
            idx = tokens.push_back(idx, self.lexicon.get(&span))
        cdef vector[LexemeC*].reverse_iterator it = suffixes.rbegin()
        cdef vector[Lexeme*].reverse_iterator it = suffixes.rbegin()
        while it != suffixes.rend():
            idx = tokens.push_back(idx, deref(it))
            preinc(it)

    cdef int _save_cached(self, LexemeC** tokens, uint64_t key, int n) except -1:
        lexemes = <LexemeC**>self._mem.alloc(n + 1, sizeof(LexemeC**))
    cdef int _save_cached(self, Lexeme** tokens, hash_t key, int n) except -1:
        lexemes = <Lexeme**>self._mem.alloc(n + 1, sizeof(Lexeme**))
        cdef int i
        for i in range(n):
            lexemes[i] = tokens[i]

@@ -212,14 +211,14 @@ cdef class Language:
        token_rules (list): A list of (chunk, tokens) pairs, where chunk is
            a string and tokens is a list of strings.
        '''
        cdef LexemeC** lexemes
        cdef uint64_t hashed
        cdef Lexeme** lexemes
        cdef hash_t hashed
        cdef String string
        for uni_string, substrings in token_rules:
            lexemes = <LexemeC**>self._mem.alloc(len(substrings) + 1, sizeof(LexemeC*))
            lexemes = <Lexeme**>self._mem.alloc(len(substrings) + 1, sizeof(Lexeme*))
            for i, substring in enumerate(substrings):
                string_from_unicode(&string, substring)
                lexemes[i] = <LexemeC*>self.lexicon.get(&string)
                lexemes[i] = <Lexeme*>self.lexicon.get(&string)
            lexemes[i + 1] = NULL
            string_from_unicode(&string, uni_string)
            self.specials.set(string.key, lexemes)

@@ -227,33 +226,29 @@ cdef class Language:


cdef class Lexicon:
    def __cinit__(self, lexemes):
    def __init__(self, lexemes):
        self.mem = Pool()
        self._dict = PreshMap(2 ** 20)
        self.strings = StringStore()
        self.size = 0
        cdef String string
        cdef dict lexeme_dict
        cdef LexemeC* lexeme
        for py_string, lexeme_dict in lexemes.iteritems():
            string_from_unicode(&string, py_string)
            lexeme = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
            lexeme_unpack(lexeme, lexeme_dict)
            self._dict.set(string.key, lexeme)
            self.lexemes.push_back(lexeme)
            self.size += 1
        cdef Lexeme* lexeme
        #for py_string, lexeme_dict in lexemes.iteritems():
        #    string_from_unicode(&string, py_string)
        #    lexeme = <Lexeme*>self.mem.alloc(1, sizeof(Lexeme))
        #    lexeme_from_dict(lexeme, lexeme_dict, self.strings)
        #    self._dict.set(string.key, lexeme)
        #    self.lexemes.push_back(lexeme)
        #    self.size += 1

    def __getitem__(self, size_t i):
        return Lexeme(<size_t>self.lexemes.at(i))

    cdef LexemeC* get(self, String* string) except NULL:
        cdef LexemeC* lex
        lex = <LexemeC*>self._dict.get(string.key)
    cdef Lexeme* get(self, String* string) except NULL:
        cdef Lexeme* lex
        lex = <Lexeme*>self._dict.get(string.key)
        if lex != NULL:
            return lex

        lex = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
        cdef unicode unicode_string = string.chars[:string.n]
        lexeme_unpack(lex, get_lexeme_dict(self.size, unicode_string))
        lex = <Lexeme*>self.mem.alloc(1, sizeof(Lexeme))
        lexeme_from_string(lex, string.chars[:string.n], self.strings)
        self._dict.set(string.key, lex)
        self.lexemes.push_back(lex)
        self.size += 1

@@ -270,8 +265,8 @@ cdef class Lexicon:
        """
        cdef String string
        string_from_unicode(&string, uni_string)
        cdef LexemeC* lexeme = self.get(&string)
        return Lexeme(<size_t>lexeme)
        cdef Lexeme* lexeme = self.get(&string)
        return lexeme[0]


cdef void string_from_unicode(String* s, unicode uni):
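
The _split_affixes/_attach_tokens flow above peels recognised prefixes and suffixes off a whitespace-delimited chunk and then emits prefixes, the remaining core, and the suffixes in reverse order. A plain-Python sketch under made-up affix sets (these are not spaCy's actual rules):

PREFIXES = {u'"', u'(', u'[', u'$'}
SUFFIXES = {u'"', u')', u']', u'.', u',', u'!', u'?'}

def split_affixes(chunk):
    prefixes, suffixes = [], []
    while chunk and chunk[0] in PREFIXES:
        prefixes.append(chunk[0])
        chunk = chunk[1:]
    while chunk and chunk[-1] in SUFFIXES:
        suffixes.append(chunk[-1])
        chunk = chunk[:-1]
    return prefixes, chunk, suffixes

def attach_tokens(chunk):
    prefixes, core, suffixes = split_affixes(chunk)
    tokens = list(prefixes)
    if core:
        tokens.append(core)
    tokens.extend(reversed(suffixes))   # suffixes were collected inside-out
    return tokens

assert attach_tokens(u'("Hello!")') == [u'(', u'"', u'Hello', u'!', u'"', u')']
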

spacy/lexeme.pxd (117 lines changed)

@@ -1,94 +1,55 @@
from .typedefs cimport hash_t, utf8_t, flag_t, id_t
from cymem.cymem cimport Pool

from thinc.typedefs cimport atom_t

from .utf8string cimport StringStore

cpdef flag_t OOV_DIST_FLAGS

# Flags
cpdef enum:
    IS_ALPHA
    IS_ASCII
    IS_DIGIT
    IS_LOWER
    IS_PUNCT
    IS_SPACE
    IS_TITLE
    IS_UPPER

cpdef enum LexInts:
    LexInt_id
    LexInt_length
    LexInt_cluster
    LexInt_pos
    LexInt_supersense
    LexInt_N
    OFT_LOWER
    OFT_TITLE
    OFT_UPPER


cpdef enum LexFloats:
    LexFloat_prob
    LexFloat_sentiment
    LexFloat_N
cdef struct Lexeme:
    atom_t id
    atom_t length

    atom_t norm
    atom_t shape
    atom_t vocab10k
    atom_t asciied
    atom_t prefix
    atom_t suffix

    atom_t cluster
    atom_t pos
    atom_t supersense

    float prob

    flag_t flags


cpdef enum LexStrs:
    LexStr_orig
    LexStr_norm
    LexStr_shape
    LexStr_unsparse
    LexStr_asciied
    LexStr_pre
    LexStr_suff
    LexStr_N
cdef Lexeme EMPTY_LEXEME


cpdef enum LexOrthFlags:
    LexOrth_alpha
    LexOrth_ascii
    LexOrth_digit
    LexOrth_lower
    LexOrth_punct
    LexOrth_space
    LexOrth_title
    LexOrth_upper
    LexOrth_N
cdef int from_string(Lexeme* lex, unicode string, StringStore store) except -1


cpdef enum LexDistFlags:
    LexDist_adj
    LexDist_adp
    LexDist_adv
    LexDist_conj
    LexDist_det
    LexDist_noun
    LexDist_num
    LexDist_pdt
    LexDist_pos
    LexDist_pron
    LexDist_prt
    LexDist_punct
    LexDist_verb

    LexDist_lower
    LexDist_title
    LexDist_upper

    LexDist_N
cdef int from_dict(Lexeme* lex, dict props, StringStore store) except -1


cdef struct LexemeC:
    int[<int>LexInt_N] ints
    float[<int>LexFloat_N] floats
    utf8_t[<int>LexStr_N] strings
    flag_t orth_flags
    flag_t dist_flags


cdef LexemeC EMPTY_LEXEME


cpdef dict get_lexeme_dict(size_t i, unicode string)

cdef char* intern_and_encode(unicode string, size_t* length) except NULL

cdef int lexeme_get_int(LexemeC* lexeme, size_t i) except *

cdef float lexeme_get_float(LexemeC* lexeme, size_t i) except *

cdef unicode lexeme_get_string(LexemeC* lexeme, size_t i)

cdef bint lexeme_check_orth_flag(LexemeC* lexeme, size_t flag_id) except *

cdef bint lexeme_check_dist_flag(LexemeC* lexeme, size_t flag_id) except *

cdef dict lexeme_pack(LexemeC* lexeme)
cdef int lexeme_unpack(LexemeC* lexeme, dict p) except -1
cdef inline bint check_flag(Lexeme* lexeme, size_t flag_id) nogil:
    return lexeme.flags & (1 << flag_id)
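
The refactor replaces the old LexemeC struct of parallel ints/floats/strings arrays (indexed by the LexInt_*/LexStr_* enums) with the flat Lexeme record declared above. A rough Python picture of that layout and of the check_flag bit test, with simplified types (the field names mirror the struct, but this is a sketch, not a spaCy class):

from dataclasses import dataclass

@dataclass
class LexemeSketch:
    id: int = 0
    length: int = 0
    norm: int = 0
    shape: int = 0
    vocab10k: int = 0
    asciied: int = 0
    prefix: int = 0
    suffix: int = 0
    cluster: int = 0
    pos: int = 0
    supersense: int = 0
    prob: float = 0.0
    flags: int = 0

def check_flag(lexeme, flag_id):
    # Same test as the cdef inline check_flag above: one bit per boolean feature.
    return bool(lexeme.flags & (1 << flag_id))

IS_ALPHA, IS_ASCII, IS_DIGIT = 0, 1, 2
lex = LexemeSketch(id=1, length=5, flags=(1 << IS_ALPHA) | (1 << IS_ASCII))
assert check_flag(lex, IS_ALPHA) and not check_flag(lex, IS_DIGIT)
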

spacy/lexeme.pyx (118 lines changed)

@@ -5,106 +5,40 @@ from libc.string cimport memset

import orth

from .utf8string cimport Utf8Str

OOV_DIST_FLAGS = 0

memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))

cpdef dict get_lexeme_dict(size_t i, unicode string):
    ints = [None for _ in range(LexInt_N)]
    ints[<int>LexInt_id] = i
    ints[<int>LexInt_length] = len(string)
    ints[<int>LexInt_cluster] = 0
    ints[<int>LexInt_pos] = 0
    ints[<int>LexInt_supersense] = 0

    floats = [None for _ in range(LexFloat_N)]
    floats[<int>LexFloat_prob] = 0
    floats[<int>LexFloat_sentiment] = 0

    strings = [None for _ in range(LexStr_N)]
    strings[<int>LexStr_orig] = string
    strings[<int>LexStr_norm] = strings[<int>LexStr_orig]
    strings[<int>LexStr_shape] = orth.word_shape(string)
    strings[<int>LexStr_unsparse] = strings[<int>LexStr_shape]
    strings[<int>LexStr_asciied] = orth.asciied(string)
    strings[<int>LexStr_pre] = string[0]
    strings[<int>LexStr_suff] = string[-3:]

    orth_flags = get_orth_flags(string)
    dist_flags = OOV_DIST_FLAGS

    return {'ints': ints, 'floats': floats, 'strings': strings,
            'orth_flags': orth_flags, 'dist_flags': dist_flags}

def get_orth_flags(unicode string):
def get_flags(unicode string):
    cdef flag_t flags = 0

    flags |= orth.is_ascii(string) << LexOrth_ascii
    flags |= orth.is_alpha(string) << LexOrth_alpha
    flags |= orth.is_digit(string) << LexOrth_digit
    flags |= orth.is_lower(string) << LexOrth_lower
    flags |= orth.is_punct(string) << LexOrth_punct
    flags |= orth.is_space(string) << LexOrth_space
    flags |= orth.is_title(string) << LexOrth_title
    flags |= orth.is_upper(string) << LexOrth_upper
    flags |= orth.is_alpha(string) << IS_ALPHA
    flags |= orth.is_ascii(string) << IS_ASCII
    flags |= orth.is_digit(string) << IS_DIGIT
    flags |= orth.is_lower(string) << IS_LOWER
    flags |= orth.is_punct(string) << IS_PUNCT
    flags |= orth.is_space(string) << IS_SPACE
    flags |= orth.is_title(string) << IS_TITLE
    flags |= orth.is_upper(string) << IS_UPPER
    return flags


def get_dist_flags(unicode string):
    return 0


cdef char* intern_and_encode(unicode string, size_t* length) except NULL:
cdef int from_string(Lexeme* lex, unicode string, StringStore store) except -1:
    cdef bytes byte_string = string.encode('utf8')
    cdef bytes utf8_string = intern(byte_string)
    Py_INCREF(utf8_string)
    length[0] = len(utf8_string)
    return <char*>utf8_string
    cdef Utf8Str* orig_str = store.intern(<char*>byte_string, len(byte_string))
    lex.id = orig_str.i
    lex.cluster = 0
    lex.length = len(string)
    lex.flags = get_flags(string)
    # TODO: Hook this up
    #lex.norm = norm_str.i
    #lex.shape = norm_str.i
    #lex.asciied = asciied_str.i
    #lex.prefix = prefix_str.i
    #lex.suffix = suffix_str.i


cdef int lexeme_get_int(LexemeC* lexeme, size_t i) except *:
    return lexeme.ints[i]


cdef float lexeme_get_float(LexemeC* lexeme, size_t i) except *:
    return lexeme.floats[i]


cdef unicode lexeme_get_string(LexemeC* lexeme, size_t i):
    cdef bytes byte_string = lexeme.strings[i]
    return byte_string.decode('utf8')


cdef bint lexeme_check_orth_flag(LexemeC* lexeme, size_t flag_id) except *:
    return lexeme.orth_flags & (1 << flag_id)


cdef bint lexeme_check_dist_flag(LexemeC* lexeme, size_t flag_id) except *:
    return lexeme.dist_flags & (1 << flag_id)


cdef dict lexeme_pack(LexemeC* lex):
    cdef dict packed = {}
    packed['ints'] = [lex.ints[i] for i in range(LexInt_N)]
    packed['floats'] = [lex.floats[i] for i in range(LexFloat_N)]
    packed['strings'] = [lex.strings[i].decode('utf8') for i in range(LexStr_N)]
    packed['orth_flags'] = lex.orth_flags
    packed['dist_flags'] = lex.orth_flags
    return packed


cdef int lexeme_unpack(LexemeC* lex, dict p) except -1:
    cdef size_t i
    cdef int lex_int
    cdef float lex_float
    cdef unicode string
    for i, lex_int in enumerate(p['ints']):
        lex.ints[i] = lex_int
    for i, lex_float in enumerate(p['floats']):
        lex.floats[i] = lex_float
    cdef size_t _
    for i in range(LexStr_N):
        lex_string = p['strings'][i]
        lex.strings[i] = intern_and_encode(lex_string, &_)
    lex.orth_flags = p['orth_flags']
    lex.dist_flags = p['dist_flags']
cdef int from_dict(Lexeme* lex, dict props, StringStore stroe) except -1:
    pass
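
from_string above interns the token's UTF-8 text in a StringStore and keeps only the returned integer index on the Lexeme, so the string can be recovered later from the id. A minimal Python stand-in for such a store (not the real spacy StringStore API):

class StringStoreSketch:
    def __init__(self):
        self._strings = []
        self._index = {}

    def intern(self, string):
        # Return a stable integer id, adding the string on first sight.
        if string not in self._index:
            self._index[string] = len(self._strings)
            self._strings.append(string)
        return self._index[string]

    def __getitem__(self, i):
        return self._strings[i]

store = StringStoreSketch()
i = store.intern(u"Hello")
assert store.intern(u"Hello") == i      # same id every time
assert store[i] == u"Hello"             # reverse lookup by id
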

@@ -113,8 +113,8 @@ cpdef enum:
    CONTEXT_SIZE


cdef int get_atoms(atom_t* atoms, LexemeC* p2, LexemeC* p1, LexemeC* n0, LexemeC* n1,
                   LexemeC* n2, class_t prev_tag, class_t prev_prev_tag) except -1:
cdef int get_atoms(atom_t* atoms, Lexeme* p2, Lexeme* p1, Lexeme* n0, Lexeme* n1,
                   Lexeme* n2, class_t prev_tag, class_t prev_prev_tag) except -1:
    _fill_token(&atoms[P2i], p2)
    _fill_token(&atoms[P1i], p1)
    _fill_token(&atoms[N0i], n0)

@@ -124,16 +124,16 @@ cdef int get_atoms(atom_t* atoms, LexemeC* p2, LexemeC* p1, LexemeC* n0, LexemeC
    atoms[P2t] = prev_prev_tag


cdef inline void _fill_token(atom_t* atoms, LexemeC* lex) nogil:
    atoms[0] = lex.ints[<int>LexInt_id]
    atoms[1] = lex.ints[<int>LexInt_cluster]
    atoms[2] = <atom_t>lex.strings[<int>LexStr_norm]
    atoms[3] = <atom_t>lex.strings[<int>LexStr_shape]
    atoms[4] = <atom_t>lex.strings[<int>LexStr_pre]
    atoms[5] = <atom_t>lex.strings[<int>LexStr_suff]
cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil:
    atoms[0] = lex.id
    atoms[1] = lex.cluster
    atoms[2] = lex.norm
    atoms[3] = lex.shape
    atoms[4] = lex.prefix
    atoms[5] = lex.suffix

    atoms[6] = lex.dist_flags & (1 << LexDist_title)
    atoms[7] = lex.dist_flags & (1 << LexDist_upper)
    atoms[6] = lex.flags & (1 << OFT_TITLE)
    atoms[7] = lex.flags & (1 << OFT_UPPER)


TEMPLATES = (
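
get_atoms/_fill_token above flatten a five-token window (p2, p1, n0, n1, n2) plus the two previous tags into a fixed-length array of integer atoms that the tagger's feature templates index into. A plain-Python sketch of the same idea, with a made-up, much smaller attribute set:

FIELDS_PER_TOKEN = 4   # id, cluster, shape, title-flag in this sketch

def fill_token(atoms, offset, tok):
    atoms[offset + 0] = tok['id']
    atoms[offset + 1] = tok['cluster']
    atoms[offset + 2] = tok['shape']
    atoms[offset + 3] = int(tok['is_title'])

def get_atoms(window, prev_tag, prev_prev_tag):
    atoms = [0] * (len(window) * FIELDS_PER_TOKEN + 2)
    for k, tok in enumerate(window):
        fill_token(atoms, k * FIELDS_PER_TOKEN, tok)
    atoms[-2] = prev_tag
    atoms[-1] = prev_prev_tag
    return atoms

tok = {'id': 7, 'cluster': 42, 'shape': 3, 'is_title': True}
assert len(get_atoms([tok] * 5, prev_tag=11, prev_prev_tag=9)) == 22
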

@@ -20,6 +20,8 @@ def realign_tagged(token_rules, tagged_line, sep='/'):
def read_tagged(detoken_rules, file_, sep='/'):
    sentences = []
    for line in file_:
        if not line.strip():
            continue
        line = realign_tagged(detoken_rules, line, sep=sep)
        tokens, tags = _parse_line(line, sep)
        assert len(tokens) == len(tags)

@@ -39,7 +41,7 @@ def _parse_line(line, sep):
            subtags.append('NULL')
        assert len(subtags) == len(subtokens), [t.string for t in subtokens]
        words.append(word)
        tags.extend([Tagger.encode_pos(pos) for pos in subtags])
        tags.extend([Tagger.encode_pos(ptb_to_univ(pos)) for pos in subtags])
    return EN.tokenize(' '.join(words)), tags


@@ -53,3 +55,86 @@ def get_tagdict(train_sents):
            tagdict.setdefault(word, {}).setdefault(tag, 0)
            tagdict[word][tag] += 1
    return tagdict


def ptb_to_univ(tag):
    mapping = dict(tuple(line.split()) for line in """
    NULL    NULL
    HYPH    .
    ADD     X
    NFP     .
    AFX     X
    XX      X
    BES     VERB
    HVS     VERB
    GW      X
    !       .
    #       .
    $       .
    ''      .
    (       .
    )       .
    ,       .
    -LRB-   .
    -RRB-   .
    .       .
    :       .
    ?       .
    CC      CONJ
    CD      NUM
    CD|RB   X
    DT      DET
    EX      DET
    FW      X
    IN      ADP
    IN|RP   ADP
    JJ      ADJ
    JJR     ADJ
    JJRJR   ADJ
    JJS     ADJ
    JJ|RB   ADJ
    JJ|VBG  ADJ
    LS      X
    MD      VERB
    NN      NOUN
    NNP     NOUN
    NNPS    NOUN
    NNS     NOUN
    NN|NNS  NOUN
    NN|SYM  NOUN
    NN|VBG  NOUN
    NP      NOUN
    PDT     DET
    POS     PRT
    PRP     PRON
    PRP$    PRON
    PRP|VBP PRON
    PRT     PRT
    RB      ADV
    RBR     ADV
    RBS     ADV
    RB|RP   ADV
    RB|VBG  ADV
    RN      X
    RP      PRT
    SYM     X
    TO      PRT
    UH      X
    VB      VERB
    VBD     VERB
    VBD|VBN VERB
    VBG     VERB
    VBG|NN  VERB
    VBN     VERB
    VBP     VERB
    VBP|TO  VERB
    VBZ     VERB
    VP      VERB
    WDT     DET
    WH      X
    WP      PRON
    WP$     PRON
    WRB     ADV
    ``      .""".strip().split('\n'))
    return mapping[tag]
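
ptb_to_univ above builds its lookup dict by splitting a whitespace table, then collapses each Penn Treebank tag onto a coarse universal tag. A trimmed, self-contained version of the same construction, for illustration only:

def ptb_to_univ_demo(tag):
    # Same parsing trick as above, on a small subset of the table.
    mapping = dict(tuple(line.split()) for line in """
    DT   DET
    JJ   ADJ
    NN   NOUN
    NNP  NOUN
    VBD  VERB
    VBZ  VERB
    .    .""".strip().split('\n'))
    return mapping[tag]

assert [ptb_to_univ_demo(t) for t in ['DT', 'JJ', 'NN', 'VBD', '.']] == \
       ['DET', 'ADJ', 'NOUN', 'VERB', '.']
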

@@ -1,5 +0,0 @@
from spacy.lang cimport Language


cdef class PennTreebank3(Language):
    cdef list _split(self, unicode split)

spacy/ptb3.pyx (161 lines removed)

@@ -1,161 +0,0 @@
'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
so that strings can be retrieved from hashes. Use 64-bit hash values and
boldly assume no collisions.
'''
from __future__ import unicode_literals


from libc.stdint cimport uint64_t


cimport spacy

import re

from spacy import orth

TAG_THRESH = 0.5
UPPER_THRESH = 0.2
LOWER_THRESH = 0.5
TITLE_THRESH = 0.7

NR_FLAGS = 0

OFT_UPPER = NR_FLAGS; NR_FLAGS += 1
OFT_LOWER = NR_FLAGS; NR_FLAGS += 1
OFT_TITLE = NR_FLAGS; NR_FLAGS += 1

IS_ALPHA = NR_FLAGS; NR_FLAGS += 1
IS_DIGIT = NR_FLAGS; NR_FLAGS += 1
IS_PUNCT = NR_FLAGS; NR_FLAGS += 1
IS_SPACE = NR_FLAGS; NR_FLAGS += 1
IS_ASCII = NR_FLAGS; NR_FLAGS += 1
IS_TITLE = NR_FLAGS; NR_FLAGS += 1
IS_LOWER = NR_FLAGS; NR_FLAGS += 1
IS_UPPER = NR_FLAGS; NR_FLAGS += 1

CAN_PUNCT = NR_FLAGS; NR_FLAGS += 1
CAN_CONJ = NR_FLAGS; NR_FLAGS += 1
CAN_NUM = NR_FLAGS; NR_FLAGS += 1
CAN_DET = NR_FLAGS; NR_FLAGS += 1
CAN_ADP = NR_FLAGS; NR_FLAGS += 1
CAN_ADJ = NR_FLAGS; NR_FLAGS += 1
CAN_ADV = NR_FLAGS; NR_FLAGS += 1
CAN_VERB = NR_FLAGS; NR_FLAGS += 1
CAN_NOUN = NR_FLAGS; NR_FLAGS += 1
CAN_PDT = NR_FLAGS; NR_FLAGS += 1
CAN_POS = NR_FLAGS; NR_FLAGS += 1
CAN_PRON = NR_FLAGS; NR_FLAGS += 1
CAN_PRT = NR_FLAGS; NR_FLAGS += 1


# List of contractions adapted from Robert MacIntyre's tokenizer.
CONTRACTIONS2 = [re.compile(r"(?i)\b(can)(not)\b"),
                 re.compile(r"(?i)\b(d)('ye)\b"),
                 re.compile(r"(?i)\b(gim)(me)\b"),
                 re.compile(r"(?i)\b(gon)(na)\b"),
                 re.compile(r"(?i)\b(got)(ta)\b"),
                 re.compile(r"(?i)\b(lem)(me)\b"),
                 re.compile(r"(?i)\b(mor)('n)\b"),
                 re.compile(r"(?i)\b(wan)(na) ")]

CONTRACTIONS3 = [re.compile(r"(?i) ('t)(is)\b"),
                 re.compile(r"(?i) ('t)(was)\b")]

CONTRACTIONS4 = [re.compile(r"(?i)\b(whad)(dd)(ya)\b"),
                 re.compile(r"(?i)\b(wha)(t)(cha)\b")]

def nltk_regex_tokenize(text):
    # Implementation taken from NLTK 3.0, based on tokenizer.sed

    # starting quotes
    text = re.sub(r'^\"', r'``', text)
    text = re.sub(r'(``)', r' \1 ', text)
    text = re.sub(r'([ (\[{<])"', r'\1 `` ', text)

    # punctuation
    text = re.sub(r'([:,])([^\d])', r' \1 \2', text)
    text = re.sub(r'\.\.\.', r' ... ', text)
    text = re.sub(r'[;@#$%&]', r' \g<0> ', text)
    text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1 \2\3 ', text)
    text = re.sub(r'[?!]', r' \g<0> ', text)

    text = re.sub(r"([^'])' ", r"\1 ' ", text)

    # parens, brackets, etc.
    text = re.sub(r'[\]\[\(\)\{\}\<\>]', r' \g<0> ', text)
    text = re.sub(r'--', r' -- ', text)

    # add extra space to make things easier
    text = " " + text + " "

    # ending quotes
    text = re.sub(r'"', " '' ", text)
    text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text)

    text = re.sub(r"([^' ])('[sS]|'[mM]|'[dD]|') ", r"\1 \2 ", text)
    text = re.sub(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 ",
                  text)

    for regexp in CONTRACTIONS2:
        text = regexp.sub(r' \1 \2 ', text)
    for regexp in CONTRACTIONS3:
        text = regexp.sub(r' \1 \2 ', text)

    # We are not using CONTRACTIONS4 since
    # they are also commented out in the SED scripts
    # for regexp in self.CONTRACTIONS4:
    #     text = regexp.sub(r' \1 \2 \3 ', text)

    return text.split()


cdef class PennTreebank3(Language):
    """Fully PTB compatible English tokenizer, tightly coupled to lexicon.

    Attributes:
        name (unicode): The two letter code used by Wikipedia for the language.
        lexicon (Lexicon): The lexicon. Exposes the lookup method.
    """

    def __cinit__(self, name):
        flag_funcs = [0 for _ in range(NR_FLAGS)]

        flag_funcs[OFT_UPPER] = orth.oft_case('upper', UPPER_THRESH)
        flag_funcs[OFT_LOWER] = orth.oft_case('lower', LOWER_THRESH)
        flag_funcs[OFT_TITLE] = orth.oft_case('title', TITLE_THRESH)

        flag_funcs[IS_ALPHA] = orth.is_alpha
        flag_funcs[IS_DIGIT] = orth.is_digit
        flag_funcs[IS_PUNCT] = orth.is_punct
        flag_funcs[IS_SPACE] = orth.is_space
        flag_funcs[IS_TITLE] = orth.is_title
        flag_funcs[IS_LOWER] = orth.is_lower
        flag_funcs[IS_UPPER] = orth.is_upper

        flag_funcs[CAN_PUNCT] = orth.can_tag('PUNCT', TAG_THRESH)
        flag_funcs[CAN_CONJ] = orth.can_tag('CONJ', TAG_THRESH)
        flag_funcs[CAN_NUM] = orth.can_tag('NUM', TAG_THRESH)
        flag_funcs[CAN_DET] = orth.can_tag('DET', TAG_THRESH)
        flag_funcs[CAN_ADP] = orth.can_tag('ADP', TAG_THRESH)
        flag_funcs[CAN_ADJ] = orth.can_tag('ADJ', TAG_THRESH)
        flag_funcs[CAN_VERB] = orth.can_tag('VERB', TAG_THRESH)
        flag_funcs[CAN_NOUN] = orth.can_tag('NOUN', TAG_THRESH)
        flag_funcs[CAN_PDT] = orth.can_tag('PDT', TAG_THRESH)
        flag_funcs[CAN_POS] = orth.can_tag('POS', TAG_THRESH)
        flag_funcs[CAN_PRT] = orth.can_tag('PRT', TAG_THRESH)

        Language.__init__(self, name, flag_funcs)

    cdef list _split(self, unicode chunk):
        strings = nltk_regex_tokenize(chunk)
        if strings[-1] == '.':
            strings.pop()
            strings[-1] += '.'
        assert strings
        return strings


PTB3 = PennTreebank3('ptb3')
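
The deleted ptb3.pyx above follows the PTB sed/NLTK approach: a cascade of regular-expression substitutions, including the contraction patterns, followed by a whitespace split. A standalone sketch of just the contraction-splitting step (a two-pattern subset, not the full tokenizer):

import re

CONTRACTIONS2 = [re.compile(r"(?i)\b(can)(not)\b"),
                 re.compile(r"(?i)\b(gon)(na)\b")]

def split_contractions(text):
    # Insert spaces around the captured groups, then split on whitespace.
    for regexp in CONTRACTIONS2:
        text = regexp.sub(r' \1 \2 ', text)
    return text.split()

assert split_contractions("I cannot go") == ['I', 'can', 'not', 'go']
assert split_contractions("We're gonna win") == ["We're", 'gon', 'na', 'win']
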

@@ -1,59 +1,49 @@
from cymem.cymem cimport Pool

from spacy.lexeme cimport LexemeC
from .lexeme cimport Lexeme
from .typedefs cimport flag_t
from .utf8string cimport StringStore

from thinc.typedefs cimport atom_t


cdef class Tokens:
    cdef Pool mem
    cdef StringStore _string_store

    cdef LexemeC** _lex_ptr
    cdef Lexeme** _lex_ptr
    cdef int* _idx_ptr
    cdef int* _pos_ptr
    cdef LexemeC** lex
    cdef Lexeme** lex
    cdef int* idx
    cdef int* pos

    cdef int length
    cdef int max_length

    cdef int extend(self, int i, LexemeC** lexemes, int n) except -1
    cdef int push_back(self, int i, LexemeC* lexeme) except -1
    cdef int extend(self, int i, Lexeme** lexemes, int n) except -1
    cdef int push_back(self, int i, Lexeme* lexeme) except -1

    cpdef int id(self, size_t i) except -1
    cpdef float prob(self, size_t i) except 1
    cpdef int cluster(self, size_t i) except *
    cpdef bint check_orth_flag(self, size_t i, size_t flag_id) except *
    cpdef bint check_dist_flag(self, size_t i, size_t flag_id) except *
    cpdef unicode string_view(self, size_t i, size_t view_id)

    cpdef unicode string(self, size_t i)
    cpdef unicode orig(self, size_t i)
    cpdef unicode norm(self, size_t i)
    cpdef unicode shape(self, size_t i)
    cpdef unicode unsparse(self, size_t i)
    cpdef unicode asciied(self, size_t i)
    cpdef bint is_alpha(self, size_t i) except *
    cpdef bint is_ascii(self, size_t i) except *
    cpdef bint is_digit(self, size_t i) except *
    cpdef bint is_lower(self, size_t i) except *
    cpdef bint is_punct(self, size_t i) except *
    cpdef bint is_space(self, size_t i) except *
    cpdef bint is_title(self, size_t i) except *
    cpdef bint is_upper(self, size_t i) except *
    cpdef bint can_adj(self, size_t i) except *
    cpdef bint can_adp(self, size_t i) except *
    cpdef bint can_adv(self, size_t i) except *
    cpdef bint can_conj(self, size_t i) except *
    cpdef bint can_det(self, size_t i) except *
    cpdef bint can_noun(self, size_t i) except *
    cpdef bint can_num(self, size_t i) except *
    cpdef bint can_pdt(self, size_t i) except *
    cpdef bint can_pos(self, size_t i) except *
    cpdef bint can_pron(self, size_t i) except *
    cpdef bint can_prt(self, size_t i) except *
    cpdef bint can_punct(self, size_t i) except *
    cpdef bint can_verb(self, size_t i) except *
    cpdef bint oft_lower(self, size_t i) except *
    cpdef bint oft_title(self, size_t i) except *
    cpdef bint oft_upper(self, size_t i) except *
cdef class Token:
    cdef StringStore _string_store
    cdef public int i
    cdef public int idx
    cdef public int pos

    cdef public atom_t id
    cdef public atom_t cluster
    cdef public atom_t length
    cdef public atom_t lex_pos
    cdef public atom_t lex_supersense

    cdef public atom_t norm
    cdef public atom_t shape
    cdef public atom_t vocab10k
    cdef public atom_t asciied
    cdef public atom_t prefix
    cdef public atom_t suffix

    cdef public float prob

    cdef public flag_t flags

spacy/tokens.pyx (209 lines changed)

@@ -1,10 +1,6 @@
# cython: profile=True
from .word cimport Lexeme

from .lexeme cimport *
cimport numpy
cimport cython
import numpy

DEF PADDING = 5


@@ -34,7 +30,8 @@ cdef class Tokens:
    >>> tokens.can_noun(1)
    True
    """
    def __init__(self, string_length=0):
    def __init__(self, StringStore string_store, string_length=0):
        self._string_store = string_store
        if string_length >= 3:
            size = int(string_length / 3.0)
        else:

@@ -43,7 +40,7 @@ cdef class Tokens:
        # Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
        # However, we need to remember the true starting places, so that we can
        # realloc.
        self._lex_ptr = <LexemeC**>self.mem.alloc(size + (PADDING*2), sizeof(LexemeC*))
        self._lex_ptr = <Lexeme**>self.mem.alloc(size + (PADDING*2), sizeof(Lexeme*))
        self._idx_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
        self._pos_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
        self.lex = self._lex_ptr

@@ -55,39 +52,26 @@ cdef class Tokens:
        self.lex += PADDING
        self.idx += PADDING
        self.pos += PADDING

        self.max_length = size
        self.length = 0

    def __getitem__(self, i):
        bounds_check(i, self.length, PADDING)
        return Lexeme(<size_t>self.lex[i])
        return Token(self._string_store, i, self.idx[i], self.pos[i], self.lex[i][0])

    def __len__(self):
        return self.length

    cdef int push_back(self, int idx, LexemeC* lexeme) except -1:
    cdef int push_back(self, int idx, Lexeme* lexeme) except -1:
        if self.length == self.max_length:
            self._realloc(self.length * 2)
        self.lex[self.length] = lexeme
        self.idx[self.length] = idx
        self.pos[self.length] = 0
        self.length += 1
        return idx + lexeme.ints[<int>LexInt_length]
        return idx + lexeme.length

    def _realloc(self, new_size):
        self.max_length = new_size
        n = new_size + (PADDING * 2)
        self._lex_ptr = <LexemeC**>self.mem.realloc(self._lex_ptr, n * sizeof(LexemeC*))
        self._idx_ptr = <int*>self.mem.realloc(self._idx_ptr, n * sizeof(int))
        self._pos_ptr = <int*>self.mem.realloc(self._pos_ptr, n * sizeof(int))
        self.lex = self._lex_ptr + PADDING
        self.idx = self._idx_ptr + PADDING
        self.pos = self._pos_ptr + PADDING
        for i in range(self.length, self.max_length + PADDING):
            self.lex[i] = &EMPTY_LEXEME

    cdef int extend(self, int idx, LexemeC** lexemes, int n) except -1:
    cdef int extend(self, int idx, Lexeme** lexemes, int n) except -1:
        cdef int i
        if lexemes == NULL:
            return idx

@@ -101,154 +85,43 @@ cdef class Tokens:
            idx = self.push_back(idx, lexemes[i])
        return idx

    cpdef int id(self, size_t i) except -1:
        bounds_check(i, self.length, PADDING)
        return self.lex[i].ints[<int>LexInt_id]
    def _realloc(self, new_size):
        self.max_length = new_size
        n = new_size + (PADDING * 2)
        self._lex_ptr = <Lexeme**>self.mem.realloc(self._lex_ptr, n * sizeof(Lexeme*))
        self._idx_ptr = <int*>self.mem.realloc(self._idx_ptr, n * sizeof(int))
        self._pos_ptr = <int*>self.mem.realloc(self._pos_ptr, n * sizeof(int))
        self.lex = self._lex_ptr + PADDING
        self.idx = self._idx_ptr + PADDING
        self.pos = self._pos_ptr + PADDING
        for i in range(self.length, self.max_length + PADDING):
            self.lex[i] = &EMPTY_LEXEME

    cpdef float prob(self, size_t i) except 1:
        bounds_check(i, self.length, PADDING)
        return self.lex[i].floats[<int>LexFloat_prob]

    cpdef int cluster(self, size_t i) except *:
        bounds_check(i, self.length, PADDING)
        return self.lex[i].ints[<int>LexInt_cluster]
@cython.freelist(64)
cdef class Token:
    def __init__(self, StringStore string_store, int i, int idx, int pos, dict lex):
        self._string_store = string_store
        self.i = i
        self.idx = idx
        self.pos = pos

    cpdef bint check_orth_flag(self, size_t i, size_t flag_id) except *:
        bounds_check(i, self.length, PADDING)
        return lexeme_check_orth_flag(self.lex[i], flag_id)
        self.id = lex['id']
        self.cluster = lex['cluster']
        self.length = lex['length']
        self.lex_pos = lex['pos']
        self.lex_supersense = lex['supersense']
        self.norm = lex['norm']
        self.shape = lex['shape']
        self.vocab10k = lex['vocab10k']
        self.suffix = lex['asciied']
        self.prefix = lex['prefix']

    cpdef bint check_dist_flag(self, size_t i, size_t flag_id) except *:
        bounds_check(i, self.length, PADDING)
        return lexeme_check_dist_flag(self.lex[i], flag_id)
        self.prob = lex['prob']
        self.flags = lex['flags']

    cpdef unicode string_view(self, size_t i, size_t view_id):
        bounds_check(i, self.length, PADDING)
        return lexeme_get_string(self.lex[i], view_id)
    property string:
        def __get__(self):
            cdef bytes utf8string = self._string_store[self.id]
            return utf8string.decode('utf8')

    # Provide accessor methods for the features supported by the language.
    # Without these, clients have to use the underlying string_view and check_flag
    # methods, which requires them to know the IDs.

    cpdef unicode string(self, size_t i):
        bounds_check(i, self.length, PADDING)
        return self.orig(i)

    cpdef unicode orig(self, size_t i):
        bounds_check(i, self.length, PADDING)
        cdef bytes utf8_string = self.lex[i].strings[<int>LexStr_orig]
        cdef unicode string = utf8_string.decode('utf8')
        return string

    cpdef unicode norm(self, size_t i):
        bounds_check(i, self.length, PADDING)
        cdef bytes utf8_string = self.lex[i].strings[<int>LexStr_norm]
        cdef unicode string = utf8_string.decode('utf8')
        return string

    cpdef unicode shape(self, size_t i):
        bounds_check(i, self.length, PADDING)
        return lexeme_get_string(self.lex[i], LexStr_shape)

    cpdef unicode unsparse(self, size_t i):
        bounds_check(i, self.length, PADDING)
        return lexeme_get_string(self.lex[i], LexStr_unsparse)

    cpdef unicode asciied(self, size_t i):
        bounds_check(i, self.length, PADDING)
        return lexeme_get_string(self.lex[i], LexStr_asciied)

    cpdef bint is_alpha(self, size_t i) except *:
        bounds_check(i, self.length, PADDING)
        return lexeme_check_orth_flag(self.lex[i], LexOrth_alpha)

    cpdef bint is_ascii(self, size_t i) except *:
        bounds_check(i, self.length, PADDING)
        return lexeme_check_orth_flag(self.lex[i], LexOrth_ascii)

    cpdef bint is_digit(self, size_t i) except *:
        bounds_check(i, self.length, PADDING)
        return lexeme_check_orth_flag(self.lex[i], LexOrth_digit)

    cpdef bint is_lower(self, size_t i) except *:
        bounds_check(i, self.length, PADDING)
        return lexeme_check_orth_flag(self.lex[i], LexOrth_lower)

    cpdef bint is_punct(self, size_t i) except *:
        bounds_check(i, self.length, PADDING)
        return lexeme_check_orth_flag(self.lex[i], LexOrth_punct)

    cpdef bint is_space(self, size_t i) except *:
        bounds_check(i, self.length, PADDING)
        return lexeme_check_orth_flag(self.lex[i], LexOrth_space)

    cpdef bint is_title(self, size_t i) except *:
        bounds_check(i, self.length, PADDING)
        return lexeme_check_orth_flag(self.lex[i], LexOrth_title)

    cpdef bint is_upper(self, size_t i) except *:
        bounds_check(i, self.length, PADDING)
        return lexeme_check_orth_flag(self.lex[i], LexOrth_upper)

    cpdef bint can_adj(self, size_t i) except *:
        bounds_check(i, self.length, PADDING)
        return lexeme_check_dist_flag(self.lex[i], LexDist_adj)

    cpdef bint can_adp(self, size_t i) except *:
        bounds_check(i, self.length, PADDING)
        return lexeme_check_dist_flag(self.lex[i], LexDist_adp)

    cpdef bint can_adv(self, size_t i) except *:
        bounds_check(i, self.length, PADDING)
        return lexeme_check_dist_flag(self.lex[i], LexDist_adv)

    cpdef bint can_conj(self, size_t i) except *:
        bounds_check(i, self.length, PADDING)
        return lexeme_check_dist_flag(self.lex[i], LexDist_conj)

    cpdef bint can_det(self, size_t i) except *:
        bounds_check(i, self.length, PADDING)
        return lexeme_check_dist_flag(self.lex[i], LexDist_det)

    cpdef bint can_noun(self, size_t i) except *:
        bounds_check(i, self.length, PADDING)
        return lexeme_check_dist_flag(self.lex[i], LexDist_noun)

    cpdef bint can_num(self, size_t i) except *:
        bounds_check(i, self.length, PADDING)
        return lexeme_check_dist_flag(self.lex[i], LexDist_num)

    cpdef bint can_pdt(self, size_t i) except *:
        bounds_check(i, self.length, PADDING)
        return lexeme_check_dist_flag(self.lex[i], LexDist_pdt)

    cpdef bint can_pos(self, size_t i) except *:
        bounds_check(i, self.length, PADDING)
        return lexeme_check_dist_flag(self.lex[i], LexDist_pos)

    cpdef bint can_pron(self, size_t i) except *:
        bounds_check(i, self.length, PADDING)
        return lexeme_check_dist_flag(self.lex[i], LexDist_pron)

    cpdef bint can_prt(self, size_t i) except *:
        bounds_check(i, self.length, PADDING)
        return lexeme_check_dist_flag(self.lex[i], LexDist_prt)

    cpdef bint can_punct(self, size_t i) except *:
        bounds_check(i, self.length, PADDING)
        return lexeme_check_dist_flag(self.lex[i], LexDist_punct)

    cpdef bint can_verb(self, size_t i) except *:
        bounds_check(i, self.length, PADDING)
        return lexeme_check_dist_flag(self.lex[i], LexDist_verb)

    cpdef bint oft_lower(self, size_t i) except *:
        bounds_check(i, self.length, PADDING)
        return lexeme_check_dist_flag(self.lex[i], LexDist_lower)

    cpdef bint oft_title(self, size_t i) except *:
        bounds_check(i, self.length, PADDING)
        return lexeme_check_dist_flag(self.lex[i], LexDist_title)

    cpdef bint oft_upper(self, size_t i) except *:
        bounds_check(i, self.length, PADDING)
        return lexeme_check_dist_flag(self.lex[i], LexDist_upper)
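
The Tokens container above keeps its data in parallel C arrays with PADDING dummy entries on both ends, so feature code can read lex[i - 2] or lex[i + 2] near the sentence boundaries without bounds checks. A small Python model of that layout (lists stand in for the malloc'd arrays; the names are illustrative):

PADDING = 5
EMPTY = {'length': 0}

class TokensSketch:
    def __init__(self, size):
        # Underlying storage keeps PADDING empty slots before and after the data.
        self._lex = [EMPTY] * (size + 2 * PADDING)
        self.length = 0
        self._size = size

    def push_back(self, lexeme):
        assert self.length < self._size
        self._lex[PADDING + self.length] = lexeme
        self.length += 1

    def lex(self, i):
        # i may range from -PADDING to length + PADDING - 1 and still be in bounds.
        return self._lex[PADDING + i]

toks = TokensSketch(4)
toks.push_back({'length': 5})
assert toks.lex(0)['length'] == 5
assert toks.lex(-2) is EMPTY and toks.lex(toks.length + 1) is EMPTY
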

@@ -1,12 +0,0 @@
from .typedefs cimport hash_t, utf8_t, flag_t, id_t
from spacy.lexeme cimport LexemeC

DEF MAX_FLAG = 64


cdef class Lexeme:
    cdef LexemeC* _c

    cpdef bint check_orth_flag(self, size_t flag_id) except *
    cpdef bint check_dist_flag(self, size_t flag_id) except *
    cpdef unicode string_view(self, size_t view_id)

@@ -1,80 +0,0 @@
# cython: profile=True
# cython: embedsignature=True

from .lexeme cimport lexeme_get_string
from .lexeme cimport lexeme_check_orth_flag, lexeme_check_dist_flag

from .lexeme cimport *


cdef class Lexeme:
    """A lexical type --- a word, punctuation symbol, whitespace sequence, etc
    keyed by a case-sensitive unicode string. All tokens with the same string,
    e.g. all instances of "dog", ",", "NASA" etc should be mapped to the same
    Lexeme.

    You should avoid instantiating Lexemes directly, and instead use the
    :py:meth:`spacy.lang.Language.tokenize` and :py:meth:`spacy.lang.Language.lookup`
    methods on the global object exposed by the language you're working with,
    e.g. :py:data:`spacy.en.EN`.

    Attributes:
        string (unicode):
            The unicode string.

            Implemented as a property; relatively expensive.

        length (size_t):
            The number of unicode code-points in the string.

        prob (double):
            An estimate of the word's unigram log probability.

            Probabilities are calculated from a large text corpus, and smoothed using
            simple Good-Turing. Estimates are read from data/en/probabilities, and
            can be replaced using spacy.en.load_probabilities.

        cluster (size_t):
            An integer representation of the word's Brown cluster.

            A Brown cluster is an address into a binary tree, which gives some (noisy)
            information about the word's distributional context.

            >>> strings = (u'pineapple', u'apple', u'dapple', u'scalable')
            >>> print ["{0:b}".format(lookup(s).cluster) for s in strings]
            ["100111110110", "100111100100", "01010111011001", "100111110110"]

            The clusterings are unideal, but often slightly useful.
            "pineapple" and "apple" share a long prefix, indicating a similar meaning,
            while "dapple" is totally different. On the other hand, "scalable" receives
            the same cluster ID as "pineapple", which is not what we'd like.
    """
    def __cinit__(self, size_t lexeme_addr):
        self._c = <LexemeC*>lexeme_addr

    property string:
        def __get__(self):
            cdef bytes utf8_string = self._c.strings[<int>LexStr_orig]
            cdef unicode string = utf8_string.decode('utf8')
            return string

    property prob:
        def __get__(self):
            return self._c.floats[<int>LexFloat_prob]

    property cluster:
        def __get__(self):
            return self._c.ints[<int>LexInt_cluster]

    property length:
        def __get__(self):
            return self._c.ints[<int>LexInt_length]

    cpdef bint check_orth_flag(self, size_t flag_id) except *:
        return lexeme_check_orth_flag(self._c, flag_id)

    cpdef bint check_dist_flag(self, size_t flag_id) except *:
        return lexeme_check_dist_flag(self._c, flag_id)

    cpdef unicode string_view(self, size_t view_id):
        return lexeme_get_string(self._c, view_id)
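
The Lexeme docstring above reads a Brown cluster as a path into a binary merge tree, so a shared bit-string prefix suggests distributional similarity. A small helper that compares cluster prefixes, using the example cluster strings quoted in the docstring:

def shared_prefix_len(a, b):
    n = 0
    for x, y in zip(a, b):
        if x != y:
            break
        n += 1
    return n

clusters = {
    u'pineapple': '100111110110',
    u'apple':     '100111100100',
    u'dapple':    '01010111011001',
}
assert shared_prefix_len(clusters[u'pineapple'], clusters[u'apple']) == 7
assert shared_prefix_len(clusters[u'pineapple'], clusters[u'dapple']) == 0
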
@ -5,8 +5,8 @@ from spacy.en import EN
|
|||
|
||||
def test_possess():
|
||||
tokens = EN.tokenize("Mike's")
|
||||
assert tokens[0].string == "Mike"
|
||||
assert tokens[1].string == "'s"
|
||||
assert EN.lexicon.strings[tokens[0].id] == "Mike"
|
||||
assert EN.lexicon.strings[tokens[1].id] == "'s"
|
||||
assert len(tokens) == 2
|
||||
|
||||
|
||||
|
|

@@ -8,19 +8,17 @@ from spacy.lexeme import *

def test_is_alpha():
    the = EN.lexicon.lookup('the')
    assert the.check_orth_flag(LexOrth_alpha)
    assert the['flags'] & (1 << IS_ALPHA)
    year = EN.lexicon.lookup('1999')
    assert not year.check_orth_flag(LexOrth_alpha)
    assert not year['flags'] & (1 << IS_ALPHA)
    mixed = EN.lexicon.lookup('hello1')
    assert not mixed.check_orth_flag(LexOrth_alpha)
    assert not mixed['flags'] & (1 << IS_ALPHA)


def test_is_digit():
    the = EN.lexicon.lookup('the')
    assert not the.check_orth_flag(LexOrth_digit)
    assert not the['flags'] & (1 << IS_DIGIT)
    year = EN.lexicon.lookup('1999')
    assert year.check_orth_flag(LexOrth_digit)
    assert year['flags'] & (1 << IS_DIGIT)
    mixed = EN.lexicon.lookup('hello1')
    assert not mixed.check_orth_flag(LexOrth_digit)
    assert not mixed['flags'] & (1 << IS_DIGIT)

@@ -1,27 +0,0 @@
from __future__ import unicode_literals

import pytest

import spacy.word
from spacy.en import EN
from spacy.lexeme import *


@pytest.fixture
def C3P0():
    return EN.lexicon.lookup("C3P0")


def test_shape(C3P0):
    assert C3P0.string_view(LexStr_shape) == "XdXd"


def test_length():
    t = EN.lexicon.lookup('the')
    assert t.length == 3
    t = EN.lexicon.lookup("n't")
    assert t.length == 3
    t = EN.lexicon.lookup("'s")
    assert t.length == 2
    t = EN.lexicon.lookup('Xxxx')
    assert t.length == 4

@@ -8,9 +8,9 @@ from spacy.en import EN

def test_one():
    tokens = EN.tokenize('Betty Botter bought a pound of butter.')
    assert tokens.string(0) == 'Betty'
    assert tokens[0].string == 'Betty'
    tokens2 = EN.tokenize('Betty also bought a pound of butter.')
    assert tokens2.string(0) == 'Betty'
    assert tokens2[0].string == 'Betty'

@@ -5,41 +5,39 @@ from spacy.en import EN


def test_single_word():
    lex_ids = EN.tokenize(u'hello')
    assert lex_ids[0].string == EN.lexicon.lookup(u'hello').string
    tokens = EN.tokenize(u'hello')
    assert tokens[0].string == 'hello'


def test_two_words():
    words = EN.tokenize('hello possums')
    assert len(words) == 2
    assert words[0].string == EN.lexicon.lookup('hello').string
    assert words[0].string != words[1].string
    tokens = EN.tokenize('hello possums')
    assert len(tokens) == 2
    assert tokens[0].string != tokens[1].string


def test_punct():
    tokens = EN.tokenize('hello, possums.')
    assert len(tokens) == 4
    assert tokens[0].string == EN.lexicon.lookup('hello').string
    assert tokens[1].string == EN.lexicon.lookup(',').string
    assert tokens[2].string == EN.lexicon.lookup('possums').string
    assert tokens[1].string != EN.lexicon.lookup('hello').string
    assert tokens[0].string == 'hello'
    assert tokens[1].string == ','
    assert tokens[2].string == 'possums'
    assert tokens[1].string != 'hello'


def test_digits():
    lex_ids = EN.tokenize('The year: 1984.')
    assert lex_ids.orig(3) == "1984"
    assert len(lex_ids) == 5
    assert lex_ids[0].string == EN.lexicon.lookup('The').string
    assert lex_ids[3].string == EN.lexicon.lookup('1984').string
    tokens = EN.tokenize('The year: 1984.')
    assert len(tokens) == 5
    assert tokens[0].id == EN.lexicon.lookup('The')['id']
    assert tokens[3].id == EN.lexicon.lookup('1984')['id']


def test_contraction():
    lex_ids = EN.tokenize("don't giggle")
    assert len(lex_ids) == 3
    assert lex_ids[1].string == EN.lexicon.lookup("not").string
    lex_ids = EN.tokenize("i said don't!")
    assert len(lex_ids) == 5
    assert lex_ids[4].string == EN.lexicon.lookup('!').string
    tokens = EN.tokenize("don't giggle")
    assert len(tokens) == 3
    assert tokens[1].id == EN.lexicon.lookup("not")['id']
    tokens = EN.tokenize("i said don't!")
    assert len(tokens) == 5
    assert tokens[4].id == EN.lexicon.lookup('!')['id']


def test_contraction_punct():

@@ -5,30 +5,19 @@ from spacy.en import EN

def test_neq():
    addr = EN.lexicon.lookup('Hello')
    assert EN.lexicon.lookup('bye').string != addr.string
    assert EN.lexicon.lookup('bye')['id'] != addr['id']


def test_eq():
    addr = EN.lexicon.lookup('Hello')
    assert EN.lexicon.lookup('Hello').string == addr.string


def test_round_trip():
    hello = EN.lexicon.lookup('Hello')
    assert hello.string == 'Hello'
    assert EN.lexicon.lookup('Hello')['id'] == addr['id']


def test_case_neq():
    addr = EN.lexicon.lookup('Hello')
    assert EN.lexicon.lookup('hello').string != addr.string
    assert EN.lexicon.lookup('hello')['id'] != addr['id']


def test_punct_neq():
    addr = EN.lexicon.lookup('Hello')
    assert EN.lexicon.lookup('Hello,').string != addr.string


def test_short():
    addr = EN.lexicon.lookup('I')
    assert addr.string == 'I'
    assert addr.string != 'not'
    assert EN.lexicon.lookup('Hello,')['id'] != addr['id']