mirror of https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00

Refactoring to use Tokens object

This commit is contained in:
parent b8c4549ffe
commit cf412adba8
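The core change: Language.tokenize no longer builds and returns a plain list of lexemes; it allocates an instance of self.token_class (Tokens, or a language-specific subclass such as EnglishTokens) and has _tokenize append into it. A minimal plain-Python sketch of the new control flow, using the names from the diff (both classes here are hypothetical stand-ins for the compiled Cython classes, with lookup stubbed out):

    class Tokens:
        """Plain-Python stand-in for the cdef class added in spacy/tokens.pyx."""
        def __init__(self):
            self.lexemes = []

        def append(self, lexeme):
            self.lexemes.append(lexeme)


    class Language:
        """Stub with just enough of spacy.lang.Language to show the new flow."""
        token_class = Tokens

        def __init__(self):
            self.cache = {}

        def _tokenize(self, tokens, span):
            # The real method looks up Lexemes (with caching); here we
            # just append the raw substring to keep the sketch runnable.
            tokens.append(span)

        def tokenize(self, string):
            # Mirrors the new Language.tokenize: no intermediate lists;
            # each non-space span is appended into one Tokens instance.
            assert string
            tokens = self.token_class()
            start, i = 0, 0
            for c in string:
                if c == ' ':
                    if start < i:
                        self._tokenize(tokens, string[start:i])
                    start = i + 1
                i += 1
            if start < i:
                self._tokenize(tokens, string[start:i])
            return tokens


    print(Language().tokenize(u'An example sentence.').lexemes)
    # ['An', 'example', 'sentence.']  (the stub does not split punctuation)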
spacy/en.pxd | 33

@@ -1,6 +1,37 @@
 from spacy.lang cimport Language
 from spacy.word cimport Lexeme
-cimport cython
+from spacy.tokens cimport Tokens
 
 
+cdef class EnglishTokens(Tokens):
+    cpdef unicode canon_string(self, size_t i)
+    cpdef unicode shape_string(self, size_t i)
+    cpdef unicode non_sparse_string(self, size_t i)
+    cpdef unicode asciied(self, size_t i)
+    cpdef bint is_alpha(self, size_t i)
+    cpdef bint is_ascii(self, size_t i)
+    cpdef bint is_digit(self, size_t i)
+    cpdef bint is_lower(self, size_t i)
+    cpdef bint is_punct(self, size_t i)
+    cpdef bint is_space(self, size_t i)
+    cpdef bint is_title(self, size_t i)
+    cpdef bint is_upper(self, size_t i)
+    cpdef bint can_adj(self, size_t i)
+    cpdef bint can_adp(self, size_t i)
+    cpdef bint can_adv(self, size_t i)
+    cpdef bint can_conj(self, size_t i)
+    cpdef bint can_det(self, size_t i)
+    cpdef bint can_noun(self, size_t i)
+    cpdef bint can_num(self, size_t i)
+    cpdef bint can_pdt(self, size_t i)
+    cpdef bint can_pos(self, size_t i)
+    cpdef bint can_pron(self, size_t i)
+    cpdef bint can_prt(self, size_t i)
+    cpdef bint can_punct(self, size_t i)
+    cpdef bint can_verb(self, size_t i)
+    cpdef bint oft_lower(self, size_t i)
+    cpdef bint oft_title(self, size_t i)
+    cpdef bint oft_upper(self, size_t i)
+
+
 cdef class English(Language):
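These declarations give each lexical feature a typed accessor on the token sequence itself. Illustrative usage, assuming the module-level EN instance that the tokens.pyx doctest below uses (a hypothetical session, not part of the diff):

    tokens = EN.tokenize(u'An example sentence.')
    shape = tokens.shape_string(0)   # string view backed by orth.word_shape
    noun_ok = tokens.can_noun(2)     # flag backed by orth.can_tag('NOUN')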
spacy/en.pyx | 233

@@ -46,44 +46,169 @@ from spacy import util
 from spacy import orth
 
-TAG_THRESH = 0.5
-UPPER_THRESH = 0.2
-LOWER_THRESH = 0.5
-TITLE_THRESH = 0.7
-
-NR_FLAGS = 0
-
-OFT_UPPER = NR_FLAGS; NR_FLAGS += 1
-OFT_LOWER = NR_FLAGS; NR_FLAGS += 1
-OFT_TITLE = NR_FLAGS; NR_FLAGS += 1
-
-IS_ALPHA = NR_FLAGS; NR_FLAGS += 1
-IS_DIGIT = NR_FLAGS; NR_FLAGS += 1
-IS_PUNCT = NR_FLAGS; NR_FLAGS += 1
-IS_SPACE = NR_FLAGS; NR_FLAGS += 1
-IS_ASCII = NR_FLAGS; NR_FLAGS += 1
-IS_TITLE = NR_FLAGS; NR_FLAGS += 1
-IS_LOWER = NR_FLAGS; NR_FLAGS += 1
-IS_UPPER = NR_FLAGS; NR_FLAGS += 1
-
-CAN_PUNCT = NR_FLAGS; NR_FLAGS += 1
-CAN_CONJ = NR_FLAGS; NR_FLAGS += 1
-CAN_NUM = NR_FLAGS; NR_FLAGS += 1
-CAN_DET = NR_FLAGS; NR_FLAGS += 1
-CAN_ADP = NR_FLAGS; NR_FLAGS += 1
-CAN_ADJ = NR_FLAGS; NR_FLAGS += 1
-CAN_ADV = NR_FLAGS; NR_FLAGS += 1
-CAN_VERB = NR_FLAGS; NR_FLAGS += 1
-CAN_NOUN = NR_FLAGS; NR_FLAGS += 1
-CAN_PDT = NR_FLAGS; NR_FLAGS += 1
-CAN_POS = NR_FLAGS; NR_FLAGS += 1
-CAN_PRON = NR_FLAGS; NR_FLAGS += 1
-CAN_PRT = NR_FLAGS; NR_FLAGS += 1
-
-NR_VIEWS = 0
-CANON_CASED = NR_VIEWS; NR_VIEWS += 1
-SHAPE = NR_VIEWS; NR_VIEWS += 1
-NON_SPARSE = NR_VIEWS; NR_VIEWS += 1
+cdef enum Flags:
+    Flag_IsAlpha
+    Flag_IsAscii
+    Flag_IsDigit
+    Flag_IsLower
+    Flag_IsPunct
+    Flag_IsSpace
+    Flag_IsTitle
+    Flag_IsUpper
+
+    Flag_CanAdj
+    Flag_CanAdp
+    Flag_CanAdv
+    Flag_CanConj
+    Flag_CanDet
+    Flag_CanNoun
+    Flag_CanNum
+    Flag_CanPdt
+    Flag_CanPos
+    Flag_CanPron
+    Flag_CanPrt
+    Flag_CanPunct
+    Flag_CanVerb
+
+    Flag_OftLower
+    Flag_OftTitle
+    Flag_OftUpper
+
+    Flag_N
+
+
+cdef enum Views:
+    View_CanonForm
+    View_WordShape
+    View_NonSparse
+    View_Asciied
+    View_N
+
+
+# Assign the flag and view functions by enum value.
+# This is verbose, but it ensures we don't get nasty order sensitivities.
+STRING_VIEW_FUNCS = [None] * View_N
+STRING_VIEW_FUNCS[View_CanonForm] = orth.canon_case
+STRING_VIEW_FUNCS[View_WordShape] = orth.word_shape
+STRING_VIEW_FUNCS[View_NonSparse] = orth.non_sparse
+STRING_VIEW_FUNCS[View_Asciied] = orth.asciied
+
+FLAG_FUNCS = [None] * Flag_N
+FLAG_FUNCS[Flag_IsAlpha] = orth.is_alpha
+FLAG_FUNCS[Flag_IsAscii] = orth.is_ascii
+FLAG_FUNCS[Flag_IsDigit] = orth.is_digit
+FLAG_FUNCS[Flag_IsLower] = orth.is_lower
+FLAG_FUNCS[Flag_IsPunct] = orth.is_punct
+FLAG_FUNCS[Flag_IsSpace] = orth.is_space
+FLAG_FUNCS[Flag_IsTitle] = orth.is_title
+FLAG_FUNCS[Flag_IsUpper] = orth.is_upper
+
+FLAG_FUNCS[Flag_CanAdj] = orth.can_tag('ADJ')
+FLAG_FUNCS[Flag_CanAdp] = orth.can_tag('ADP')
+FLAG_FUNCS[Flag_CanAdv] = orth.can_tag('ADV')
+FLAG_FUNCS[Flag_CanConj] = orth.can_tag('CONJ')
+FLAG_FUNCS[Flag_CanDet] = orth.can_tag('DET')
+FLAG_FUNCS[Flag_CanNoun] = orth.can_tag('NOUN')
+FLAG_FUNCS[Flag_CanNum] = orth.can_tag('NUM')
+FLAG_FUNCS[Flag_CanPdt] = orth.can_tag('PDT')
+FLAG_FUNCS[Flag_CanPos] = orth.can_tag('POS')
+FLAG_FUNCS[Flag_CanPron] = orth.can_tag('PRON')
+FLAG_FUNCS[Flag_CanPrt] = orth.can_tag('PRT')
+FLAG_FUNCS[Flag_CanPunct] = orth.can_tag('PUNCT')
+FLAG_FUNCS[Flag_CanVerb] = orth.can_tag('VERB')
+
+FLAG_FUNCS[Flag_OftLower] = orth.oft_case('lower', 0.7)
+FLAG_FUNCS[Flag_OftTitle] = orth.oft_case('title', 0.7)
+FLAG_FUNCS[Flag_OftUpper] = orth.oft_case('upper', 0.7)
+
+
+cdef class EnglishTokens(Tokens):
+    # Provide accessor methods for the features supported by the language.
+    # Without these, clients have to use the underlying string_view and check_flag
+    # methods, which requires them to know the IDs.
+    cpdef unicode canon_string(self, size_t i):
+        return self.lexemes[i].string_view(View_CanonForm)
+
+    cpdef unicode shape_string(self, size_t i):
+        return self.lexemes[i].string_view(View_WordShape)
+
+    cpdef unicode non_sparse_string(self, size_t i):
+        return self.lexemes[i].string_view(View_NonSparse)
+
+    cpdef unicode asciied(self, size_t i):
+        return self.lexemes[i].string_view(View_Asciied)
+
+    cpdef bint is_alpha(self, size_t i):
+        return self.lexemes[i].check_flag(Flag_IsAlpha)
+
+    cpdef bint is_ascii(self, size_t i):
+        return self.lexemes[i].check_flag(Flag_IsAscii)
+
+    cpdef bint is_digit(self, size_t i):
+        return self.lexemes[i].check_flag(Flag_IsDigit)
+
+    cpdef bint is_lower(self, size_t i):
+        return self.lexemes[i].check_flag(Flag_IsLower)
+
+    cpdef bint is_punct(self, size_t i):
+        return self.lexemes[i].check_flag(Flag_IsPunct)
+
+    cpdef bint is_space(self, size_t i):
+        return self.lexemes[i].check_flag(Flag_IsSpace)
+
+    cpdef bint is_title(self, size_t i):
+        return self.lexemes[i].check_flag(Flag_IsTitle)
+
+    cpdef bint is_upper(self, size_t i):
+        return self.lexemes[i].check_flag(Flag_IsUpper)
+
+    cpdef bint can_adj(self, size_t i):
+        return self.lexemes[i].check_flag(Flag_CanAdj)
+
+    cpdef bint can_adp(self, size_t i):
+        return self.lexemes[i].check_flag(Flag_CanAdp)
+
+    cpdef bint can_adv(self, size_t i):
+        return self.lexemes[i].check_flag(Flag_CanAdv)
+
+    cpdef bint can_conj(self, size_t i):
+        return self.lexemes[i].check_flag(Flag_CanConj)
+
+    cpdef bint can_det(self, size_t i):
+        return self.lexemes[i].check_flag(Flag_CanDet)
+
+    cpdef bint can_noun(self, size_t i):
+        return self.lexemes[i].check_flag(Flag_CanNoun)
+
+    cpdef bint can_num(self, size_t i):
+        return self.lexemes[i].check_flag(Flag_CanNum)
+
+    cpdef bint can_pdt(self, size_t i):
+        return self.lexemes[i].check_flag(Flag_CanPdt)
+
+    cpdef bint can_pos(self, size_t i):
+        return self.lexemes[i].check_flag(Flag_CanPos)
+
+    cpdef bint can_pron(self, size_t i):
+        return self.lexemes[i].check_flag(Flag_CanPron)
+
+    cpdef bint can_prt(self, size_t i):
+        return self.lexemes[i].check_flag(Flag_CanPrt)
+
+    cpdef bint can_punct(self, size_t i):
+        return self.lexemes[i].check_flag(Flag_CanPunct)
+
+    cpdef bint can_verb(self, size_t i):
+        return self.lexemes[i].check_flag(Flag_CanVerb)
+
+    cpdef bint oft_lower(self, size_t i):
+        return self.lexemes[i].check_flag(Flag_OftLower)
+
+    cpdef bint oft_title(self, size_t i):
+        return self.lexemes[i].check_flag(Flag_OftTitle)
+
+    cpdef bint oft_upper(self, size_t i):
+        return self.lexemes[i].check_flag(Flag_OftUpper)
+
+
 cdef class English(Language):
@@ -93,48 +218,15 @@ cdef class English(Language):
     name (unicode): The two letter code used by Wikipedia for the language.
     lexicon (Lexicon): The lexicon. Exposes the lookup method.
     """
-    def __cinit__(self, name, string_features, flag_features):
-        flag_funcs = [None for _ in range(NR_FLAGS)]
-
-        flag_funcs[OFT_UPPER] = orth.oft_case('upper', UPPER_THRESH)
-        flag_funcs[OFT_LOWER] = orth.oft_case('lower', LOWER_THRESH)
-        flag_funcs[OFT_TITLE] = orth.oft_case('title', TITLE_THRESH)
-
-        flag_funcs[IS_ALPHA] = orth.is_alpha
-        flag_funcs[IS_DIGIT] = orth.is_digit
-        flag_funcs[IS_PUNCT] = orth.is_punct
-        flag_funcs[IS_SPACE] = orth.is_space
-        flag_funcs[IS_ASCII] = orth.is_ascii
-        flag_funcs[IS_TITLE] = orth.is_title
-        flag_funcs[IS_LOWER] = orth.is_lower
-        flag_funcs[IS_UPPER] = orth.is_upper
-
-        flag_funcs[CAN_PUNCT] = orth.can_tag('PUNCT', TAG_THRESH)
-        flag_funcs[CAN_CONJ] = orth.can_tag('CONJ', TAG_THRESH)
-        flag_funcs[CAN_NUM] = orth.can_tag('NUM', TAG_THRESH)
-        flag_funcs[CAN_DET] = orth.can_tag('DET', TAG_THRESH)
-        flag_funcs[CAN_ADP] = orth.can_tag('ADP', TAG_THRESH)
-        flag_funcs[CAN_ADJ] = orth.can_tag('ADJ', TAG_THRESH)
-        flag_funcs[CAN_ADV] = orth.can_tag('ADV', TAG_THRESH)
-        flag_funcs[CAN_VERB] = orth.can_tag('VERB', TAG_THRESH)
-        flag_funcs[CAN_NOUN] = orth.can_tag('NOUN', TAG_THRESH)
-        flag_funcs[CAN_PDT] = orth.can_tag('PDT', TAG_THRESH)
-        flag_funcs[CAN_POS] = orth.can_tag('POS', TAG_THRESH)
-        flag_funcs[CAN_PRON] = orth.can_tag('PRON', TAG_THRESH)
-        flag_funcs[CAN_PRT] = orth.can_tag('PRT', TAG_THRESH)
-
-        string_funcs = [None for _ in range(NR_VIEWS)]
-        string_funcs[CANON_CASED] = orth.canon_case
-        string_funcs[SHAPE] = orth.word_shape
-        string_funcs[NON_SPARSE] = orth.non_sparse
-        self.name = name
+    def __cinit__(self, name, user_string_features, user_flag_features):
         self.cache = {}
         lang_data = util.read_lang_data(name)
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = lang.Lexicon(words, probs, clusters, case_stats, tag_stats,
-                                    string_funcs, flag_funcs)
+                                    STRING_VIEW_FUNCS + user_string_features,
+                                    FLAG_FUNCS + user_flag_features)
         self._load_special_tokenization(rules)
+        self.token_class = EnglishTokens
 
     cdef int _split_one(self, unicode word):
         cdef size_t length = len(word)
@@ -154,6 +246,7 @@ cdef class English(Language):
             i += 1
         return i
 
+
 cdef bint _check_punct(unicode word, size_t i, size_t length):
     # Don't count apostrophes as punct if the next char is a letter
     if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
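The cdef enums plus [None] * Flag_N tables replace the old NR_FLAGS counter idiom: each feature gets a fixed compile-time slot, and the trailing _N member sizes the table. A runnable plain-Python sketch of the same pattern, with IntEnum standing in for the cdef enum (names here are illustrative, not from the diff):

    from enum import IntEnum

    class Flags(IntEnum):
        IsAlpha = 0
        IsDigit = 1
        N = 2          # total number of flags, like Flag_N above

    FLAG_FUNCS = [None] * Flags.N
    FLAG_FUNCS[Flags.IsAlpha] = str.isalpha
    FLAG_FUNCS[Flags.IsDigit] = str.isdigit

    # Each feature lands in a fixed slot regardless of assignment order,
    # which is the "no nasty order sensitivities" property the comment names.
    assert FLAG_FUNCS[Flags.IsDigit]('42')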
spacy/lang.pxd

@@ -1,9 +1,21 @@
 from libc.stdint cimport uint32_t
 from libc.stdint cimport uint64_t
 from spacy.word cimport Lexeme
+from spacy.tokens cimport Tokens
+
+
+cdef struct Flags:
+    size_t is_alpha
+    size_t can_noun
+
+
+cdef struct ViewIDs:
+    size_t canon_form
 
 
 cdef class Lexicon:
+    cpdef readonly size_t size
+
     cpdef Lexeme lookup(self, unicode string)
 
     cdef dict _dict
@@ -16,10 +28,11 @@ cdef class Language:
     cdef unicode name
     cdef dict cache
     cpdef readonly Lexicon lexicon
+    cpdef readonly object token_class
 
     cpdef list tokenize(self, unicode text)
     cpdef Lexeme lookup(self, unicode text)
 
-    cdef list _tokenize(self, unicode string)
+    cdef _tokenize(self, Tokens tokens, unicode string)
     cdef list _split(self, unicode string)
     cdef int _split_one(self, unicode word)
spacy/lang.pyx

@@ -14,6 +14,7 @@ import json
 from os import path
 
 from .util import read_lang_data
+from spacy.tokens import Tokens
 
 
 cdef class Language:
@@ -42,6 +43,23 @@ cdef class Language:
         self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
                                string_features, flag_features)
         self._load_special_tokenization(rules)
+        self.token_class = Tokens
+
+    property nr_types:
+        def __get__(self):
+            """Return the number of lexical types in the vocabulary"""
+            return self.lexicon.size
+
+    cpdef Lexeme lookup(self, unicode string):
+        """Retrieve (or create, if not found) a Lexeme for a string, and return it.
+
+        Args:
+            string (unicode): The string to be looked up. Must be unicode, not bytes.
+
+        Returns:
+            lexeme (Lexeme): A reference to a lexical type.
+        """
+        return self.lexicon.lookup(string)
 
     cpdef list tokenize(self, unicode string):
         """Tokenize a string.
@@ -58,43 +76,39 @@ cdef class Language:
         Returns:
             tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.
         """
-        if not string:
-            return []
-        cdef list tokens = []
+        assert string
         cdef size_t length = len(string)
         cdef size_t start = 0
         cdef size_t i = 0
+        cdef Tokens tokens = self.token_class()
         for c in string:
             if c == ' ':
                 if start < i:
-                    tokens.extend(self._tokenize(string[start:i]))
+                    self._tokenize(tokens, string[start:i])
                 start = i + 1
             i += 1
         if start < i:
-            tokens.extend(self._tokenize(string[start:]))
+            self._tokenize(tokens, string[start:i])
         assert tokens
         return tokens
 
-    cpdef Lexeme lookup(self, unicode string):
-        """Retrieve (or create, if not found) a Lexeme for a string, and return it.
-
-        Args:
-            string (unicode): The string to be looked up. Must be unicode, not bytes.
-
-        Returns:
-            lexeme (Lexeme): A reference to a lexical type.
-        """
-        return self.lexicon.lookup(string)
-
-    cdef list _tokenize(self, unicode string):
-        if string in self.cache:
-            return self.cache[string]
-        cdef list lexemes = []
-        substrings = self._split(string)
-        for i, substring in enumerate(substrings):
-            lexemes.append(self.lexicon.lookup(substring))
-        self.cache[string] = lexemes
-        return lexemes
+    cdef _tokenize(self, Tokens tokens, unicode string):
+        cdef list lexemes
+        if len(string) == 1:
+            lexemes = [self.lookup(string)]
+        elif string in self.cache:
+            lexemes = self.cache[string]
+        else:
+            lexemes = []
+            substrings = self._split(string)
+            for i, substring in enumerate(substrings):
+                lexemes.append(self.lexicon.lookup(substring))
+            self.cache[string] = lexemes
+
+        cdef Lexeme lexeme
+        for lexeme in lexemes:
+            tokens.append(lexeme)
 
     cdef list _split(self, unicode string):
         """Find how to split a contiguous span of non-space characters into substrings.
@@ -146,12 +160,14 @@ cdef class Lexicon:
         self._flag_features = flag_features
         self._string_features = string_features
         self._dict = {}
+        self.size = 0
         cdef Lexeme word
         for string in words:
             word = Lexeme(string, probs.get(string, 0.0), clusters.get(string, 0),
                           case_stats.get(string, {}), tag_stats.get(string, {}),
                           self._string_features, self._flag_features)
             self._dict[string] = word
+            self.size += 1
 
     cpdef Lexeme lookup(self, unicode string):
         """Retrieve (or create, if not found) a Lexeme for a string, and return it.
@@ -169,4 +185,5 @@ cdef class Lexicon:
         cdef Lexeme word = Lexeme(string, 0, 0, {}, {}, self._string_features,
                                   self._flag_features)
         self._dict[string] = word
+        self.size += 1
         return word
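Worth noting in the new _tokenize: it appends into the caller's Tokens rather than returning a list, caches the lexeme list per non-space span, and skips the cache for single characters. A runnable plain-Python sketch of just the caching logic (lookup and _split are stubs, and a plain list stands in for Tokens):

    class Tokenizer:
        def __init__(self):
            self.cache = {}

        def lookup(self, s):
            return ('LEX', s)            # stand-in for Lexicon.lookup

        def _split(self, s):
            return [s]                   # stand-in for Language._split

        def _tokenize(self, tokens, span):
            if len(span) == 1:
                lexemes = [self.lookup(span)]   # single chars skip the cache
            elif span in self.cache:
                lexemes = self.cache[span]      # repeated spans hit the cache
            else:
                lexemes = [self.lookup(sub) for sub in self._split(span)]
                self.cache[span] = lexemes
            tokens.extend(lexemes)

    t = Tokenizer()
    out = []
    t._tokenize(out, u'example')
    t._tokenize(out, u'example')        # second call served from the cache
    print(len(t.cache), len(out))       # 1 2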
spacy/orth.py

@@ -4,6 +4,10 @@ import unicodedata
 import math
 
 
+TAGS = 'adj adp adv conj det noun num pdt pos pron prt punct verb'.upper().split()
+
+
 # Binary string features
 def is_alpha(string, prob, case_stats, tag_stats):
     return string.isalpha()
@@ -107,7 +111,7 @@ def non_sparse(string, prob, cluster, case_stats, tag_stats):
     return word_shape(string, prob, cluster, case_stats, tag_stats)
 
 
-def asciify(string):
+def asciied(string):
    '''"ASCIIfy" a Unicode string by stripping all umlauts, tildes, etc.'''
    # Snippet from
    # http://www.physic.ut.ee/~kkannike/english/prog/python/util/asciify/index.html
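The rename asciify -> asciied matches the View_Asciied string view registered in en.pyx. The diff doesn't show the function body (it comes from the linked snippet); a standard-library equivalent of this kind of ASCIIfication, offered purely as an assumed illustration rather than the actual implementation:

    import unicodedata

    def asciied(string):
        # Decompose accented characters, then drop anything non-ASCII.
        # The real body uses the snippet linked above; treat this as an
        # illustrative equivalent, not the implementation in the commit.
        decomposed = unicodedata.normalize('NFKD', string)
        return decomposed.encode('ascii', 'ignore').decode('ascii')

    print(asciied(u'na\u00efve caf\u00e9'))  # naive cafe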
spacy/tokens.pxd | 9 (new file)

@@ -0,0 +1,9 @@
+cdef class Tokens:
+    cdef list lexemes
+    cpdef append(self, object lexeme)
+
+    cpdef unicode string(self, size_t i)
+    cpdef double prob(self, size_t i)
+    cpdef size_t cluster(self, size_t i)
+    cpdef bint check_flag(self, size_t i, size_t flag_id)
+    cpdef unicode string_view(self, size_t i, size_t view_id)
spacy/tokens.pyx | 39 (new file)

@@ -0,0 +1,39 @@
+cdef class Tokens:
+    """A sequence of references to Lexeme objects.
+
+    The Tokens class provides fast and memory-efficient access to lexical features,
+    and can efficiently export the data to a numpy array. Specific languages
+    create their own Tokens subclasses, to provide more convenient access to
+    language-specific features.
+
+    >>> from spacy.en import EN
+    >>> tokens = EN.tokenize('An example sentence.')
+    >>> tokens.string(0)
+    'An'
+    >>> tokens.prob(0) > tokens.prob(1)
+    True
+    >>> tokens.can_noun(0)
+    False
+    >>> tokens.can_noun(1)
+    True
+    """
+    def __cinit__(self):
+        self.lexemes = []
+
+    cpdef append(self, object lexeme):
+        self.lexemes.append(lexeme)
+
+    cpdef unicode string(self, size_t i):
+        return self.lexemes[i].string
+
+    cpdef double prob(self, size_t i):
+        return self.lexemes[i].prob
+
+    cpdef size_t cluster(self, size_t i):
+        return self.lexemes[i].cluster
+
+    cpdef bint check_flag(self, size_t i, size_t flag_id):
+        return self.lexemes[i].check_flag(flag_id)
+
+    cpdef unicode string_view(self, size_t i, size_t view_id):
+        return self.lexemes[i].string_view(view_id)
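check_flag and string_view are the generic, ID-based access paths; the named methods on EnglishTokens in en.pyx are thin wrappers over them. Side by side (illustrative only; assumes a compiled EnglishTokens instance named tokens and the Flags enum from spacy/en.pyx):

    tokens.is_alpha(0)                  # named accessor hides the feature ID
    tokens.check_flag(0, Flag_IsAlpha)  # generic path, caller must know the ID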