Mirror of https://github.com/explosion/spaCy.git
Synced 2024-12-26 01:46:28 +03:00

* Large refactor, particularly to Python API

This commit is contained in:
parent 168b2b8cb2
commit 08ce602243
spacy/de.pxd (42 lines changed)
@@ -1,42 +0,0 @@
-from spacy.spacy cimport Language
-from spacy.word cimport Lexeme
-cimport cython
-
-
-cpdef size_t ALPHA
-cpdef size_t DIGIT
-cpdef size_t PUNCT
-cpdef size_t SPACE
-cpdef size_t LOWER
-cpdef size_t UPPER
-cpdef size_t TITLE
-cpdef size_t ASCII
-
-cpdef size_t OFT_LOWER
-cpdef size_t OFT_TITLE
-cpdef size_t OFT_UPPER
-
-cpdef size_t PUNCT
-cpdef size_t CONJ
-cpdef size_t NUM
-cpdef size_t N
-cpdef size_t DET
-cpdef size_t ADP
-cpdef size_t ADJ
-cpdef size_t ADV
-cpdef size_t VERB
-cpdef size_t NOUN
-cpdef size_t PDT
-cpdef size_t POS
-cpdef size_t PRON
-cpdef size_t PRT
-
-cdef class English(spacy.Language):
-    cdef int find_split(self, unicode word)
-
-
-cdef English EN
-
-
-cpdef Word lookup(unicode word)
-cpdef list tokenize(unicode string)
spacy/de.pyx (126 lines changed)
@@ -1,126 +0,0 @@
-# cython: profile=True
-# cython: embedsignature=True
-'''Tokenize German text, using a scheme based on the Negra corpus.
-
-Tokenization is generally similar to English text, and the same set of orthographic
-flags are used.
-
-An abbreviation list is used to handle common abbreviations. Hyphenated words
-are not split, following the Treebank usage.
-'''
-from __future__ import unicode_literals
-
-from libc.stdint cimport uint64_t
-
-cimport spacy
-
-from spacy.orth import is_alpha, is_digit, is_punct, is_space, is_lower, is_ascii
-from spacy.orth import canonicalize_case, get_string_shape, asciify, get_non_sparse
-from spacy.common cimport check_punct
-
-# Python-readable flag constants --- can't read an enum from Python
-
-# Don't want to manually assign these numbers, or we'll insert one and have to
-# change them all.
-# Don't use "i", as we don't want it in the global scope!
-cdef size_t __i = 0
-
-ALPHA = __i; i += 1
-DIGIT = __i; __i += 1
-PUNCT = __i; __i += 1
-SPACE = __i; __i += 1
-LOWER = __i; __i += 1
-UPPER = __i; __i += 1
-TITLE = __i; __i += 1
-ASCII = __i; __i += 1
-
-OFT_LOWER = __i; __i += 1
-OFT_UPPER = __i; __i += 1
-OFT_TITLE = __i; __i += 1
-
-PUNCT = __i; __i += 1
-CONJ = __i; __i += 1
-NUM = __i; __i += 1
-X = __i; __i += 1
-DET = __i; __i += 1
-ADP = __i; __i += 1
-ADJ = __i; __i += 1
-ADV = __i; __i += 1
-VERB = __i; __i += 1
-NOUN = __i; __i += 1
-PDT = __i; __i += 1
-POS = __i; __i += 1
-PRON = __i; __i += 1
-PRT = __i; __i += 1
-
-
-# These are for the string views
-__i = 0
-SIC = __i; __i += 1
-CANON_CASED = __i; __i += 1
-NON_SPARSE = __i; __i += 1
-SHAPE = __i; __i += 1
-NR_STRING_VIEWS = __i
-
-
-def get_string_views(unicode string, lexeme):
-    views = ['' for _ in range(NR_STRING_VIEWS)]
-    views[SIC] = string
-    views[CANON_CASED] = canonicalize_case(string, lexeme)
-    views[SHAPE] = get_string_shape(string)
-    views[ASCIIFIED] = get_asciified(string)
-    views[FIXED_VOCAB] = get_non_sparse(string, views[ASCIIFIED], views[CANON_CASED],
-                                        views[SHAPE], lexeme)
-    return views
-
-
-def set_orth_flags(unicode string, flags_t flags)
-    setters = [
-        (ALPHA, is_alpha),
-        (DIGIT, is_digit),
-        (PUNCT, is_punct),
-        (SPACE, is_space),
-        (LOWER, is_lower),
-        (UPPER, is_upper),
-        (SPACE, is_space)
-    ]
-
-    for bit, setter in setters:
-        if setter(string):
-            flags |= 1 << bit
-    return flags
-
-
-cdef class German(spacy.Language):
-    cdef Lexeme new_lexeme(self, unicode string, cluster=0, case_stats=None,
-                           tag_freqs=None):
-        return Lexeme(s, length, views, prob=prob, cluster=cluster,
-                      flags=self.get_flags(string)
-
-    cdef int find_split(self, unicode word):
-        cdef size_t length = len(word)
-        cdef int i = 0
-        if word.startswith("'s") or word.startswith("'S"):
-            return 2
-        # Contractions
-        if word.endswith("'s") and length >= 3:
-            return length - 2
-        # Leading punctuation
-        if check_punct(word, 0, length):
-            return 1
-        elif length >= 1:
-            # Split off all trailing punctuation characters
-            i = 0
-            while i < length and not check_punct(word, i, length):
-                i += 1
-        return i
-
-
-DE = German('de')
-
-lookup = DE.lookup
-tokenize = DE.tokenize
-load_clusters = DE.load_clusters
-load_unigram_probs = DE.load_unigram_probs
-load_case_stats = DE.load_case_stats
-load_tag_stats = DE.load_tag_stats
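The deleted set_orth_flags above packs a set of boolean orthographic predicates into a single integer by giving each predicate its own bit position. A minimal Python sketch of the same pattern, using standard-library string methods as stand-ins for the spacy.orth predicates:

ALPHA, DIGIT, PUNCT, SPACE, LOWER, UPPER, TITLE = range(7)

def set_orth_flags(string, flags=0):
    # Each (bit, predicate) pair contributes one bit to the packed flags value.
    setters = [
        (ALPHA, str.isalpha),
        (DIGIT, str.isdigit),
        (SPACE, str.isspace),
        (LOWER, str.islower),
        (UPPER, str.isupper),
        (TITLE, str.istitle),
    ]
    for bit, setter in setters:
        if setter(string):
            flags |= 1 << bit
    return flags

flags = set_orth_flags(u'Berlin')
print(bool(flags & (1 << TITLE)))   # True: title-cased
print(bool(flags & (1 << DIGIT)))   # False: no digits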
@@ -1,5 +1,4 @@
 from spacy.lang cimport Language
-from spacy.word cimport Lexeme
 from spacy.tokens cimport Tokens
 
 
@@ -1,14 +1,12 @@
-from libc.stdint cimport uint32_t
-from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector
-from libc.stdint cimport uint64_t, int64_t
 
 from preshed.maps cimport PreshMap
 from cymem.cymem cimport Pool
 
-from .word cimport Lexeme
+from .typedefs cimport hash_t
 from .tokens cimport Tokens
-from .lexeme cimport LexemeC
+from .lexeme cimport Lexeme
+from .utf8string cimport StringStore
 
 
 cdef extern from "Python.h":

@@ -21,23 +19,25 @@ cdef extern from "Python.h":
 cdef struct String:
     Py_UNICODE* chars
     size_t n
-    uint64_t key
+    hash_t key
 
 
 cdef class Lexicon:
     cdef Pool mem
     cpdef readonly size_t size
+    cpdef readonly StringStore strings
 
-    cdef vector[LexemeC*] lexemes
+    cdef vector[Lexeme*] lexemes
 
     cpdef Lexeme lookup(self, unicode string)
-    cdef LexemeC* get(self, String* s) except NULL
+    cdef Lexeme* get(self, String* s) except NULL
 
     cdef PreshMap _dict
 
     cdef list _string_features
     cdef list _flag_features
 
 
 cdef class Language:
     cdef Pool _mem
     cdef unicode name

@@ -52,12 +52,12 @@ cdef class Language:
     cpdef Tokens tokenize(self, unicode text)
 
     cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1
-    cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
-                                vector[LexemeC*] *suffixes) except NULL
+    cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
+                                vector[Lexeme*] *suffixes) except NULL
     cdef int _attach_tokens(self, Tokens tokens, int idx, String* string,
-                            vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
+                            vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except -1
     cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
     cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
     cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1
-    cdef int _save_cached(self, LexemeC** tokens, uint64_t key, int n) except -1
+    cdef int _save_cached(self, Lexeme** tokens, hash_t key, int n) except -1
 
 
@@ -13,22 +13,21 @@ import random
 from os import path
 import re
 
-from .util import read_lang_data
-from .tokens import Tokens
-from .lexeme cimport LexemeC, get_lexeme_dict, lexeme_pack, lexeme_unpack
-from .lexeme cimport LexStr_orig
-from murmurhash.mrmr cimport hash64
 
-from cpython.ref cimport Py_INCREF
 
 from cymem.cymem cimport Pool
 
 from cython.operator cimport preincrement as preinc
 from cython.operator cimport dereference as deref
 
+from murmurhash.mrmr cimport hash64
 from preshed.maps cimport PreshMap
-from spacy import orth
-from spacy import util
+from .lexeme cimport Lexeme
+from .lexeme cimport from_dict as lexeme_from_dict
+from .lexeme cimport from_string as lexeme_from_string
 
+from . import orth
+from . import util
+from .util import read_lang_data
+from .tokens import Tokens
 
 
 cdef class Language:

@@ -64,7 +63,7 @@ cdef class Language:
             tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
         """
         cdef int length = len(string)
-        cdef Tokens tokens = Tokens(length)
+        cdef Tokens tokens = Tokens(self.lexicon.strings, length)
         if length == 0:
             return tokens
         cdef int i = 0

@@ -76,7 +75,7 @@ cdef class Language:
             if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
                 if start < i:
                     string_slice(&span, chars, start, i)
-                    lexemes = <LexemeC**>self.cache.get(span.key)
+                    lexemes = <Lexeme**>self.cache.get(span.key)
                     if lexemes != NULL:
                         tokens.extend(start, lexemes, 0)
                     else:

@@ -88,7 +87,7 @@ cdef class Language:
             i += 1
         if start < i:
             string_slice(&span, chars, start, i)
-            lexemes = <LexemeC**>self.cache.get(span.key)
+            lexemes = <Lexeme**>self.cache.get(span.key)
             if lexemes != NULL:
                 tokens.extend(start, lexemes, 0)
             else:

@@ -96,9 +95,9 @@ cdef class Language:
         return tokens
 
     cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1:
-        cdef vector[LexemeC*] prefixes
-        cdef vector[LexemeC*] suffixes
-        cdef uint64_t orig_key
+        cdef vector[Lexeme*] prefixes
+        cdef vector[Lexeme*] suffixes
+        cdef hash_t orig_key
         cdef int orig_size
         orig_key = span.key
         orig_size = tokens.length

@@ -106,8 +105,8 @@ cdef class Language:
         self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
         self._save_cached(&tokens.lex[orig_size], orig_key, tokens.length - orig_size)
 
-    cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
-                                vector[LexemeC*] *suffixes) except NULL:
+    cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
+                                vector[Lexeme*] *suffixes) except NULL:
         cdef size_t i
         cdef String prefix
         cdef String suffix

@@ -150,15 +149,15 @@ cdef class Language:
 
     cdef int _attach_tokens(self, Tokens tokens,
                             int idx, String* string,
-                            vector[LexemeC*] *prefixes,
-                            vector[LexemeC*] *suffixes) except -1:
+                            vector[Lexeme*] *prefixes,
+                            vector[Lexeme*] *suffixes) except -1:
         cdef int split
-        cdef LexemeC** lexemes
-        cdef LexemeC* lexeme
+        cdef Lexeme** lexemes
+        cdef Lexeme* lexeme
         cdef String span
         idx = tokens.extend(idx, prefixes.data(), prefixes.size())
         if string.n != 0:
-            lexemes = <LexemeC**>self.cache.get(string.key)
+            lexemes = <Lexeme**>self.cache.get(string.key)
             if lexemes != NULL:
                 idx = tokens.extend(idx, lexemes, 0)
             else:

@@ -172,13 +171,13 @@ cdef class Language:
                 idx = tokens.push_back(idx, self.lexicon.get(&span))
                 string_slice(&span, string.chars, split + 1, string.n)
                 idx = tokens.push_back(idx, self.lexicon.get(&span))
-        cdef vector[LexemeC*].reverse_iterator it = suffixes.rbegin()
+        cdef vector[Lexeme*].reverse_iterator it = suffixes.rbegin()
         while it != suffixes.rend():
             idx = tokens.push_back(idx, deref(it))
             preinc(it)
 
-    cdef int _save_cached(self, LexemeC** tokens, uint64_t key, int n) except -1:
-        lexemes = <LexemeC**>self._mem.alloc(n + 1, sizeof(LexemeC**))
+    cdef int _save_cached(self, Lexeme** tokens, hash_t key, int n) except -1:
+        lexemes = <Lexeme**>self._mem.alloc(n + 1, sizeof(Lexeme**))
         cdef int i
         for i in range(n):
             lexemes[i] = tokens[i]

@@ -212,14 +211,14 @@ cdef class Language:
         token_rules (list): A list of (chunk, tokens) pairs, where chunk is
             a string and tokens is a list of strings.
         '''
-        cdef LexemeC** lexemes
-        cdef uint64_t hashed
+        cdef Lexeme** lexemes
+        cdef hash_t hashed
         cdef String string
         for uni_string, substrings in token_rules:
-            lexemes = <LexemeC**>self._mem.alloc(len(substrings) + 1, sizeof(LexemeC*))
+            lexemes = <Lexeme**>self._mem.alloc(len(substrings) + 1, sizeof(Lexeme*))
             for i, substring in enumerate(substrings):
                 string_from_unicode(&string, substring)
-                lexemes[i] = <LexemeC*>self.lexicon.get(&string)
+                lexemes[i] = <Lexeme*>self.lexicon.get(&string)
             lexemes[i + 1] = NULL
             string_from_unicode(&string, uni_string)
             self.specials.set(string.key, lexemes)

@@ -227,33 +226,29 @@ cdef class Language:
 
 
 cdef class Lexicon:
-    def __cinit__(self, lexemes):
+    def __init__(self, lexemes):
         self.mem = Pool()
         self._dict = PreshMap(2 ** 20)
+        self.strings = StringStore()
         self.size = 0
         cdef String string
-        cdef dict lexeme_dict
-        cdef LexemeC* lexeme
-        for py_string, lexeme_dict in lexemes.iteritems():
-            string_from_unicode(&string, py_string)
-            lexeme = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
-            lexeme_unpack(lexeme, lexeme_dict)
-            self._dict.set(string.key, lexeme)
-            self.lexemes.push_back(lexeme)
-            self.size += 1
+        cdef Lexeme* lexeme
+        #for py_string, lexeme_dict in lexemes.iteritems():
+        #    string_from_unicode(&string, py_string)
+        #    lexeme = <Lexeme*>self.mem.alloc(1, sizeof(Lexeme))
+        #    lexeme_from_dict(lexeme, lexeme_dict, self.strings)
+        #    self._dict.set(string.key, lexeme)
+        #    self.lexemes.push_back(lexeme)
+        #    self.size += 1
 
-    def __getitem__(self, size_t i):
-        return Lexeme(<size_t>self.lexemes.at(i))
-
-    cdef LexemeC* get(self, String* string) except NULL:
-        cdef LexemeC* lex
-        lex = <LexemeC*>self._dict.get(string.key)
+    cdef Lexeme* get(self, String* string) except NULL:
+        cdef Lexeme* lex
+        lex = <Lexeme*>self._dict.get(string.key)
         if lex != NULL:
             return lex
 
-        lex = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
-        cdef unicode unicode_string = string.chars[:string.n]
-        lexeme_unpack(lex, get_lexeme_dict(self.size, unicode_string))
+        lex = <Lexeme*>self.mem.alloc(1, sizeof(Lexeme))
+        lexeme_from_string(lex, string.chars[:string.n], self.strings)
         self._dict.set(string.key, lex)
         self.lexemes.push_back(lex)
         self.size += 1

@@ -270,8 +265,8 @@ cdef class Lexicon:
         """
         cdef String string
         string_from_unicode(&string, uni_string)
-        cdef LexemeC* lexeme = self.get(&string)
-        return Lexeme(<size_t>lexeme)
+        cdef Lexeme* lexeme = self.get(&string)
+        return lexeme[0]
 
 
 cdef void string_from_unicode(String* s, unicode uni):
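The Lexicon.get changes above keep the same overall flow: hash the span, return the cached Lexeme if the hash is already known, otherwise allocate a new entry, fill it from the string, and cache it under the hash. A rough Python sketch of that flow, with a plain dict standing in for the PreshMap and a list standing in for the new StringStore:

class Lexicon:
    def __init__(self):
        self._dict = {}       # stands in for the PreshMap keyed by 64-bit hashes
        self.strings = []     # stands in for the StringStore
        self.size = 0

    def get(self, string):
        key = hash(string)    # the Cython code derives a murmurhash-based key
        lex = self._dict.get(key)
        if lex is not None:
            return lex
        lex = {'id': len(self.strings), 'length': len(string)}
        self.strings.append(string)
        self._dict[key] = lex
        self.size += 1
        return lex

lexicon = Lexicon()
assert lexicon.get(u'Hallo') is lexicon.get(u'Hallo')   # second call is a cache hit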
spacy/lexeme.pxd (117 lines changed)
@@ -1,94 +1,55 @@
 from .typedefs cimport hash_t, utf8_t, flag_t, id_t
-from cymem.cymem cimport Pool
 
+from thinc.typedefs cimport atom_t
 
+from .utf8string cimport StringStore
 
 cpdef flag_t OOV_DIST_FLAGS
 
+# Flags
+cpdef enum:
+    IS_ALPHA
+    IS_ASCII
+    IS_DIGIT
+    IS_LOWER
+    IS_PUNCT
+    IS_SPACE
+    IS_TITLE
+    IS_UPPER
 
-cpdef enum LexInts:
-    LexInt_id
-    LexInt_length
-    LexInt_cluster
-    LexInt_pos
-    LexInt_supersense
-    LexInt_N
+    OFT_LOWER
+    OFT_TITLE
+    OFT_UPPER
 
 
-cpdef enum LexFloats:
-    LexFloat_prob
-    LexFloat_sentiment
-    LexFloat_N
+cdef struct Lexeme:
+    atom_t id
+    atom_t length
 
+    atom_t norm
+    atom_t shape
+    atom_t vocab10k
+    atom_t asciied
+    atom_t prefix
+    atom_t suffix
 
+    atom_t cluster
+    atom_t pos
+    atom_t supersense
 
+    float prob
 
+    flag_t flags
 
-cpdef enum LexStrs:
-    LexStr_orig
-    LexStr_norm
-    LexStr_shape
-    LexStr_unsparse
-    LexStr_asciied
-    LexStr_pre
-    LexStr_suff
-    LexStr_N
+cdef Lexeme EMPTY_LEXEME
 
 
-cpdef enum LexOrthFlags:
-    LexOrth_alpha
-    LexOrth_ascii
-    LexOrth_digit
-    LexOrth_lower
-    LexOrth_punct
-    LexOrth_space
-    LexOrth_title
-    LexOrth_upper
-    LexOrth_N
+cdef int from_string(Lexeme* lex, unicode string, StringStore store) except -1
 
 
-cpdef enum LexDistFlags:
-    LexDist_adj
-    LexDist_adp
-    LexDist_adv
-    LexDist_conj
-    LexDist_det
-    LexDist_noun
-    LexDist_num
-    LexDist_pdt
-    LexDist_pos
-    LexDist_pron
-    LexDist_prt
-    LexDist_punct
-    LexDist_verb
-
-    LexDist_lower
-    LexDist_title
-    LexDist_upper
-
-    LexDist_N
+cdef int from_dict(Lexeme* lex, dict props, StringStore store) except -1
 
 
-cdef struct LexemeC:
-    int[<int>LexInt_N] ints
-    float[<int>LexFloat_N] floats
-    utf8_t[<int>LexStr_N] strings
-    flag_t orth_flags
-    flag_t dist_flags
-
-
-cdef LexemeC EMPTY_LEXEME
-
-
-cpdef dict get_lexeme_dict(size_t i, unicode string)
-
-cdef char* intern_and_encode(unicode string, size_t* length) except NULL
-
-cdef int lexeme_get_int(LexemeC* lexeme, size_t i) except *
-
-cdef float lexeme_get_float(LexemeC* lexeme, size_t i) except *
-
-cdef unicode lexeme_get_string(LexemeC* lexeme, size_t i)
-
-cdef bint lexeme_check_orth_flag(LexemeC* lexeme, size_t flag_id) except *
-
-cdef bint lexeme_check_dist_flag(LexemeC* lexeme, size_t flag_id) except *
-
-cdef dict lexeme_pack(LexemeC* lexeme)
-cdef int lexeme_unpack(LexemeC* lexeme, dict p) except -1
+cdef inline bint check_flag(Lexeme* lexeme, size_t flag_id) nogil:
+    return lexeme.flags & (1 << flag_id)
spacy/lexeme.pyx (118 lines changed)
@@ -5,106 +5,40 @@ from libc.string cimport memset
 
 import orth
 
+from .utf8string cimport Utf8Str
 
 OOV_DIST_FLAGS = 0
 
-memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
+memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
 
-cpdef dict get_lexeme_dict(size_t i, unicode string):
-    ints = [None for _ in range(LexInt_N)]
-    ints[<int>LexInt_id] = i
-    ints[<int>LexInt_length] = len(string)
-    ints[<int>LexInt_cluster] = 0
-    ints[<int>LexInt_pos] = 0
-    ints[<int>LexInt_supersense] = 0
-
-    floats = [None for _ in range(LexFloat_N)]
-    floats[<int>LexFloat_prob] = 0
-    floats[<int>LexFloat_sentiment] = 0
-
-    strings = [None for _ in range(LexStr_N)]
-    strings[<int>LexStr_orig] = string
-    strings[<int>LexStr_norm] = strings[<int>LexStr_orig]
-    strings[<int>LexStr_shape] = orth.word_shape(string)
-    strings[<int>LexStr_unsparse] = strings[<int>LexStr_shape]
-    strings[<int>LexStr_asciied] = orth.asciied(string)
-    strings[<int>LexStr_pre] = string[0]
-    strings[<int>LexStr_suff] = string[-3:]
-
-    orth_flags = get_orth_flags(string)
-    dist_flags = OOV_DIST_FLAGS
-
-    return {'ints': ints, 'floats': floats, 'strings': strings,
-            'orth_flags': orth_flags, 'dist_flags': dist_flags}
-
-def get_orth_flags(unicode string):
+def get_flags(unicode string):
     cdef flag_t flags = 0
-    flags |= orth.is_ascii(string) << LexOrth_ascii
-    flags |= orth.is_alpha(string) << LexOrth_alpha
-    flags |= orth.is_digit(string) << LexOrth_digit
-    flags |= orth.is_lower(string) << LexOrth_lower
-    flags |= orth.is_punct(string) << LexOrth_punct
-    flags |= orth.is_space(string) << LexOrth_space
-    flags |= orth.is_title(string) << LexOrth_title
-    flags |= orth.is_upper(string) << LexOrth_upper
+    flags |= orth.is_alpha(string) << IS_ALPHA
+    flags |= orth.is_ascii(string) << IS_ASCII
+    flags |= orth.is_digit(string) << IS_DIGIT
+    flags |= orth.is_lower(string) << IS_LOWER
+    flags |= orth.is_punct(string) << IS_PUNCT
+    flags |= orth.is_space(string) << IS_SPACE
+    flags |= orth.is_title(string) << IS_TITLE
+    flags |= orth.is_upper(string) << IS_UPPER
     return flags
 
 
-def get_dist_flags(unicode string):
-    return 0
-
-
-cdef char* intern_and_encode(unicode string, size_t* length) except NULL:
+cdef int from_string(Lexeme* lex, unicode string, StringStore store) except -1:
     cdef bytes byte_string = string.encode('utf8')
-    cdef bytes utf8_string = intern(byte_string)
-    Py_INCREF(utf8_string)
-    length[0] = len(utf8_string)
-    return <char*>utf8_string
+    cdef Utf8Str* orig_str = store.intern(<char*>byte_string, len(byte_string))
+    lex.id = orig_str.i
+    lex.cluster = 0
+    lex.length = len(string)
+    lex.flags = get_flags(string)
+    # TODO: Hook this up
+    #lex.norm = norm_str.i
+    #lex.shape = norm_str.i
+    #lex.asciied = asciied_str.i
+    #lex.prefix = prefix_str.i
+    #lex.suffix = suffix_str.i
 
 
-cdef int lexeme_get_int(LexemeC* lexeme, size_t i) except *:
-    return lexeme.ints[i]
-
-
-cdef float lexeme_get_float(LexemeC* lexeme, size_t i) except *:
-    return lexeme.floats[i]
-
-
-cdef unicode lexeme_get_string(LexemeC* lexeme, size_t i):
-    cdef bytes byte_string = lexeme.strings[i]
-    return byte_string.decode('utf8')
-
-
-cdef bint lexeme_check_orth_flag(LexemeC* lexeme, size_t flag_id) except *:
-    return lexeme.orth_flags & (1 << flag_id)
-
-
-cdef bint lexeme_check_dist_flag(LexemeC* lexeme, size_t flag_id) except *:
-    return lexeme.dist_flags & (1 << flag_id)
-
-
-cdef dict lexeme_pack(LexemeC* lex):
-    cdef dict packed = {}
-    packed['ints'] = [lex.ints[i] for i in range(LexInt_N)]
-    packed['floats'] = [lex.floats[i] for i in range(LexFloat_N)]
-    packed['strings'] = [lex.strings[i].decode('utf8') for i in range(LexStr_N)]
-    packed['orth_flags'] = lex.orth_flags
-    packed['dist_flags'] = lex.orth_flags
-    return packed
-
-
-cdef int lexeme_unpack(LexemeC* lex, dict p) except -1:
-    cdef size_t i
-    cdef int lex_int
-    cdef float lex_float
-    cdef unicode string
-    for i, lex_int in enumerate(p['ints']):
-        lex.ints[i] = lex_int
-    for i, lex_float in enumerate(p['floats']):
-        lex.floats[i] = lex_float
-    cdef size_t _
-    for i in range(LexStr_N):
-        lex_string = p['strings'][i]
-        lex.strings[i] = intern_and_encode(lex_string, &_)
-    lex.orth_flags = p['orth_flags']
-    lex.dist_flags = p['dist_flags']
+cdef int from_dict(Lexeme* lex, dict props, StringStore stroe) except -1:
+    pass
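The new from_string above stops interning Python byte strings and instead asks a StringStore for a stable integer id, which the Lexeme stores in place of the text. A small Python stand-in for that idea (this is not the real StringStore API, just the shape of it):

class StringStore:
    def __init__(self):
        self._ids = {}
        self._strings = []

    def intern(self, s):
        # Return a stable id for s, assigning a fresh one the first time it is seen.
        if s not in self._ids:
            self._ids[s] = len(self._strings)
            self._strings.append(s)
        return self._ids[s]

    def __getitem__(self, i):
        return self._strings[i]

store = StringStore()
lex = {'id': store.intern(u'dog'), 'length': len(u'dog')}
print(store[lex['id']])   # 'dog' -- the id round-trips back to the text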
@@ -113,8 +113,8 @@ cpdef enum:
     CONTEXT_SIZE
 
 
-cdef int get_atoms(atom_t* atoms, LexemeC* p2, LexemeC* p1, LexemeC* n0, LexemeC* n1,
-                   LexemeC* n2, class_t prev_tag, class_t prev_prev_tag) except -1:
+cdef int get_atoms(atom_t* atoms, Lexeme* p2, Lexeme* p1, Lexeme* n0, Lexeme* n1,
+                   Lexeme* n2, class_t prev_tag, class_t prev_prev_tag) except -1:
     _fill_token(&atoms[P2i], p2)
     _fill_token(&atoms[P1i], p1)
     _fill_token(&atoms[N0i], n0)

@@ -124,16 +124,16 @@ cdef int get_atoms(atom_t* atoms, LexemeC* p2, LexemeC* p1, LexemeC* n0, LexemeC
     atoms[P2t] = prev_prev_tag
 
 
-cdef inline void _fill_token(atom_t* atoms, LexemeC* lex) nogil:
-    atoms[0] = lex.ints[<int>LexInt_id]
-    atoms[1] = lex.ints[<int>LexInt_cluster]
-    atoms[2] = <atom_t>lex.strings[<int>LexStr_norm]
-    atoms[3] = <atom_t>lex.strings[<int>LexStr_shape]
-    atoms[4] = <atom_t>lex.strings[<int>LexStr_pre]
-    atoms[5] = <atom_t>lex.strings[<int>LexStr_suff]
+cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil:
+    atoms[0] = lex.id
+    atoms[1] = lex.cluster
+    atoms[2] = lex.norm
+    atoms[3] = lex.shape
+    atoms[4] = lex.prefix
+    atoms[5] = lex.suffix
 
-    atoms[6] = lex.dist_flags & (1 << LexDist_title)
-    atoms[7] = lex.dist_flags & (1 << LexDist_upper)
+    atoms[6] = lex.flags & (1 << OFT_TITLE)
+    atoms[7] = lex.flags & (1 << OFT_UPPER)
 
 
 TEMPLATES = (
@@ -20,6 +20,8 @@ def realign_tagged(token_rules, tagged_line, sep='/'):
 def read_tagged(detoken_rules, file_, sep='/'):
     sentences = []
     for line in file_:
+        if not line.strip():
+            continue
         line = realign_tagged(detoken_rules, line, sep=sep)
         tokens, tags = _parse_line(line, sep)
         assert len(tokens) == len(tags)

@@ -39,7 +41,7 @@ def _parse_line(line, sep):
             subtags.append('NULL')
         assert len(subtags) == len(subtokens), [t.string for t in subtokens]
         words.append(word)
-        tags.extend([Tagger.encode_pos(pos) for pos in subtags])
+        tags.extend([Tagger.encode_pos(ptb_to_univ(pos)) for pos in subtags])
     return EN.tokenize(' '.join(words)), tags
 
 

@@ -53,3 +55,86 @@ def get_tagdict(train_sents):
             tagdict.setdefault(word, {}).setdefault(tag, 0)
             tagdict[word][tag] += 1
     return tagdict
+
+
+def ptb_to_univ(tag):
+    mapping = dict(tuple(line.split()) for line in """
+NULL NULL
+HYPH .
+ADD X
+NFP .
+AFX X
+XX X
+BES VERB
+HVS VERB
+GW X
+! .
+# .
+$ .
+'' .
+( .
+) .
+, .
+-LRB- .
+-RRB- .
+. .
+: .
+? .
+CC CONJ
+CD NUM
+CD|RB X
+DT DET
+EX DET
+FW X
+IN ADP
+IN|RP ADP
+JJ ADJ
+JJR ADJ
+JJRJR ADJ
+JJS ADJ
+JJ|RB ADJ
+JJ|VBG ADJ
+LS X
+MD VERB
+NN NOUN
+NNP NOUN
+NNPS NOUN
+NNS NOUN
+NN|NNS NOUN
+NN|SYM NOUN
+NN|VBG NOUN
+NP NOUN
+PDT DET
+POS PRT
+PRP PRON
+PRP$ PRON
+PRP|VBP PRON
+PRT PRT
+RB ADV
+RBR ADV
+RBS ADV
+RB|RP ADV
+RB|VBG ADV
+RN X
+RP PRT
+SYM X
+TO PRT
+UH X
+VB VERB
+VBD VERB
+VBD|VBN VERB
+VBG VERB
+VBG|NN VERB
+VBN VERB
+VBP VERB
+VBP|TO VERB
+VBZ VERB
+VP VERB
+WDT DET
+WH X
+WP PRON
+WP$ PRON
+WRB ADV
+`` .""".strip().split('\n'))
+    return mapping[tag]
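The added ptb_to_univ helper collapses Penn Treebank tags to the coarse universal tag set before they are encoded. A short Python illustration of how such a mapping is applied, repeating a few rows from the table above:

# A few rows repeated from the mapping above; the full table covers every PTB tag.
PTB_TO_UNIV = {'DT': 'DET', 'NN': 'NOUN', 'NNS': 'NOUN', 'VBZ': 'VERB', 'JJ': 'ADJ', ',': '.'}

def ptb_to_univ(tag):
    return PTB_TO_UNIV[tag]

tagged = [('The', 'DT'), ('dog', 'NN'), ('barks', 'VBZ')]
print([(word, ptb_to_univ(tag)) for word, tag in tagged])
# [('The', 'DET'), ('dog', 'NOUN'), ('barks', 'VERB')]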
@@ -1,5 +0,0 @@
-from spacy.lang cimport Language
-
-
-cdef class PennTreebank3(Language):
-    cdef list _split(self, unicode split)
spacy/ptb3.pyx (161 lines changed)
@@ -1,161 +0,0 @@
-'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
-so that strings can be retrieved from hashes. Use 64-bit hash values and
-boldly assume no collisions.
-'''
-from __future__ import unicode_literals
-
-
-from libc.stdint cimport uint64_t
-
-
-cimport spacy
-
-import re
-
-from spacy import orth
-
-TAG_THRESH = 0.5
-UPPER_THRESH = 0.2
-LOWER_THRESH = 0.5
-TITLE_THRESH = 0.7
-
-NR_FLAGS = 0
-
-OFT_UPPER = NR_FLAGS; NR_FLAGS += 1
-OFT_LOWER = NR_FLAGS; NR_FLAGS += 1
-OFT_TITLE = NR_FLAGS; NR_FLAGS += 1
-
-IS_ALPHA = NR_FLAGS; NR_FLAGS += 1
-IS_DIGIT = NR_FLAGS; NR_FLAGS += 1
-IS_PUNCT = NR_FLAGS; NR_FLAGS += 1
-IS_SPACE = NR_FLAGS; NR_FLAGS += 1
-IS_ASCII = NR_FLAGS; NR_FLAGS += 1
-IS_TITLE = NR_FLAGS; NR_FLAGS += 1
-IS_LOWER = NR_FLAGS; NR_FLAGS += 1
-IS_UPPER = NR_FLAGS; NR_FLAGS += 1
-
-CAN_PUNCT = NR_FLAGS; NR_FLAGS += 1
-CAN_CONJ = NR_FLAGS; NR_FLAGS += 1
-CAN_NUM = NR_FLAGS; NR_FLAGS += 1
-CAN_DET = NR_FLAGS; NR_FLAGS += 1
-CAN_ADP = NR_FLAGS; NR_FLAGS += 1
-CAN_ADJ = NR_FLAGS; NR_FLAGS += 1
-CAN_ADV = NR_FLAGS; NR_FLAGS += 1
-CAN_VERB = NR_FLAGS; NR_FLAGS += 1
-CAN_NOUN = NR_FLAGS; NR_FLAGS += 1
-CAN_PDT = NR_FLAGS; NR_FLAGS += 1
-CAN_POS = NR_FLAGS; NR_FLAGS += 1
-CAN_PRON = NR_FLAGS; NR_FLAGS += 1
-CAN_PRT = NR_FLAGS; NR_FLAGS += 1
-
-
-# List of contractions adapted from Robert MacIntyre's tokenizer.
-CONTRACTIONS2 = [re.compile(r"(?i)\b(can)(not)\b"),
-                 re.compile(r"(?i)\b(d)('ye)\b"),
-                 re.compile(r"(?i)\b(gim)(me)\b"),
-                 re.compile(r"(?i)\b(gon)(na)\b"),
-                 re.compile(r"(?i)\b(got)(ta)\b"),
-                 re.compile(r"(?i)\b(lem)(me)\b"),
-                 re.compile(r"(?i)\b(mor)('n)\b"),
-                 re.compile(r"(?i)\b(wan)(na) ")]
-
-CONTRACTIONS3 = [re.compile(r"(?i) ('t)(is)\b"),
-                 re.compile(r"(?i) ('t)(was)\b")]
-
-CONTRACTIONS4 = [re.compile(r"(?i)\b(whad)(dd)(ya)\b"),
-                 re.compile(r"(?i)\b(wha)(t)(cha)\b")]
-
-def nltk_regex_tokenize(text):
-    # Implementation taken from NLTK 3.0, based on tokenizer.sed
-
-    #starting quotes
-    text = re.sub(r'^\"', r'``', text)
-    text = re.sub(r'(``)', r' \1 ', text)
-    text = re.sub(r'([ (\[{<])"', r'\1 `` ', text)
-
-    #punctuation
-    text = re.sub(r'([:,])([^\d])', r' \1 \2', text)
-    text = re.sub(r'\.\.\.', r' ... ', text)
-    text = re.sub(r'[;@#$%&]', r' \g<0> ', text)
-    text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1 \2\3 ', text)
-    text = re.sub(r'[?!]', r' \g<0> ', text)
-
-    text = re.sub(r"([^'])' ", r"\1 ' ", text)
-
-    #parens, brackets, etc.
-    text = re.sub(r'[\]\[\(\)\{\}\<\>]', r' \g<0> ', text)
-    text = re.sub(r'--', r' -- ', text)
-
-    #add extra space to make things easier
-    text = " " + text + " "
-
-    #ending quotes
-    text = re.sub(r'"', " '' ", text)
-    text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text)
-
-    text = re.sub(r"([^' ])('[sS]|'[mM]|'[dD]|') ", r"\1 \2 ", text)
-    text = re.sub(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 ",
-                  text)
-
-    for regexp in CONTRACTIONS2:
-        text = regexp.sub(r' \1 \2 ', text)
-    for regexp in CONTRACTIONS3:
-        text = regexp.sub(r' \1 \2 ', text)
-
-    # We are not using CONTRACTIONS4 since
-    # they are also commented out in the SED scripts
-    # for regexp in self.CONTRACTIONS4:
-    #     text = regexp.sub(r' \1 \2 \3 ', text)
-
-    return text.split()
-
-
-cdef class PennTreebank3(Language):
-    """Fully PTB compatible English tokenizer, tightly coupled to lexicon.
-
-    Attributes:
-        name (unicode): The two letter code used by Wikipedia for the language.
-        lexicon (Lexicon): The lexicon. Exposes the lookup method.
-    """
-
-    def __cinit__(self, name):
-        flag_funcs = [0 for _ in range(NR_FLAGS)]
-
-        flag_funcs[OFT_UPPER] = orth.oft_case('upper', UPPER_THRESH)
-        flag_funcs[OFT_LOWER] = orth.oft_case('lower', LOWER_THRESH)
-        flag_funcs[OFT_TITLE] = orth.oft_case('title', TITLE_THRESH)
-
-        flag_funcs[IS_ALPHA] = orth.is_alpha
-        flag_funcs[IS_DIGIT] = orth.is_digit
-        flag_funcs[IS_PUNCT] = orth.is_punct
-        flag_funcs[IS_SPACE] = orth.is_space
-        flag_funcs[IS_TITLE] = orth.is_title
-        flag_funcs[IS_LOWER] = orth.is_lower
-        flag_funcs[IS_UPPER] = orth.is_upper
-
-        flag_funcs[CAN_PUNCT] = orth.can_tag('PUNCT', TAG_THRESH)
-        flag_funcs[CAN_CONJ] = orth.can_tag('CONJ', TAG_THRESH)
-        flag_funcs[CAN_NUM] = orth.can_tag('NUM', TAG_THRESH)
-        flag_funcs[CAN_DET] = orth.can_tag('DET', TAG_THRESH)
-        flag_funcs[CAN_ADP] = orth.can_tag('ADP', TAG_THRESH)
-        flag_funcs[CAN_ADJ] = orth.can_tag('ADJ', TAG_THRESH)
-        flag_funcs[CAN_VERB] = orth.can_tag('VERB', TAG_THRESH)
-        flag_funcs[CAN_NOUN] = orth.can_tag('NOUN', TAG_THRESH)
-        flag_funcs[CAN_PDT] = orth.can_tag('PDT', TAG_THRESH)
-        flag_funcs[CAN_POS] = orth.can_tag('POS', TAG_THRESH)
-        flag_funcs[CAN_PRT] = orth.can_tag('PRT', TAG_THRESH)
-
-        Language.__init__(self, name, flag_funcs)
-
-
-    cdef list _split(self, unicode chunk):
-        strings = nltk_regex_tokenize(chunk)
-        if strings[-1] == '.':
-            strings.pop()
-            strings[-1] += '.'
-        assert strings
-        return strings
-
-
-PTB3 = PennTreebank3('ptb3')
@@ -1,59 +1,49 @@
 from cymem.cymem cimport Pool
 
-from spacy.lexeme cimport LexemeC
+from .lexeme cimport Lexeme
+from .typedefs cimport flag_t
+from .utf8string cimport StringStore
 
 from thinc.typedefs cimport atom_t
 
 
 cdef class Tokens:
     cdef Pool mem
+    cdef StringStore _string_store
 
-    cdef LexemeC** _lex_ptr
+    cdef Lexeme** _lex_ptr
     cdef int* _idx_ptr
     cdef int* _pos_ptr
-    cdef LexemeC** lex
+    cdef Lexeme** lex
     cdef int* idx
     cdef int* pos
 
     cdef int length
    cdef int max_length
 
-    cdef int extend(self, int i, LexemeC** lexemes, int n) except -1
-    cdef int push_back(self, int i, LexemeC* lexeme) except -1
+    cdef int extend(self, int i, Lexeme** lexemes, int n) except -1
+    cdef int push_back(self, int i, Lexeme* lexeme) except -1
 
-    cpdef int id(self, size_t i) except -1
-    cpdef float prob(self, size_t i) except 1
-    cpdef int cluster(self, size_t i) except *
-    cpdef bint check_orth_flag(self, size_t i, size_t flag_id) except *
-    cpdef bint check_dist_flag(self, size_t i, size_t flag_id) except *
-    cpdef unicode string_view(self, size_t i, size_t view_id)
-
-    cpdef unicode string(self, size_t i)
-    cpdef unicode orig(self, size_t i)
-    cpdef unicode norm(self, size_t i)
-    cpdef unicode shape(self, size_t i)
-    cpdef unicode unsparse(self, size_t i)
-    cpdef unicode asciied(self, size_t i)
-    cpdef bint is_alpha(self, size_t i) except *
-    cpdef bint is_ascii(self, size_t i) except *
-    cpdef bint is_digit(self, size_t i) except *
-    cpdef bint is_lower(self, size_t i) except *
-    cpdef bint is_punct(self, size_t i) except *
-    cpdef bint is_space(self, size_t i) except *
-    cpdef bint is_title(self, size_t i) except *
-    cpdef bint is_upper(self, size_t i) except *
-    cpdef bint can_adj(self, size_t i) except *
-    cpdef bint can_adp(self, size_t i) except *
-    cpdef bint can_adv(self, size_t i) except *
-    cpdef bint can_conj(self, size_t i) except *
-    cpdef bint can_det(self, size_t i) except *
-    cpdef bint can_noun(self, size_t i) except *
-    cpdef bint can_num(self, size_t i) except *
-    cpdef bint can_pdt(self, size_t i) except *
-    cpdef bint can_pos(self, size_t i) except *
-    cpdef bint can_pron(self, size_t i) except *
-    cpdef bint can_prt(self, size_t i) except *
-    cpdef bint can_punct(self, size_t i) except *
-    cpdef bint can_verb(self, size_t i) except *
-    cpdef bint oft_lower(self, size_t i) except *
-    cpdef bint oft_title(self, size_t i) except *
-    cpdef bint oft_upper(self, size_t i) except *
+
+cdef class Token:
+    cdef StringStore _string_store
+    cdef public int i
+    cdef public int idx
+    cdef public int pos
+
+    cdef public atom_t id
+    cdef public atom_t cluster
+    cdef public atom_t length
+    cdef public atom_t lex_pos
+    cdef public atom_t lex_supersense
+
+    cdef public atom_t norm
+    cdef public atom_t shape
+    cdef public atom_t vocab10k
+    cdef public atom_t asciied
+    cdef public atom_t prefix
+    cdef public atom_t suffix
+
+    cdef public float prob
+
+    cdef public flag_t flags
spacy/tokens.pyx (209 lines changed)
@@ -1,10 +1,6 @@
 # cython: profile=True
-from .word cimport Lexeme
-
 from .lexeme cimport *
-cimport numpy
 cimport cython
-import numpy
 
 DEF PADDING = 5
 

@@ -34,7 +30,8 @@ cdef class Tokens:
     >>> tokens.can_noun(1)
     True
     """
-    def __init__(self, string_length=0):
+    def __init__(self, StringStore string_store, string_length=0):
+        self._string_store = string_store
        if string_length >= 3:
            size = int(string_length / 3.0)
        else:

@@ -43,7 +40,7 @@ cdef class Tokens:
        # Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
        # However, we need to remember the true starting places, so that we can
        # realloc.
-        self._lex_ptr = <LexemeC**>self.mem.alloc(size + (PADDING*2), sizeof(LexemeC*))
+        self._lex_ptr = <Lexeme**>self.mem.alloc(size + (PADDING*2), sizeof(Lexeme*))
        self._idx_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
        self._pos_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
        self.lex = self._lex_ptr

@@ -55,39 +52,26 @@ cdef class Tokens:
        self.lex += PADDING
        self.idx += PADDING
        self.pos += PADDING
 
        self.max_length = size
        self.length = 0
 
    def __getitem__(self, i):
        bounds_check(i, self.length, PADDING)
-        return Lexeme(<size_t>self.lex[i])
+        return Token(self._string_store, i, self.idx[i], self.pos[i], self.lex[i][0])
 
    def __len__(self):
        return self.length
 
-    cdef int push_back(self, int idx, LexemeC* lexeme) except -1:
+    cdef int push_back(self, int idx, Lexeme* lexeme) except -1:
        if self.length == self.max_length:
            self._realloc(self.length * 2)
        self.lex[self.length] = lexeme
        self.idx[self.length] = idx
        self.pos[self.length] = 0
        self.length += 1
-        return idx + lexeme.ints[<int>LexInt_length]
+        return idx + lexeme.length
 
-    def _realloc(self, new_size):
-        self.max_length = new_size
-        n = new_size + (PADDING * 2)
-        self._lex_ptr = <LexemeC**>self.mem.realloc(self._lex_ptr, n * sizeof(LexemeC*))
-        self._idx_ptr = <int*>self.mem.realloc(self._idx_ptr, n * sizeof(int))
-        self._pos_ptr = <int*>self.mem.realloc(self._pos_ptr, n * sizeof(int))
-        self.lex = self._lex_ptr + PADDING
-        self.idx = self._idx_ptr + PADDING
-        self.pos = self._pos_ptr + PADDING
-        for i in range(self.length, self.max_length + PADDING):
-            self.lex[i] = &EMPTY_LEXEME
-
-    cdef int extend(self, int idx, LexemeC** lexemes, int n) except -1:
+    cdef int extend(self, int idx, Lexeme** lexemes, int n) except -1:
        cdef int i
        if lexemes == NULL:
            return idx

@@ -101,154 +85,43 @@ cdef class Tokens:
            idx = self.push_back(idx, lexemes[i])
        return idx
 
-    cpdef int id(self, size_t i) except -1:
-        bounds_check(i, self.length, PADDING)
-        return self.lex[i].ints[<int>LexInt_id]
-
-    cpdef float prob(self, size_t i) except 1:
-        bounds_check(i, self.length, PADDING)
-        return self.lex[i].floats[<int>LexFloat_prob]
-
-    cpdef int cluster(self, size_t i) except *:
-        bounds_check(i, self.length, PADDING)
-        return self.lex[i].ints[<int>LexInt_cluster]
-
-    cpdef bint check_orth_flag(self, size_t i, size_t flag_id) except *:
-        bounds_check(i, self.length, PADDING)
-        return lexeme_check_orth_flag(self.lex[i], flag_id)
-
-    cpdef bint check_dist_flag(self, size_t i, size_t flag_id) except *:
-        bounds_check(i, self.length, PADDING)
-        return lexeme_check_dist_flag(self.lex[i], flag_id)
-
-    cpdef unicode string_view(self, size_t i, size_t view_id):
-        bounds_check(i, self.length, PADDING)
-        return lexeme_get_string(self.lex[i], view_id)
-
-    # Provide accessor methods for the features supported by the language.
-    # Without these, clients have to use the underlying string_view and check_flag
-    # methods, which requires them to know the IDs.
-
-    cpdef unicode string(self, size_t i):
-        bounds_check(i, self.length, PADDING)
-        return self.orig(i)
-
-    cpdef unicode orig(self, size_t i):
-        bounds_check(i, self.length, PADDING)
-        cdef bytes utf8_string = self.lex[i].strings[<int>LexStr_orig]
-        cdef unicode string = utf8_string.decode('utf8')
-        return string
-
-    cpdef unicode norm(self, size_t i):
-        bounds_check(i, self.length, PADDING)
-        cdef bytes utf8_string = self.lex[i].strings[<int>LexStr_norm]
-        cdef unicode string = utf8_string.decode('utf8')
-        return string
-
-    cpdef unicode shape(self, size_t i):
-        bounds_check(i, self.length, PADDING)
-        return lexeme_get_string(self.lex[i], LexStr_shape)
-
-    cpdef unicode unsparse(self, size_t i):
-        bounds_check(i, self.length, PADDING)
-        return lexeme_get_string(self.lex[i], LexStr_unsparse)
-
-    cpdef unicode asciied(self, size_t i):
-        bounds_check(i, self.length, PADDING)
-        return lexeme_get_string(self.lex[i], LexStr_asciied)
-
-    cpdef bint is_alpha(self, size_t i) except *:
-        bounds_check(i, self.length, PADDING)
-        return lexeme_check_orth_flag(self.lex[i], LexOrth_alpha)
-
-    cpdef bint is_ascii(self, size_t i) except *:
-        bounds_check(i, self.length, PADDING)
-        return lexeme_check_orth_flag(self.lex[i], LexOrth_ascii)
-
-    cpdef bint is_digit(self, size_t i) except *:
-        bounds_check(i, self.length, PADDING)
-        return lexeme_check_orth_flag(self.lex[i], LexOrth_digit)
-
-    cpdef bint is_lower(self, size_t i) except *:
-        bounds_check(i, self.length, PADDING)
-        return lexeme_check_orth_flag(self.lex[i], LexOrth_lower)
-
-    cpdef bint is_punct(self, size_t i) except *:
-        bounds_check(i, self.length, PADDING)
-        return lexeme_check_orth_flag(self.lex[i], LexOrth_punct)
-
-    cpdef bint is_space(self, size_t i) except *:
-        bounds_check(i, self.length, PADDING)
-        return lexeme_check_orth_flag(self.lex[i], LexOrth_space)
-
-    cpdef bint is_title(self, size_t i) except *:
-        bounds_check(i, self.length, PADDING)
-        return lexeme_check_orth_flag(self.lex[i], LexOrth_title)
-
-    cpdef bint is_upper(self, size_t i) except *:
-        bounds_check(i, self.length, PADDING)
-        return lexeme_check_orth_flag(self.lex[i], LexOrth_upper)
-
-    cpdef bint can_adj(self, size_t i) except *:
-        bounds_check(i, self.length, PADDING)
-        return lexeme_check_dist_flag(self.lex[i], LexDist_adj)
-
-    cpdef bint can_adp(self, size_t i) except *:
-        bounds_check(i, self.length, PADDING)
-        return lexeme_check_dist_flag(self.lex[i], LexDist_adp)
-
-    cpdef bint can_adv(self, size_t i) except *:
-        bounds_check(i, self.length, PADDING)
-        return lexeme_check_dist_flag(self.lex[i], LexDist_adv)
-
-    cpdef bint can_conj(self, size_t i) except *:
-        bounds_check(i, self.length, PADDING)
-        return lexeme_check_dist_flag(self.lex[i], LexDist_conj)
-
-    cpdef bint can_det(self, size_t i) except *:
-        bounds_check(i, self.length, PADDING)
-        return lexeme_check_dist_flag(self.lex[i], LexDist_det)
-
-    cpdef bint can_noun(self, size_t i) except *:
-        bounds_check(i, self.length, PADDING)
-        return lexeme_check_dist_flag(self.lex[i], LexDist_noun)
-
-    cpdef bint can_num(self, size_t i) except *:
-        bounds_check(i, self.length, PADDING)
-        return lexeme_check_dist_flag(self.lex[i], LexDist_num)
-
-    cpdef bint can_pdt(self, size_t i) except *:
-        bounds_check(i, self.length, PADDING)
-        return lexeme_check_dist_flag(self.lex[i], LexDist_pdt)
-
-    cpdef bint can_pos(self, size_t i) except *:
-        bounds_check(i, self.length, PADDING)
-        return lexeme_check_dist_flag(self.lex[i], LexDist_pos)
-
-    cpdef bint can_pron(self, size_t i) except *:
-        bounds_check(i, self.length, PADDING)
-        return lexeme_check_dist_flag(self.lex[i], LexDist_pron)
-
-    cpdef bint can_prt(self, size_t i) except *:
-        bounds_check(i, self.length, PADDING)
-        return lexeme_check_dist_flag(self.lex[i], LexDist_prt)
-
-    cpdef bint can_punct(self, size_t i) except *:
-        bounds_check(i, self.length, PADDING)
-        return lexeme_check_dist_flag(self.lex[i], LexDist_punct)
-
-    cpdef bint can_verb(self, size_t i) except *:
-        bounds_check(i, self.length, PADDING)
-        return lexeme_check_dist_flag(self.lex[i], LexDist_verb)
-
-    cpdef bint oft_lower(self, size_t i) except *:
-        bounds_check(i, self.length, PADDING)
-        return lexeme_check_dist_flag(self.lex[i], LexDist_lower)
-
-    cpdef bint oft_title(self, size_t i) except *:
-        bounds_check(i, self.length, PADDING)
-        return lexeme_check_dist_flag(self.lex[i], LexDist_title)
-
-    cpdef bint oft_upper(self, size_t i) except *:
-        bounds_check(i, self.length, PADDING)
-        return lexeme_check_dist_flag(self.lex[i], LexDist_upper)
+    def _realloc(self, new_size):
+        self.max_length = new_size
+        n = new_size + (PADDING * 2)
+        self._lex_ptr = <Lexeme**>self.mem.realloc(self._lex_ptr, n * sizeof(Lexeme*))
+        self._idx_ptr = <int*>self.mem.realloc(self._idx_ptr, n * sizeof(int))
+        self._pos_ptr = <int*>self.mem.realloc(self._pos_ptr, n * sizeof(int))
+        self.lex = self._lex_ptr + PADDING
+        self.idx = self._idx_ptr + PADDING
+        self.pos = self._pos_ptr + PADDING
+        for i in range(self.length, self.max_length + PADDING):
+            self.lex[i] = &EMPTY_LEXEME
+
+
+@cython.freelist(64)
+cdef class Token:
+    def __init__(self, StringStore string_store, int i, int idx, int pos, dict lex):
+        self._string_store = string_store
+        self.i = i
+        self.idx = idx
+        self.pos = pos
+
+        self.id = lex['id']
+        self.cluster = lex['cluster']
+        self.length = lex['length']
+        self.lex_pos = lex['pos']
+        self.lex_supersense = lex['supersense']
+        self.norm = lex['norm']
+        self.shape = lex['shape']
+        self.vocab10k = lex['vocab10k']
+        self.suffix = lex['asciied']
+        self.prefix = lex['prefix']
+
+        self.prob = lex['prob']
+        self.flags = lex['flags']
+
+    property string:
+        def __get__(self):
+            cdef bytes utf8string = self._string_store[self.id]
+            return utf8string.decode('utf8')
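The new Token type above is a small per-token view: it copies a handful of atoms out of the underlying Lexeme and resolves its text lazily through the shared string store. A plain-Python sketch of that shape (the cython.freelist optimisation and C-level fields are omitted):

class Token:
    def __init__(self, string_store, i, idx, pos, lex):
        self._string_store = string_store
        self.i = i            # position of the token in the Tokens array
        self.idx = idx        # character offset into the original text
        self.pos = pos
        self.id = lex['id']   # id of the original string in the store

    @property
    def string(self):
        # Decode lazily, as in the 'property string' getter above.
        return self._string_store[self.id]

store = [u'Hello', u'world']
token = Token(store, i=1, idx=6, pos=0, lex={'id': 1})
print(token.string)   # 'world'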
@@ -1,12 +0,0 @@
-from .typedefs cimport hash_t, utf8_t, flag_t, id_t
-from spacy.lexeme cimport LexemeC
-
-DEF MAX_FLAG = 64
-
-
-cdef class Lexeme:
-    cdef LexemeC* _c
-
-    cpdef bint check_orth_flag(self, size_t flag_id) except *
-    cpdef bint check_dist_flag(self, size_t flag_id) except *
-    cpdef unicode string_view(self, size_t view_id)
@@ -1,80 +0,0 @@
# cython: profile=True
# cython: embedsignature=True

from .lexeme cimport lexeme_get_string
from .lexeme cimport lexeme_check_orth_flag, lexeme_check_dist_flag

from .lexeme cimport *


cdef class Lexeme:
    """A lexical type --- a word, punctuation symbol, whitespace sequence, etc.,
    keyed by a case-sensitive unicode string. All tokens with the same string,
    e.g. all instances of "dog", ",", "NASA" etc., should be mapped to the same
    Lexeme.

    You should avoid instantiating Lexemes directly, and instead use the
    :py:meth:`spacy.lang.Language.tokenize` and :py:meth:`spacy.lang.Language.lookup`
    methods on the global object exposed by the language you're working with,
    e.g. :py:data:`spacy.en.EN`.

    Attributes:
        string (unicode):
            The unicode string.

            Implemented as a property; relatively expensive.

        length (size_t):
            The number of unicode code-points in the string.

        prob (double):
            An estimate of the word's unigram log probability.

            Probabilities are calculated from a large text corpus, and smoothed using
            simple Good-Turing. Estimates are read from data/en/probabilities, and
            can be replaced using spacy.en.load_probabilities.

        cluster (size_t):
            An integer representation of the word's Brown cluster.

            A Brown cluster is an address into a binary tree, which gives some (noisy)
            information about the word's distributional context.

            >>> strings = (u'pineapple', u'apple', u'dapple', u'scalable')
            >>> print ["{0:b}".format(lookup(s).cluster) for s in strings]
            ["100111110110", "100111100100", "01010111011001", "100111110110"]

            The clusterings are unideal, but often slightly useful.
            "pineapple" and "apple" share a long prefix, indicating a similar meaning,
            while "dapple" is totally different. On the other hand, "scalable" receives
            the same cluster ID as "pineapple", which is not what we'd like.
    """
    def __cinit__(self, size_t lexeme_addr):
        self._c = <LexemeC*>lexeme_addr

    property string:
        def __get__(self):
            cdef bytes utf8_string = self._c.strings[<int>LexStr_orig]
            cdef unicode string = utf8_string.decode('utf8')
            return string

    property prob:
        def __get__(self):
            return self._c.floats[<int>LexFloat_prob]

    property cluster:
        def __get__(self):
            return self._c.ints[<int>LexInt_cluster]

    property length:
        def __get__(self):
            return self._c.ints[<int>LexInt_length]

    cpdef bint check_orth_flag(self, size_t flag_id) except *:
        return lexeme_check_orth_flag(self._c, flag_id)

    cpdef bint check_dist_flag(self, size_t flag_id) except *:
        return lexeme_check_dist_flag(self._c, flag_id)

    cpdef unicode string_view(self, size_t view_id):
        return lexeme_get_string(self._c, view_id)

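The docstring in the removed module above documents four Lexeme attributes (string, length, prob, cluster). A hedged sketch of how they were read under that pre-refactor API (assuming EN.lexicon.lookup returned such a Lexeme object):

    from spacy.en import EN

    apple = EN.lexicon.lookup(u'apple')
    print(apple.string)                   # the original unicode string
    print(apple.length)                   # number of unicode code points
    print(apple.prob)                     # smoothed unigram log probability
    print("{0:b}".format(apple.cluster))  # Brown cluster address as a bit string
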
@@ -5,8 +5,8 @@ from spacy.en import EN


 def test_possess():
     tokens = EN.tokenize("Mike's")
-    assert tokens[0].string == "Mike"
-    assert tokens[1].string == "'s"
+    assert EN.lexicon.strings[tokens[0].id] == "Mike"
+    assert EN.lexicon.strings[tokens[1].id] == "'s"
     assert len(tokens) == 2

@@ -8,19 +8,17 @@ from spacy.lexeme import *


 def test_is_alpha():
     the = EN.lexicon.lookup('the')
-    assert the.check_orth_flag(LexOrth_alpha)
+    assert the['flags'] & (1 << IS_ALPHA)
     year = EN.lexicon.lookup('1999')
-    assert not year.check_orth_flag(LexOrth_alpha)
+    assert not year['flags'] & (1 << IS_ALPHA)
     mixed = EN.lexicon.lookup('hello1')
-    assert not mixed.check_orth_flag(LexOrth_alpha)
+    assert not mixed['flags'] & (1 << IS_ALPHA)


 def test_is_digit():
     the = EN.lexicon.lookup('the')
-    assert not the.check_orth_flag(LexOrth_digit)
+    assert not the['flags'] & (1 << IS_DIGIT)
     year = EN.lexicon.lookup('1999')
-    assert year.check_orth_flag(LexOrth_digit)
+    assert year['flags'] & (1 << IS_DIGIT)
     mixed = EN.lexicon.lookup('hello1')
-    assert not mixed.check_orth_flag(LexOrth_digit)
+    assert not mixed['flags'] & (1 << IS_DIGIT)

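The updated assertions above replace the check_orth_flag method with a bit-mask test against the lexeme's 'flags' field. A small illustration of that pattern (the helper name is hypothetical; IS_ALPHA and IS_DIGIT are assumed to be bit positions imported from spacy.lexeme, as in the test):

    def has_flag(lexeme, flag_id):
        # flag_id is a bit position; 'flags' is an integer bit field
        return bool(lexeme['flags'] & (1 << flag_id))

    # e.g. has_flag(EN.lexicon.lookup('the'), IS_ALPHA)
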
@@ -1,27 +0,0 @@
from __future__ import unicode_literals

import pytest

import spacy.word
from spacy.en import EN
from spacy.lexeme import *


@pytest.fixture
def C3P0():
    return EN.lexicon.lookup("C3P0")


def test_shape(C3P0):
    assert C3P0.string_view(LexStr_shape) == "XdXd"


def test_length():
    t = EN.lexicon.lookup('the')
    assert t.length == 3
    t = EN.lexicon.lookup("n't")
    assert t.length == 3
    t = EN.lexicon.lookup("'s")
    assert t.length == 2
    t = EN.lexicon.lookup('Xxxx')
    assert t.length == 4

@@ -8,9 +8,9 @@ from spacy.en import EN


 def test_one():
     tokens = EN.tokenize('Betty Botter bought a pound of butter.')
-    assert tokens.string(0) == 'Betty'
+    assert tokens[0].string == 'Betty'
     tokens2 = EN.tokenize('Betty also bought a pound of butter.')
-    assert tokens2.string(0) == 'Betty'
+    assert tokens2[0].string == 'Betty'

@@ -5,41 +5,39 @@ from spacy.en import EN


 def test_single_word():
-    lex_ids = EN.tokenize(u'hello')
-    assert lex_ids[0].string == EN.lexicon.lookup(u'hello').string
+    tokens = EN.tokenize(u'hello')
+    assert tokens[0].string == 'hello'


 def test_two_words():
-    words = EN.tokenize('hello possums')
-    assert len(words) == 2
-    assert words[0].string == EN.lexicon.lookup('hello').string
-    assert words[0].string != words[1].string
+    tokens = EN.tokenize('hello possums')
+    assert len(tokens) == 2
+    assert tokens[0].string != tokens[1].string


 def test_punct():
     tokens = EN.tokenize('hello, possums.')
     assert len(tokens) == 4
-    assert tokens[0].string == EN.lexicon.lookup('hello').string
-    assert tokens[1].string == EN.lexicon.lookup(',').string
-    assert tokens[2].string == EN.lexicon.lookup('possums').string
-    assert tokens[1].string != EN.lexicon.lookup('hello').string
+    assert tokens[0].string == 'hello'
+    assert tokens[1].string == ','
+    assert tokens[2].string == 'possums'
+    assert tokens[1].string != 'hello'


 def test_digits():
-    lex_ids = EN.tokenize('The year: 1984.')
-    assert lex_ids.orig(3) == "1984"
-    assert len(lex_ids) == 5
-    assert lex_ids[0].string == EN.lexicon.lookup('The').string
-    assert lex_ids[3].string == EN.lexicon.lookup('1984').string
+    tokens = EN.tokenize('The year: 1984.')
+    assert len(tokens) == 5
+    assert tokens[0].id == EN.lexicon.lookup('The')['id']
+    assert tokens[3].id == EN.lexicon.lookup('1984')['id']


 def test_contraction():
-    lex_ids = EN.tokenize("don't giggle")
-    assert len(lex_ids) == 3
-    assert lex_ids[1].string == EN.lexicon.lookup("not").string
-    lex_ids = EN.tokenize("i said don't!")
-    assert len(lex_ids) == 5
-    assert lex_ids[4].string == EN.lexicon.lookup('!').string
+    tokens = EN.tokenize("don't giggle")
+    assert len(tokens) == 3
+    assert tokens[1].id == EN.lexicon.lookup("not")['id']
+    tokens = EN.tokenize("i said don't!")
+    assert len(tokens) == 5
+    assert tokens[4].id == EN.lexicon.lookup('!')['id']


 def test_contraction_punct():

@@ -5,30 +5,19 @@ from spacy.en import EN


 def test_neq():
     addr = EN.lexicon.lookup('Hello')
-    assert EN.lexicon.lookup('bye').string != addr.string
+    assert EN.lexicon.lookup('bye')['id'] != addr['id']


 def test_eq():
     addr = EN.lexicon.lookup('Hello')
-    assert EN.lexicon.lookup('Hello').string == addr.string
+    assert EN.lexicon.lookup('Hello')['id'] == addr['id']


-def test_round_trip():
-    hello = EN.lexicon.lookup('Hello')
-    assert hello.string == 'Hello'
-
-
 def test_case_neq():
     addr = EN.lexicon.lookup('Hello')
-    assert EN.lexicon.lookup('hello').string != addr.string
+    assert EN.lexicon.lookup('hello')['id'] != addr['id']


 def test_punct_neq():
     addr = EN.lexicon.lookup('Hello')
-    assert EN.lexicon.lookup('Hello,').string != addr.string
+    assert EN.lexicon.lookup('Hello,')['id'] != addr['id']


-def test_short():
-    addr = EN.lexicon.lookup('I')
-    assert addr.string == 'I'
-    assert addr.string != 'not'