Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-12 18:26:30 +03:00

* Refactoring with Lexeme as a class now compiles. Basic design seems to work

This commit is contained in commit e9a62b6eba (parent 68bae2fec6).
spacy/en.pxd (14 lines changed)

@@ -1,4 +1,4 @@
-from spacy.spacy cimport Language
+from spacy.lang cimport Language
 from spacy.word cimport Lexeme
 cimport cython

@@ -31,12 +31,14 @@ cpdef size_t POS
 cpdef size_t PRON
 cpdef size_t PRT

-cdef class English(spacy.Language):
-    cdef int find_split(self, unicode word)
+cpdef size_t SIC
+cpdef size_t CANON_CASED
+cpdef size_t SHAPE
+cpdef size_t NON_SPARSE


-cdef English EN
+cdef class English(Language):
+    cpdef int _split_one(self, unicode word)


-cpdef Word lookup(unicode word)
-cpdef list tokenize(unicode string)
+cpdef English EN
spacy/en.pyx (189 lines changed)

@@ -31,6 +31,7 @@ same scheme. Tokenization problems are a major cause of poor performance for
 NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module
 provides a fully Penn Treebank 3-compliant tokenizer.
 '''
+# TODO
 #The script translate_treebank_tokenization can be used to transform a treebank's
 #annotation to use one of the spacy tokenization schemes.

@@ -40,90 +41,14 @@ from __future__ import unicode_literals
 from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t

-cimport spacy
+cimport lang

+from spacy import orth

-# Python-readable flag constants --- can't read an enum from Python
-
-# Don't want to manually assign these numbers, or we'll insert one and have to
-# change them all.
-# Don't use "i", as we don't want it in the global scope!
-cdef size_t __i = 0
-
-ALPHA = __i; i += 1
-DIGIT = __i; __i += 1
-PUNCT = __i; __i += 1
-SPACE = __i; __i += 1
-LOWER = __i; __i += 1
-UPPER = __i; __i += 1
-TITLE = __i; __i += 1
-ASCII = __i; __i += 1
-
-OFT_LOWER = __i; __i += 1
-OFT_UPPER = __i; __i += 1
-OFT_TITLE = __i; __i += 1
-
-PUNCT = __i; __i += 1
-CONJ = __i; __i += 1
-NUM = __i; __i += 1
-X = __i; __i += 1
-DET = __i; __i += 1
-ADP = __i; __i += 1
-ADJ = __i; __i += 1
-ADV = __i; __i += 1
-VERB = __i; __i += 1
-NOUN = __i; __i += 1
-PDT = __i; __i += 1
-POS = __i; __i += 1
-PRON = __i; __i += 1
-PRT = __i; __i += 1
-
-
-# These are for the string views
-__i = 0
-SIC = __i; __i += 1
-CANON_CASED = __i; __i += 1
-NON_SPARSE = __i; __i += 1
-SHAPE = __i; __i += 1
-NR_STRING_VIEWS = __i
-
-
-def get_string_views(unicode string, lexeme):
-    views = ['' for _ in range(NR_STRING_VIEWS)]
-    views[SIC] = string
-    views[CANON_CASED] = canonicalize_case(string, lexeme)
-    views[SHAPE] = get_string_shape(string)
-    views[NON_SPARSE] = get_non_sparse(string, views[CANON_CASED], views[SHAPE],
-                                       lexeme)
-    return views
-
-
-def set_orth_flags(unicode string, flags_t flags)
-    setters = [
-        (ALPHA, is_alpha),
-        (DIGIT, is_digit),
-        (PUNCT, is_punct),
-        (SPACE, is_space),
-        (LOWER, is_lower),
-        (UPPER, is_upper),
-        (SPACE, is_space)
-    ]
-    for bit, setter in setters:
-        if setter(string):
-            flags |= 1 << bit
-    return flags
-
-
-cdef class English(spacy.Language):
-    cdef Lexeme new_lexeme(self, unicode string, cluster=0, prob=0, case_stats=None,
-                           tag_freqs=None):
-        return Lexeme(s, length, views, prob=prob, cluster=cluster,
-                      flags=self.get_flags(string))
-
-    cdef int find_split(self, unicode word):
+
+cdef class English(Language):
+    cpdef int _split_one(self, unicode word):
         cdef size_t length = len(word)
         cdef int i = 0
         if word.startswith("'s") or word.startswith("'S"):

@@ -132,17 +57,16 @@ cdef class English(spacy.Language):
         if word.endswith("'s") and length >= 3:
             return length - 2
         # Leading punctuation
-        if check_punct(word, 0, length):
+        if _check_punct(word, 0, length):
             return 1
         elif length >= 1:
             # Split off all trailing punctuation characters
             i = 0
-            while i < length and not check_punct(word, i, length):
+            while i < length and not _check_punct(word, i, length):
                 i += 1
             return i


-cdef bint check_punct(unicode word, size_t i, size_t length):
+cdef bint _check_punct(unicode word, size_t i, size_t length):
     # Don't count appostrophes as punct if the next char is a letter
     if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
         return i == 0
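The splitting rules above are easier to follow outside diff form. Below is a minimal pure-Python sketch of the same logic, for illustration only: _is_punct and _split_one_sketch are hypothetical stand-ins for _check_punct and English._split_one, and the parts the hunks cut off (the return value after the leading "'s" check, and the general punctuation test) are filled in with assumptions.

    # Minimal pure-Python sketch of the English splitting rules shown above.
    # Names and the filled-in details are assumptions, not the Cython code itself.
    import unicodedata

    def _is_punct(word, i, length):
        # Don't count apostrophes as punctuation if the next char is a letter
        if word[i] == "'" and i < (length - 1) and word[i + 1].isalpha():
            return i == 0
        # Assumed general test: any Unicode punctuation category
        return unicodedata.category(word[i]).startswith('P')

    def _split_one_sketch(word):
        length = len(word)
        if word.startswith("'s") or word.startswith("'S"):
            return 2                     # assumed body of the startswith branch
        if word.endswith("'s") and length >= 3:
            return length - 2
        if _is_punct(word, 0, length):   # leading punctuation
            return 1
        i = 0
        while i < length and not _is_punct(word, i, length):
            i += 1
        return i

    def split_sketch(chunk):
        # Repeatedly peel off the prefix that _split_one_sketch identifies
        substrings = []
        while chunk:
            split = _split_one_sketch(chunk)
            if split == 0:
                substrings.append(chunk)
                break
            substrings.append(chunk[:split])
            chunk = chunk[split:]
        return substrings

    print(split_sketch(u"Mike's"))    # ['Mike', "'s"]
    print(split_sketch(u"(hello)"))   # ['(', 'hello', ')']

With these assumptions, a possessive is split off as its own substring and bracketing punctuation is peeled one character at a time.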
@@ -160,69 +84,46 @@ cdef bint check_punct(unicode word, size_t i, size_t length):
 EN = English('en')


-cpdef list tokenize(unicode string):
-    """Tokenize a string.
-
-    The tokenization rules are defined in two places:
-
-    * The data/en/tokenization table, which handles special cases like contractions;
-    * The :py:meth:`spacy.en.English.find_split` function, which is used to split off punctuation etc.
-
-    Args:
-        string (unicode): The string to be tokenized.
-
-    Returns:
-        tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.
-    """
-    return EN.tokenize(string)
-
-
-cpdef Lexeme lookup(unicode string):
-    """Retrieve (or create, if not found) a Lexeme for a string, and return its ID.
-
-    Properties of the Lexeme are accessed by passing LexID to the accessor methods.
-    Access is cheap/free, as the LexID is the memory address of the Lexeme.
-
-    Args:
-        string (unicode): The string to be looked up. Must be unicode, not bytes.
-
-    Returns:
-        lexeme (LexID): A reference to a lexical type.
-    """
-    return EN.lookup(string)
-
-
-def add_string_views(view_funcs):
-    """Add a string view to existing and previous lexical entries.
-
-    Args:
-        get_view (function): A unicode --> unicode function.
-
-    Returns:
-        view_id (int): An integer key you can use to access the view.
-    """
-    pass
-
-
-def load_clusters(location):
-    """Load cluster data.
-    """
-    pass
-
-
-def load_unigram_probs(location):
-    """Load unigram probabilities.
-    """
-    pass
-
-
-def load_case_stats(location):
-    """Load case stats.
-    """
-    pass
-
-
-def load_tag_stats(location):
-    """Load tag statistics.
-    """
-    pass
+# Thresholds for frequency related flags
+TAG_THRESH = 0.5
+LOWER_THRESH = 0.5
+UPPER_THRESH = 0.3
+TITLE_THRESH = 0.9
+
+
+# Python-readable flag constants --- can't read an enum from Python
+ALPHA = EN.lexicon.add_flag(orth.is_alpha)
+DIGIT = EN.lexicon.add_flag(orth.is_digit)
+PUNCT = EN.lexicon.add_flag(orth.is_punct)
+SPACE = EN.lexicon.add_flag(orth.is_space)
+PUNCT = EN.lexicon.add_flag(orth.is_punct)
+ASCII = EN.lexicon.add_flag(orth.is_ascii)
+TITLE = EN.lexicon.add_flag(orth.is_title)
+LOWER = EN.lexicon.add_flag(orth.is_lower)
+UPPER = EN.lexicon.add_flag(orth.is_upper)
+
+OFT_LOWER = EN.lexicon.add_flag(orth.case_trend('lower', LOWER_THRESH))
+OFT_UPPER = EN.lexicon.add_flag(orth.case_trend('upper', UPPER_THRESH))
+OFT_TITLE = EN.lexicon.add_flag(orth.case_trend('title', TITLE_THRESH))
+
+CAN_PUNCT = EN.lexicon.add_flag(orth.can_tag("PUNCT", TAG_THRESH))
+CAN_CONJ = EN.lexicon.add_flag(orth.can_tag("CONJ", TAG_THRESH))
+CAN_NUM = EN.lexicon.add_flag(orth.can_tag("NUM", TAG_THRESH))
+CAN_N = EN.lexicon.add_flag(orth.can_tag("N", TAG_THRESH))
+CAN_DET = EN.lexicon.add_flag(orth.can_tag("DET", TAG_THRESH))
+CAN_ADP = EN.lexicon.add_flag(orth.can_tag("ADP", TAG_THRESH))
+CAN_ADJ = EN.lexicon.add_flag(orth.can_tag("ADJ", TAG_THRESH))
+CAN_ADV = EN.lexicon.add_flag(orth.can_tag("ADV", TAG_THRESH))
+CAN_VERB = EN.lexicon.add_flag(orth.can_tag("VERB", TAG_THRESH))
+CAN_NOUN = EN.lexicon.add_flag(orth.can_tag("NOUN", TAG_THRESH))
+CAN_PDT = EN.lexicon.add_flag(orth.can_tag("PDT", TAG_THRESH))
+CAN_POS = EN.lexicon.add_flag(orth.can_tag("POS", TAG_THRESH))
+CAN_PRON = EN.lexicon.add_flag(orth.can_tag("PRON", TAG_THRESH))
+CAN_PRT = EN.lexicon.add_flag(orth.can_tag("PRT", TAG_THRESH))
+
+
+# These are the name of string transforms
+SIC = EN.lexicon.add_transform(orth.sic_string)
+CANON_CASED = EN.lexicon.add_transform(orth.canon_case)
+SHAPE = EN.lexicon.add_transform(orth.word_shape)
+NON_SPARSE = EN.lexicon.add_transform(orth.non_sparse)
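The module-level constants above use the new Lexicon.add_flag / Lexicon.add_transform API that the spacy/lang.pyx hunks further down define. A minimal pure-Python sketch of that registration pattern, for illustration only (ToyLexeme and ToyLexicon are hypothetical stand-ins, and the lambda replaces the spacy.orth predicates, which are not shown in this commit):

    # Toy re-implementation of the flag/transform registry sketched in this commit.
    class ToyLexeme:
        def __init__(self, string):
            self.string = string
            self.prob = 0.0
            self.flags = 0

        def set_flag(self, flag_id):
            self.flags |= 1 << flag_id

        def check_flag(self, flag_id):
            return bool(self.flags & (1 << flag_id))

    class ToyLexicon:
        def __init__(self):
            self.flag_checkers = []
            self.string_transformers = []
            self.lexicon = {}

        def lookup(self, string):
            if string not in self.lexicon:
                self.lexicon[string] = ToyLexeme(string)
            return self.lexicon[string]

        def add_flag(self, flag_checker):
            # Re-check every known lexeme against the new predicate, then
            # return the bit position callers later pass to check_flag().
            flag_id = len(self.flag_checkers)
            for string, word in self.lexicon.items():
                if flag_checker(string, word.prob, {}):
                    word.set_flag(flag_id)
            self.flag_checkers.append(flag_checker)
            return flag_id

        def add_transform(self, string_transform):
            self.string_transformers.append(string_transform)
            return len(self.string_transformers) - 1

    lexicon = ToyLexicon()
    lexicon.lookup(u'10')
    IS_DIGIT = lexicon.add_flag(lambda s, prob, stats: s.isdigit())
    print(lexicon.lookup(u'10').check_flag(IS_DIGIT))   # True

The point of the design is that each registered predicate owns one bit in the lexeme's flag field, so flag lookups reduce to a single bitwise test.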
@@ -3,18 +3,23 @@ from libc.stdint cimport uint64_t
 from spacy.word cimport Lexeme


+cdef class Lexicon:
+    cdef public list flag_checkers
+    cdef public list string_transformers
+
+    cdef dict lexicon
+
+    cpdef Lexeme lookup(self, unicode string)
+
+
 cdef class Language:
     cdef object name
-    cdef dict blobs
-    cdef dict lexicon
+    cdef dict cache
+    cpdef readonly Lexicon lexicon

     cpdef list tokenize(self, unicode text)

-    cdef Word lookup(self, unicode string)
-    cdef list lookup_chunk(self, unicode chunk)
-
-    cdef list new_chunk(self, unicode string, list substrings)
-    cdef Word new_lexeme(self, unicode lex)
-
-    cpdef list find_substrings(self, unicode chunk)
-    cdef int find_split(self, unicode word)
+    cdef list _tokenize(self, unicode string)
+    cpdef list _split(self, unicode string)
+    cpdef int _split_one(self, unicode word)
spacy/lang.pyx (206 lines changed)

@@ -6,37 +6,37 @@ Provides the main implementation for the spacy tokenizer. Specific languages
 subclass the Language class, over-writing the tokenization rules as necessary.
 Special-case tokenization rules are read from data/<lang>/tokenization .
 """

 from __future__ import unicode_literals

 from libc.stdlib cimport calloc, free

 from . import util
+import json
 from os import path


 cdef class Language:
-    view_funcs = []
     def __cinit__(self, name):
         self.name = name
-        self.blobs = {}
-        self.lexicon = {}
+        self.cache = {}
+        self.lexicon = Lexicon()
         self.load_tokenization(util.read_tokenization(name))
-        self.load_dist_info(util.read_dist_info(name))

     cpdef list tokenize(self, unicode string):
-        """Tokenize.
+        """Tokenize a string.

-        Split the string into tokens.
+        The tokenization rules are defined in two places:
+
+        * The data/<lang>/tokenization table, which handles special cases like contractions;
+        * The appropriate :py:meth:`find_split` function, which is used to split
+          off punctuation etc.

         Args:
-            string (unicode): The string to split.
+            string (unicode): The string to be tokenized.

         Returns:
-            tokens (list): A list of Lexeme objects.
+            tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.
         """
-        cdef list blob
         cdef list tokens = []
         cdef size_t length = len(string)
         cdef size_t start = 0

@@ -44,74 +44,28 @@ cdef class Language:
         for c in string:
             if c == ' ':
                 if start < i:
-                    blob = self.lookup_blob(string[start:i])
-                    tokens.extend(blob)
+                    tokens.extend(self._tokenize(string[start:i]))
                 start = i + 1
             i += 1
         if start < i:
-            chunk = self.lookup_blob(string[start:])
-            tokens.extend(chunk)
+            tokens.extend(self._tokenize(string[start:]))
         return tokens

-    cdef Lexeme lookup(self, unicode string):
-        assert len(string) != 0
-        cdef Word word
-        if string in self.vocab:
-            word = self.vocab[string]
-        else:
-            word = self.new_lexeme(string)
-        return word
-
-    cdef list lookup_blob(self, unicode string):
-        cdef list chunk
-        cdef size_t blob_id
-        if string in self.blobs:
-            blob = self.blobs[string]
-        else:
-            blob = self.new_blob(string, self.find_substrings(string))
-        return chunk
-
-    cdef list new_blob(self, unicode string, list substrings):
-        blob = []
+    cdef list _tokenize(self, unicode string):
+        if string in self.cache:
+            return self.cache[string]
+        cdef list lexemes = []
+        substrings = self._split(string)
         for i, substring in enumerate(substrings):
-            blob.append(self.lookup(substring))
-        self.blobs[string] = chunk
-        return blob
+            lexemes.append(self.lookup(substring))
+        self.cache[string] = lexemes
+        return lexemes

-    cdef Word new_lexeme(self, unicode string):
-        # TODO
-        #lexeme = Lexeme(string.encode('utf8'), string_views)
-        #return lexeme
-
-    """
-    def add_view_funcs(self, list view_funcs):
-        self.view_funcs.extend(view_funcs)
-        cdef size_t nr_views = len(self.view_funcs)
-
-        cdef unicode view
-        cdef StringHash hashed
-        cdef StringHash key
-        cdef unicode string
-        cdef LexID lex_id
-        cdef Lexeme* word
-
-        for key, lex_id in self.vocab.items():
-            word = <Lexeme*>lex_id
-            free(word.string_views)
-            word.string_views = <StringHash*>calloc(nr_views, sizeof(StringHash))
-            string = word.string[:word.length].decode('utf8')
-            for i, view_func in enumerate(self.view_funcs):
-                view = view_func(string)
-                hashed = hash(view)
-                word.string_views[i] = hashed
-                self.bacov[hashed] = view
-    """
-
-    cpdef list find_substrings(self, unicode blob):
-        """Find how to split a chunk into substrings.
+    cpdef list _split(self, unicode string):
+        """Find how to split a contiguous span of non-space characters into substrings.

         This method calls find_split repeatedly. Most languages will want to
-        override find_split, but it may be useful to override this instead.
+        override _split_one, but it may be useful to override this instead.

         Args:
             chunk (unicode): The string to be split, e.g. u"Mike's!"

@@ -120,22 +74,22 @@ cdef class Language:
             substrings (list): The component substrings, e.g. [u"Mike", "'s", "!"].
         """
         substrings = []
-        while blob:
-            split = self.find_split(blob)
+        while string:
+            split = self._split_one(string)
             if split == 0:
-                substrings.append(blob)
+                substrings.append(string)
                 break
-            substrings.append(blob[:split])
-            blob = blob[split:]
+            substrings.append(string[:split])
+            string = string[split:]
         return substrings

-    cdef int find_split(self, unicode word):
+    cpdef int _split_one(self, unicode word):
         return len(word)

-    def load_tokenization(self, token_rules):
+    def load_special_tokenization(self, token_rules):
         '''Load special-case tokenization rules.

-        Loads special-case tokenization rules into the Language.chunk cache,
+        Loads special-case tokenization rules into the Language.cache cache,
         read from data/<lang>/tokenization . The special cases are loaded before
         any language data is tokenized, giving these priority. For instance,
         the English tokenization rules map "ain't" to ["are", "not"].

@@ -144,25 +98,83 @@ cdef class Language:
             token_rules (list): A list of (chunk, tokens) pairs, where chunk is
                 a string and tokens is a list of strings.
         '''
-        for chunk, tokens in token_rules:
-            self.new_chunk(chunk, tokens)
+        for string, substrings in token_rules:
+            lexemes = []
+            for i, substring in enumerate(substrings):
+                lexemes.append(self.lookup(substring))
+            self.cache[string] = lexemes


-    def load_dist_info(self, dist_info):
-        '''Load distributional information for the known lexemes of the language.
-
-        The distributional information is read from data/<lang>/dist_info.json .
-        It contains information like the (smoothed) unigram log probability of
-        the word, how often the word is found upper-cased, how often the word
-        is found title-cased, etc.
-        '''
+cdef class Lexicon:
+    def __cinit__(self):
+        self.flag_checkers = []
+        self.string_transforms = []
+        self.lexicon = {}
+
+    cpdef Lexeme lookup(self, unicode string):
+        """Retrieve (or create, if not found) a Lexeme for a string, and return it.
+
+        Args:
+            string (unicode): The string to be looked up. Must be unicode, not bytes.
+
+        Returns:
+            lexeme (Lexeme): A reference to a lexical type.
+        """
+        assert len(string) != 0
+        if string in self.lexicon:
+            return self.lexicon[string]
+
+        prob = _pop_default(self.probs, string, 0.0)
+        cluster = _pop_default(self.clusters, string, 0.0)
+        case_stats = _pop_default(self.case_stats, string, {})
+        tag_stats = _pop_default(self.tag_stats, string, {})
+
+        cdef Lexeme word = Lexeme(string, prob, cluster, case_stats, tag_stats,
+                                  self.flag_checkers, self.string_transformers)
+        self.lexicon[string] = word
+        return word
+
+    def add_flag(self, flag_checker):
         cdef unicode string
-        cdef dict word_dist
-        cdef Word w
-        for string, word_dist in dist_info.items():
-            w = self.lookup(string)
-            w.prob = word_dist.prob
-            w.cluster = word_dist.cluster
-            for flag in word_dist.flags:
-                w.dist_flags |= DIST_FLAGS[flag]
-            for tag in word_dist.tagdict:
-                w.possible_tags |= TAGS[tag]
+        cdef Lexeme word
+        flag_id = len(self.flag_checkers)
+        for string, word in self.lexicon.items():
+            if flag_checker(string, word.prob, {}):
+                word.set_flag(flag_id)
+        self.flag_checkers.append(flag_checker)
+        return flag_id
+
+    def add_transform(self, string_transform):
+        self.string_transformers.append(string_transform)
+        return len(self.string_transformers) - 1
+
+    def load_probs(self, location):
+        """Load unigram probabilities.
+        """
+        self.probs = json.load(location)
+
+        cdef Lexeme word
+        cdef unicode string
+
+        for string, word in self.lexicon.items():
+            prob = _pop_default(self.probs, string, 0.0)
+            word.prob = prob
+
+    def load_clusters(self, location):
+        self.probs = json.load(location)
+
+        cdef Lexeme word
+        cdef unicode string
+
+        for string, word in self.lexicon.items():
+            cluster = _pop_default(self.cluster, string, 0)
+            word.cluster = cluster
+
+    def load_stats(self, location):
+        """Load distributional stats.
+        """
+        raise NotImplementedError
+
+
+def _pop_default(dict d, key, default):
+    return d.pop(key) if key in d else default
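The refactored pipeline above runs tokenize -> _tokenize -> _split -> _split_one, caching the token list computed for each non-space chunk. A simplified pure-Python sketch of that control flow, for illustration only (ToyLanguage is a hypothetical stand-in for the Cython class, and plain strings stand in for Lexeme objects):

    # Simplified sketch of the Language.tokenize / _tokenize / _split pipeline above.
    class ToyLanguage:
        def __init__(self):
            self.cache = {}          # chunk -> list of tokens, mirrors Language.cache

        def tokenize(self, string):
            # Split on spaces, then delegate each non-space chunk to _tokenize
            tokens = []
            start = 0
            for i, c in enumerate(string):
                if c == ' ':
                    if start < i:
                        tokens.extend(self._tokenize(string[start:i]))
                    start = i + 1
            if start < len(string):
                tokens.extend(self._tokenize(string[start:]))
            return tokens

        def _tokenize(self, string):
            # Cached; special cases from data/<lang>/tokenization land here too
            if string in self.cache:
                return self.cache[string]
            lexemes = [self.lookup(s) for s in self._split(string)]
            self.cache[string] = lexemes
            return lexemes

        def _split(self, string):
            substrings = []
            while string:
                split = self._split_one(string)
                if split == 0:
                    substrings.append(string)
                    break
                substrings.append(string[:split])
                string = string[split:]
            return substrings

        def _split_one(self, word):
            # Default: the whole chunk is one token; English overrides this
            return len(word)

        def lookup(self, string):
            return string            # the real code returns a Lexeme

    lang = ToyLanguage()
    lang.cache[u"ain't"] = [u"are", u"not"]   # a special-case rule, as in the docstring
    print(lang.tokenize(u"I ain't sure"))      # ['I', 'are', 'not', 'sure']

Because the cache is keyed on whole whitespace-delimited chunks, both the special-case table and ordinary splits are served by the same dictionary lookup on repeat occurrences.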
@@ -1,54 +0,0 @@
-import os
-from os import path
-import codecs
-import json
-
-DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
-
-
-def utf8open(loc, mode='r'):
-    return codecs.open(loc, mode, 'utf8')
-
-
-def load_case_stats(data_dir):
-    case_loc = path.join(data_dir, 'case')
-    case_stats = {}
-    with utf8open(case_loc) as cases_file:
-        for line in cases_file:
-            word, upper, title = line.split()
-            case_stats[word] = (float(upper), float(title))
-    return case_stats
-
-
-def read_dist_info(lang):
-    dist_path = path.join(DATA_DIR, lang, 'distribution_info.json')
-    if path.exists(dist_path):
-        with open(dist_path) as file_:
-            dist_info = json.load(file_)
-    else:
-        dist_info = {}
-    return dist_info
-
-
-def read_tokenization(lang):
-    loc = path.join(DATA_DIR, lang, 'tokenization')
-    entries = []
-    seen = set()
-    with utf8open(loc) as file_:
-        for line in file_:
-            line = line.strip()
-            if line.startswith('#'):
-                continue
-            if not line:
-                continue
-            pieces = line.split()
-            chunk = pieces.pop(0)
-            assert chunk not in seen, chunk
-            seen.add(chunk)
-            entries.append((chunk, list(pieces)))
-            if chunk[0].isalpha() and chunk[0].islower():
-                chunk = chunk[0].title() + chunk[1:]
-                pieces[0] = pieces[0][0].title() + pieces[0][1:]
-                seen.add(chunk)
-                entries.append((chunk, pieces))
-    return entries
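The read_tokenization helper in the removed hunk above parses a whitespace-separated table whose first column is the chunk and whose remaining columns are its tokens, and it automatically adds a title-cased variant of every lower-cased rule. A hypothetical two-line table (the real data/<lang>/tokenization file is not shown in this commit; only the "ain't" -> ["are", "not"] mapping is mentioned in the docstrings) would be processed roughly like this:

    # Hypothetical illustration of the table format read_tokenization expects.
    sample = u"""
    # comment lines and blank lines are skipped
    ain't are not
    won't will not
    """

    entries = []
    for line in sample.splitlines():
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        pieces = line.split()
        chunk = pieces.pop(0)
        entries.append((chunk, list(pieces)))
        if chunk[0].isalpha() and chunk[0].islower():
            # Title-cased variant is added automatically, e.g. "Ain't" -> ["Are", "not"]
            chunk = chunk[0].title() + chunk[1:]
            pieces[0] = pieces[0][0].title() + pieces[0][1:]
            entries.append((chunk, pieces))

    print(entries)
    # [("ain't", ['are', 'not']), ("Ain't", ['Are', 'not']),
    #  ("won't", ['will', 'not']), ("Won't", ['Will', 'not'])]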
@@ -7,19 +7,19 @@ DEF MAX_FLAG = 64
 cdef class Lexeme:
     # NB: the readonly keyword refers to _Python_ access. The attributes are
     # writeable from Cython.
-    cdef readonly id_t id
-    cdef readonly size_t length
-    cdef readonly double prob
-    cdef readonly size_t cluster
+    cpdef readonly id_t id
+    cpdef readonly size_t length
+    cpdef readonly double prob
+    cpdef readonly size_t cluster

-    cdef readonly utf8_t* strings
-    cdef readonly size_t nr_strings
+    cdef utf8_t* views
+    cdef size_t nr_views

     cdef readonly flag_t flags

     cpdef bint check_flag(self, size_t flag_id) except *
     cpdef int set_flag(self, size_t flag_id) except -1

-    cpdef unicode get_string(self, size_t i) except *
-    cpdef id_t get_id(self, size_t i) except 0
-    cpdef int add_strings(self, list strings) except -1
+    cpdef unicode get_view_string(self, size_t i)
+    cpdef id_t get_view_id(self, size_t i) except 0
+    cpdef int add_view(self, unicode view) except -1
spacy/word.pyx (176 lines changed)

@@ -2,10 +2,7 @@
 # cython: embedsignature=True


-from libc.stdlib cimport calloc, free
+from libc.stdlib cimport calloc, free, realloc

-from spacy cimport flags


 cdef class Lexeme:
     """A lexical type.

@@ -53,7 +50,7 @@ cdef class Lexeme:
     the same cluster ID as "pineapple", which is not what we'd like.
     """
     def __cinit__(self, utf8_t string, size_t length, list views, prob=0.0,
-                  cluster=0, orth_flags=0, dist_flags=0, possible_tags=0):
+                  cluster=0, flags=0):
         self.id = <id_t>&string
         self.length = length
         self.nr_strings = 0

@@ -66,25 +63,21 @@ cdef class Lexeme:
         def __get__(self):
             return self.strings[0].decode('utf8')

-    cpdef unicode get_view_string(self, size_t i) except *:
+    cpdef unicode get_view_string(self, size_t i):
         assert i < self.nr_strings
         return self.strings[i].decode('utf8')

-    cpdef intptr_t get_view_id(self, size_t i) except 0:
+    cpdef id_t get_view_id(self, size_t i) except 0:
         assert i < self.nr_strings
-        return <string_id_t>&self.views[i]
+        return <id_t>&self.views[i]

-    cpdef int add_views(self, list views) except -1:
-        self.nr_views += len(strings)
+    cpdef int add_view(self, unicode view) except -1:
+        self.nr_views += 1
         self.views = <char**>realloc(self.views, self.nr_views * sizeof(utf8_t))
-        cdef unicode view
-        cdef bytes utf8_string
-        for i, view in enumerate(strings):
-            view = string_views[i]
-            utf8_string = view.encode('utf8')
-            # Intern strings, allowing pointer comparison
-            utf8_string = intern(utf8_string)
-            self.views[i] = utf8_string
+        cdef bytes utf8_string = view.encode('utf8')
+        # Intern strings, allowing pointer comparison
+        utf8_string = intern(utf8_string)
+        self.views[self.nr_views - 1] = utf8_string

     cpdef bint check_flag(self, size_t flag_id) except *:
         """Access the value of one of the pre-computed boolean distribution features.

@@ -92,154 +85,7 @@ cdef class Lexeme:
         Meanings depend on the language-specific distributional features being loaded.
         The suggested features for latin-alphabet languages are: TODO
         """
-        assert flag_id < flags.MAX_FLAG
         return self.flags & (1 << flag_id)

     cpdef int set_flag(self, size_t flag_id) except -1:
-        assert flag_id < flags.MAX_FLAG
         self.flags |= (1 << flag_id)
-
-
-#
-#cdef class CasedWord(Word):
-#    def __cinit__(self, bytes string, list views):
-#        Word.__cinit__(self, string, string_views)
-#
-#    cpdef bint is_often_uppered(self) except *:
-#        '''Check the OFT_UPPER distributional flag for the word.
-#
-#        The OFT_UPPER flag records whether a lower-cased version of the word
-#        is found in all-upper case frequently in a large sample of text, where
-#        "frequently" is defined as P >= 0.95 (chosen for high mutual information for
-#        POS tagging).
-#
-#        Case statistics are estimated from a large text corpus. Estimates are read
-#        from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
-#
-#        >>> is_often_uppered(lookup(u'nato'))
-#        True
-#        >>> is_often_uppered(lookup(u'the'))
-#        False
-#        '''
-#        return self.dist_flags & (1 << OFT_UPPER)
-#
-#
-#    cpdef bint is_often_titled(self) except *:
-#        '''Check the OFT_TITLE distributional flag for the word.
-#
-#        The OFT_TITLE flag records whether a lower-cased version of the word
-#        is found title-cased (see string.istitle) frequently in a large sample of text,
-#        where "frequently" is defined as P >= 0.3 (chosen for high mutual information for
-#        POS tagging).
-#
-#        Case statistics are estimated from a large text corpus. Estimates are read
-#        from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
-#
-#        >>> is_oft_upper(lookup(u'john'))
-#        True
-#        >>> is_oft_upper(lookup(u'Bill'))
-#        False
-#        '''
-#        return self.dist_flags & (1 << OFT_TITLE)
-#
-#
-#    cpdef bint is_alpha(self) except *:
-#        """Check whether all characters in the word's string are alphabetic.
-#
-#        Should match the :py:func:`unicode.isalpha()` function.
-#
-#        >>> is_alpha(lookup(u'Hello'))
-#        True
-#        >>> is_alpha(lookup(u'العرب'))
-#        True
-#        >>> is_alpha(lookup(u'10'))
-#        False
-#        """
-#        return self.orth_flags & 1 << IS_ALPHA
-#
-#    cpdef bint is_digit(self) except *:
-#        """Check whether all characters in the word's string are numeric.
-#
-#        Should match the :py:func:`unicode.isdigit()` function.
-#
-#        >>> is_digit(lookup(u'10'))
-#        True
-#        >>> is_digit(lookup(u'๐'))
-#        True
-#        >>> is_digit(lookup(u'one'))
-#        False
-#        """
-#        return self.orth_flags & 1 << IS_DIGIT
-#
-#    cpdef bint is_punct(self) except *:
-#        """Check whether all characters belong to a punctuation unicode data category
-#        for a Lexeme ID.
-#
-#        >>> is_punct(lookup(u'.'))
-#        True
-#        >>> is_punct(lookup(u'⁒'))
-#        True
-#        >>> is_punct(lookup(u' '))
-#        False
-#        """
-#        return self.orth_flags & 1 << IS_PUNCT
-#
-#    cpdef bint is_space(self) except *:
-#        """Give the result of unicode.isspace() for a Lexeme ID.
-#
-#        >>> is_space(lookup(u'\\t'))
-#        True
-#        >>> is_space(lookup(u'<unicode space>'))
-#        True
-#        >>> is_space(lookup(u'Hi\\n'))
-#        False
-#        """
-#        return self.orth_flags & 1 << IS_SPACE
-#
-#    cpdef bint is_lower(self) except *:
-#        """Give the result of unicode.islower() for a Lexeme ID.
-#
-#        >>> is_lower(lookup(u'hi'))
-#        True
-#        >>> is_lower(lookup(<unicode>))
-#        True
-#        >>> is_lower(lookup(u'10'))
-#        False
-#        """
-#        return self.orth_flags & 1 << IS_LOWER
-#
-#    cpdef bint is_upper(self) except *:
-#        """Give the result of unicode.isupper() for a Lexeme ID.
-#
-#        >>> is_upper(lookup(u'HI'))
-#        True
-#        >>> is_upper(lookup(u'H10'))
-#        True
-#        >>> is_upper(lookup(u'10'))
-#        False
-#        """
-#        return self.orth_flags & 1 << IS_UPPER
-#
-#    cpdef bint is_title(self) except *:
-#        """Give the result of unicode.istitle() for a Lexeme ID.
-#
-#        >>> is_title(lookup(u'Hi'))
-#        True
-#        >>> is_title(lookup(u'Hi1'))
-#        True
-#        >>> is_title(lookup(u'1'))
-#        False
-#        """
-#        return self.orth_flags & 1 << IS_TITLE
-#
-#    cpdef bint is_ascii(self) except *:
-#        """Give the result of checking whether all characters in the string are ascii.
-#
-#        >>> is_ascii(lookup(u'Hi'))
-#        True
-#        >>> is_ascii(lookup(u' '))
-#        True
-#        >>> is_title(lookup(u'<unicode>'))
-#        False
-#        """
-#        return self.orth_flags & 1 << IS_ASCII
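The add_view change above interns each encoded view string so that equal views share a single object and can later be compared by pointer rather than by content. A small pure-Python illustration of why interning makes an identity check sufficient (sys.intern on text strings here; the Cython code above interns the encoded bytes instead):

    # Interned equal strings are the same object, so identity comparison suffices.
    import sys

    a = sys.intern(''.join(['ca', 'se']))   # built at runtime, then canonicalised
    b = sys.intern('case')
    print(a == b)   # True  -- equal contents
    print(a is b)   # True  -- same object after interning, i.e. a pointer check works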