mirror of https://github.com/explosion/spaCy.git
commit 45a22d6b2c (parent c282e6d5fb)

    Docs coming together
@@ -9,10 +9,14 @@ spaCy NLP Tokenizer and Lexicon
 .. toctree::
     :maxdepth: 3
 
-    guide/overview
-    guide/install
+    guide/overview.rst
+    guide/install.rst
 
     api/index.rst
+    modules/index.rst
+
+
+
 
 Source (GitHub)
 ----------------
@@ -4,4 +4,4 @@ cimport cython
 
 
 cdef class English(Language):
-    cpdef int _split_one(self, unicode word)
+    cdef int _split_one(self, unicode word)
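Several hunks in this commit tighten `cpdef` declarations to `cdef`, as above. The difference matters for the API surface: a `cpdef` method gets both a Python wrapper and a C entry point, while a `cdef` method is C-only and invisible from Python. A minimal illustrative Cython sketch (hypothetical class, not from the spaCy sources):

    # Hypothetical class illustrating the cpdef -> cdef distinction.
    cdef class Splitter:
        cpdef list tokenize(self, unicode text):
            # cpdef: callable from Python AND as a fast C call from
            # other Cython code -- suitable for the public API.
            return self._split(text)

        cdef list _split(self, unicode text):
            # cdef: C-level only. Python callers cannot see this method,
            # so it is effectively private and skips Python dispatch.
            return text.split()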
spacy/en.pyx (16 lines changed)
@@ -5,22 +5,21 @@ scheme in several important respects:
 
 * Whitespace is added as tokens, except for single spaces. e.g.,
 
-    >>> [w.string for w in tokenize(u'\\nHello \\tThere')]
+    >>> [w.string for w in EN.tokenize(u'\\nHello \\tThere')]
     [u'\\n', u'Hello', u' ', u'\\t', u'There']
 
 * Contractions are normalized, e.g.
 
-    >>> [w.string for w in u"isn't ain't won't he's")]
+    >>> [w.string for w in EN.tokenize(u"isn't ain't won't he's")]
     [u'is', u'not', u'are', u'not', u'will', u'not', u'he', u"__s"]
 
 * Hyphenated words are split, with the hyphen preserved, e.g.:
 
-    >>> [w.string for w in tokenize(u'New York-based')]
+    >>> [w.string for w in EN.tokenize(u'New York-based')]
     [u'New', u'York', u'-', u'based']
 
 Other improvements:
 
-* Full unicode support
 * Email addresses, URLs, European-formatted dates and other numeric entities not
   found in the PTB are tokenized correctly
 * Heuristic handling of word-final periods (PTB expects sentence boundary detection
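The whitespace rule in the first doctest is easy to misread: a lone single space vanishes, but the space in `' \t'` survives as a token. A pure-Python sketch of one rule that reproduces the documented output (a guess at the behaviour, not spaCy's implementation):

    import re

    def split_with_whitespace(text):
        # Guessed rule: a lone single space separates words silently; any
        # other whitespace run contributes one token per character.
        tokens = []
        for run in re.findall(r'\s+|\S+', text):
            if run == u' ':
                continue
            elif run.strip() == u'':
                tokens.extend(run)   # e.g. u' \t' -> [u' ', u'\t']
            else:
                tokens.append(run)
        return tokens

    print(split_with_whitespace(u'\nHello \tThere'))
    # [u'\n', u'Hello', u' ', u'\t', u'There']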
@@ -81,6 +80,13 @@ CAN_PRT = NR_FLAGS; NR_FLAGS += 1
 
 
 cdef class English(Language):
+    """English tokenizer, tightly coupled to lexicon.
+
+    Attributes:
+        name (unicode): The two letter code used by Wikipedia for the language.
+        lexicon (Lexicon): The lexicon. Exposes the lookup method.
+    """
+
     def __cinit__(self, name):
         flag_funcs = [0 for _ in range(NR_FLAGS)]
 
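Going by the new docstring, usage would presumably look like the following hypothetical session (the constructor call and the `EN` name are inferred from `__cinit__(self, name)` and the doctests earlier in the file, not confirmed by this diff):

    EN = English(u'en')          # 'en': the two-letter Wikipedia code
    print(EN.name)               # u'en'
    lexeme = EN.lexicon.lookup(u'tokens')   # the lexicon exposes lookup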
@@ -110,7 +116,7 @@ cdef class English(Language):
 
         Language.__init__(self, name, flag_funcs)
 
-    cpdef int _split_one(self, unicode word):
+    cdef int _split_one(self, unicode word):
        cdef size_t length = len(word)
        cdef int i = 0
        if word.startswith("'s") or word.startswith("'S"):
@@ -4,21 +4,22 @@ from spacy.word cimport Lexeme
 
 
 cdef class Lexicon:
-    cdef list string_features
-    cdef list flag_features
+    cpdef Lexeme lookup(self, unicode string)
 
     cdef dict _dict
 
-    cpdef Lexeme lookup(self, unicode string)
+    cdef list _string_features
+    cdef list _flag_features
 
 
 cdef class Language:
-    cdef object name
+    cdef unicode name
     cdef dict cache
     cpdef readonly Lexicon lexicon
 
     cpdef list tokenize(self, unicode text)
+    cpdef Lexeme lookup(self, unicode text)
 
     cdef list _tokenize(self, unicode string)
-    cpdef list _split(self, unicode string)
-    cpdef int _split_one(self, unicode word)
+    cdef list _split(self, unicode string)
+    cdef int _split_one(self, unicode word)
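The net effect of this header change is that only `tokenize` and the new `lookup` remain visible from Python; everything underscore-prefixed is now C-level only. A hypothetical interactive session (assuming an `EN` instance as in the docstrings):

    EN.tokenize(u'New York-based')   # OK: cpdef, public API
    EN.lookup(u'York')               # OK: cpdef, newly declared here
    EN._split(u'York')               # AttributeError: cdef-only, hidden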
@@ -41,7 +41,7 @@ cdef class Language:
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
                                string_features, flag_features)
-        self.load_special_tokenization(rules)
+        self._load_special_tokenization(rules)
 
     cpdef list tokenize(self, unicode string):
         """Tokenize a string.
@@ -75,6 +75,17 @@ cdef class Language:
         assert tokens
         return tokens
 
+    cpdef Lexeme lookup(self, unicode string):
+        """Retrieve (or create, if not found) a Lexeme for a string, and return it.
+
+        Args:
+            string (unicode): The string to be looked up. Must be unicode, not bytes.
+
+        Returns:
+            lexeme (Lexeme): A reference to a lexical type.
+        """
+        return self.lexicon.lookup(string)
+
     cdef list _tokenize(self, unicode string):
         if string in self.cache:
             return self.cache[string]
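Since `Language.lookup` simply delegates to the lexicon, and the lexicon memoizes every string it sees (see the `Lexicon.lookup` hunk further down), repeated lookups return the identical object. Hypothetical session, assuming `EN` as above:

    first = EN.lookup(u'hello')    # delegates to EN.lexicon.lookup(u'hello')
    again = EN.lookup(u'hello')    # served from the lexicon's _dict this time
    assert first is again          # one Lexeme per lexical type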
@@ -85,7 +96,7 @@ cdef class Language:
             self.cache[string] = lexemes
         return lexemes
 
-    cpdef list _split(self, unicode string):
+    cdef list _split(self, unicode string):
         """Find how to split a contiguous span of non-space characters into substrings.
 
         This method calls find_split repeatedly. Most languages will want to
@@ -107,10 +118,10 @@ cdef class Language:
             string = string[split:]
         return substrings
 
-    cpdef int _split_one(self, unicode word):
+    cdef int _split_one(self, unicode word):
         return len(word)
 
-    def load_special_tokenization(self, token_rules):
+    def _load_special_tokenization(self, token_rules):
         '''Load special-case tokenization rules.
 
         Loads special-case tokenization rules into the Language.cache cache,
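The fragments above show the whole splitting protocol: `_split` repeatedly asks `_split_one` how many leading characters form the next token, slices them off, and loops. A pure-Python sketch of that loop, using the base-class rule visible in the hunk (`English._split_one` overrides it with real heuristics):

    def _split_one(word):
        return len(word)        # base Language rule: the whole span is one token

    def _split(string):
        substrings = []
        while string:
            split = _split_one(string)
            substrings.append(string[:split])
            string = string[split:]
        return substrings

    print(_split(u'York-based'))   # [u'York-based'] under the base rule;
                                   # English would give [u'York', u'-', u'based']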
@@ -132,14 +143,14 @@ cdef class Language:
 cdef class Lexicon:
     def __cinit__(self, words, probs, clusters, case_stats, tag_stats,
                   string_features, flag_features):
-        self.flag_features = flag_features
-        self.string_features = string_features
+        self._flag_features = flag_features
+        self._string_features = string_features
         self._dict = {}
         cdef Lexeme word
         for string in words:
             word = Lexeme(string, probs.get(string, 0.0), clusters.get(string, 0),
                           case_stats.get(string, {}), tag_stats.get(string, {}),
-                          self.string_features, self.flag_features)
+                          self._string_features, self._flag_features)
             self._dict[string] = word
 
     cpdef Lexeme lookup(self, unicode string):
@@ -155,7 +166,7 @@ cdef class Lexicon:
         if string in self._dict:
             return self._dict[string]
 
-        cdef Lexeme word = Lexeme(string, 0, 0, {}, {}, self.string_features,
-                                  self.flag_features)
+        cdef Lexeme word = Lexeme(string, 0, 0, {}, {}, self._string_features,
+                                  self._flag_features)
         self._dict[string] = word
         return word
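One consequence of the lookup path above worth noting: a miss creates a blank Lexeme (zero probability, empty stats) and memoizes it, so later lookups of the same string return the identical object. Hypothetical session, assuming a `lexicon` instance:

    novel = lexicon.lookup(u'zxqv')          # miss: a blank Lexeme is created
    assert lexicon.lookup(u'zxqv') is novel  # and cached in _dict for reuse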