* Docs coming together

Matthew Honnibal 2014-08-29 01:59:23 +02:00
parent c282e6d5fb
commit 45a22d6b2c
5 changed files with 47 additions and 25 deletions

View File

@@ -9,10 +9,14 @@ spaCy NLP Tokenizer and Lexicon
 .. toctree::
    :maxdepth: 3

-   guide/overview
-   guide/install
+   guide/overview.rst
+   guide/install.rst
    api/index.rst
+   modules/index.rst

 Source (GitHub)
 ----------------

View File

@@ -4,4 +4,4 @@ cimport cython
 cdef class English(Language):
-    cpdef int _split_one(self, unicode word)
+    cdef int _split_one(self, unicode word)
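
A note on the change above: in Cython, a cpdef method gets a Python-visible wrapper as well as a C entry point, while a cdef method can be called only from Cython/C code, so this declaration change makes _split_one a purely internal hook. Below is a minimal sketch of that distinction, using hypothetical names (Splitter, split) rather than spaCy's actual classes:

    # Hypothetical sketch, not spaCy code: cpdef methods are callable from
    # Python, cdef methods are reachable only from Cython/C.
    cdef class Splitter:
        cpdef list split(self, unicode chunk):
            # Public entry point: visible to Python callers.
            cdef list pieces = []
            cdef int n
            while chunk:
                n = self._split_one(chunk)
                pieces.append(chunk[:n])
                chunk = chunk[n:]
            return pieces

        cdef int _split_one(self, unicode word):
            # Internal hook: after a cpdef -> cdef change like the one above,
            # Python code can no longer call or override this method.
            return len(word)

The same reasoning applies to the _split and _split_one declarations further down in this commit.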

View File

@@ -5,22 +5,21 @@ scheme in several important respects:
 * Whitespace is added as tokens, except for single spaces. e.g.,

-    >>> [w.string for w in tokenize(u'\\nHello \\tThere')]
+    >>> [w.string for w in EN.tokenize(u'\\nHello \\tThere')]
     [u'\\n', u'Hello', u' ', u'\\t', u'There']

 * Contractions are normalized, e.g.

-    >>> [w.string for w in u"isn't ain't won't he's")]
+    >>> [w.string for w in EN.tokenize(u"isn't ain't won't he's")]
     [u'is', u'not', u'are', u'not', u'will', u'not', u'he', u"__s"]

 * Hyphenated words are split, with the hyphen preserved, e.g.:

-    >>> [w.string for w in tokenize(u'New York-based')]
+    >>> [w.string for w in EN.tokenize(u'New York-based')]
     [u'New', u'York', u'-', u'based']

 Other improvements:

-* Full unicode support
 * Email addresses, URLs, European-formatted dates and other numeric entities not
   found in the PTB are tokenized correctly
 * Heuristic handling of word-final periods (PTB expects sentence boundary detection
@@ -81,6 +80,13 @@ CAN_PRT = NR_FLAGS; NR_FLAGS += 1
 cdef class English(Language):
+    """English tokenizer, tightly coupled to lexicon.
+
+    Attributes:
+        name (unicode): The two letter code used by Wikipedia for the language.
+        lexicon (Lexicon): The lexicon. Exposes the lookup method.
+    """
     def __cinit__(self, name):
         flag_funcs = [0 for _ in range(NR_FLAGS)]
@@ -110,7 +116,7 @@ cdef class English(Language):
         Language.__init__(self, name, flag_funcs)

-    cpdef int _split_one(self, unicode word):
+    cdef int _split_one(self, unicode word):
         cdef size_t length = len(word)
         cdef int i = 0
         if word.startswith("'s") or word.startswith("'S"):
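
The docstring examples above assume a ready-made English instance named EN. As a rough usage sketch only (the import path and the constructor call are assumptions, not something this commit documents), the behaviour the docstring describes would look like:

    # Usage sketch only; module path and instance construction are assumed.
    from spacy.en import English

    EN = English('en')   # name argument as in __cinit__(self, name)

    # Whitespace other than single spaces comes back as tokens.
    [w.string for w in EN.tokenize(u'\nHello \tThere')]

    # Contractions are normalized into their expanded forms.
    [w.string for w in EN.tokenize(u"isn't")]            # -> [u'is', u'not'] per the docstring

    # Hyphenated words are split with the hyphen preserved.
    [w.string for w in EN.tokenize(u'New York-based')]   # -> [u'New', u'York', u'-', u'based']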

View File

@@ -4,21 +4,22 @@ from spacy.word cimport Lexeme
 cdef class Lexicon:
-    cdef list string_features
-    cdef list flag_features
-    cdef dict _dict
     cpdef Lexeme lookup(self, unicode string)
+
+    cdef dict _dict
+    cdef list _string_features
+    cdef list _flag_features

 cdef class Language:
-    cdef object name
+    cdef unicode name
     cdef dict cache
     cpdef readonly Lexicon lexicon

     cpdef list tokenize(self, unicode text)
+    cpdef Lexeme lookup(self, unicode text)

     cdef list _tokenize(self, unicode string)
-    cpdef list _split(self, unicode string)
-    cpdef int _split_one(self, unicode word)
+    cdef list _split(self, unicode string)
+    cdef int _split_one(self, unicode word)
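
With these declarations, Python callers see only the cpdef surface (tokenize, lookup, and the readonly lexicon attribute), while the underscore-prefixed cdef attributes and methods stay reachable from other Cython modules that cimport these classes. A hedged sketch of that Cython-level path, with the spacy.lang module path inferred rather than documented here:

    # Sketch only: cdef-level internals stay usable from Cython via cimport,
    # even though they are invisible to plain Python code.
    from spacy.lang cimport Language   # module path assumed

    cdef list split_in_cython(Language lang, unicode chunk):
        # _split is cdef-only now, so this call compiles here, but the
        # equivalent lang._split(chunk) would fail from Python.
        return lang._split(chunk)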

View File

@@ -41,7 +41,7 @@ cdef class Language:
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
                                string_features, flag_features)
-        self.load_special_tokenization(rules)
+        self._load_special_tokenization(rules)

     cpdef list tokenize(self, unicode string):
         """Tokenize a string.
@@ -75,6 +75,17 @@ cdef class Language:
         assert tokens
         return tokens

+    cpdef Lexeme lookup(self, unicode string):
+        """Retrieve (or create, if not found) a Lexeme for a string, and return it.
+
+        Args:
+            string (unicode): The string to be looked up. Must be unicode, not bytes.
+
+        Returns:
+            lexeme (Lexeme): A reference to a lexical type.
+        """
+        return self.lexicon.lookup(string)
+
     cdef list _tokenize(self, unicode string):
         if string in self.cache:
             return self.cache[string]
@@ -85,7 +96,7 @@ cdef class Language:
         self.cache[string] = lexemes
         return lexemes

-    cpdef list _split(self, unicode string):
+    cdef list _split(self, unicode string):
         """Find how to split a contiguous span of non-space characters into substrings.

         This method calls find_split repeatedly. Most languages will want to
@@ -107,10 +118,10 @@ cdef class Language:
             string = string[split:]
         return substrings

-    cpdef int _split_one(self, unicode word):
+    cdef int _split_one(self, unicode word):
         return len(word)

-    def load_special_tokenization(self, token_rules):
+    def _load_special_tokenization(self, token_rules):
        '''Load special-case tokenization rules.

        Loads special-case tokenization rules into the Language.cache cache,
@@ -132,14 +143,14 @@ cdef class Language:
 cdef class Lexicon:
     def __cinit__(self, words, probs, clusters, case_stats, tag_stats,
                   string_features, flag_features):
-        self.flag_features = flag_features
-        self.string_features = string_features
+        self._flag_features = flag_features
+        self._string_features = string_features
         self._dict = {}
         cdef Lexeme word
         for string in words:
             word = Lexeme(string, probs.get(string, 0.0), clusters.get(string, 0),
                           case_stats.get(string, {}), tag_stats.get(string, {}),
-                          self.string_features, self.flag_features)
+                          self._string_features, self._flag_features)
             self._dict[string] = word

     cpdef Lexeme lookup(self, unicode string):
@@ -155,7 +166,7 @@ cdef class Lexicon:
         if string in self._dict:
             return self._dict[string]
-        cdef Lexeme word = Lexeme(string, 0, 0, {}, {}, self.string_features,
-                                  self.flag_features)
+        cdef Lexeme word = Lexeme(string, 0, 0, {}, {}, self._string_features,
+                                  self._flag_features)
         self._dict[string] = word
         return word
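
The lookup method added to Language above simply delegates to Lexicon.lookup, and Lexicon.lookup memoizes each Lexeme in _dict, so repeated lookups of the same string return the same object. A short behavioural sketch (instance name and import path assumed, as before):

    # Behavioural sketch of the lookup path added in this commit.
    from spacy.en import English

    EN = English('en')

    dog = EN.lookup(u'dog')            # delegates to EN.lexicon.lookup(u'dog')
    assert dog is EN.lookup(u'dog')    # Lexicon._dict caches one Lexeme per string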