* Begin refactor

Matthew Honnibal 2015-07-07 14:00:07 +02:00
parent 52fd80c6c6
commit 6788c86b2f
3 changed files with 116 additions and 54 deletions

View File

@@ -1,9 +1,42 @@
-===
-API
-===
+=====
+Usage
+=====
+
+Overview
+--------
+
+spaCy is a suite of natural language processing tools, arranged into a
+pipeline. It is substantially more opinionated than most similar libraries,
+which often give users a choice of several models that compute the same
+annotation. spaCy's philosophy is to have just one: the best one. Our
+perspective is that the redundant options are really only useful to
+researchers, who need to replicate some prior work exactly.
+
+Being opinionated allows us to keep the library small, fast and up to date.
+It also makes the API much simpler. Normal usage proceeds in three steps:
+
+1. Loading resources;
+2. Processing text;
+3. Accessing annotations.
+
+This document is divided into three parts, to match these stages. We focus
+here on the library's API. See also: Installation, Annotation Standards,
+Algorithmic Details, and Benchmarks.
+
+Loading Resources
+-----------------
+
+99\% of the time, you will load spaCy's resources using a language pipeline
+class, e.g. ``spacy.en.English``. The pipeline class reads its data from disk,
+from a specified directory. By default, spaCy installs data into each
+language's package directory, and loads it from there.
 
 .. autoclass:: spacy.en.English
+    :members:
+
+The class ``spacy.en.English`` is the main entry point for the English
+pipeline (other languages to come).
 
 +------------+----------------------------------------+-------------+--------------------------+
 | Attribute  | Type                                   | Attr API    | Notes                    |
@@ -24,7 +57,8 @@ API
 +------------+----------------------------------------+-------------+--------------------------+
 
-.. automethod:: spacy.en.English.__call__
+.. autoclass:: spacy.en.English
+    :members:
 
 .. autoclass:: spacy.tokens.Tokens
@@ -249,7 +283,7 @@ API
 .. py:method:: load_morph_exceptions(self, exc: Dict[unicode, Dict])
 
-.. py:class:: syntax.parser.GreedyParser(self, model_dir: unicode)
+.. py:class:: syntax.parser.Parser(self, model_dir: unicode)
 
 .. py:method:: __call__(self, tokens: spacy.tokens.Tokens) --> None
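
A minimal sketch of the three-step flow described above, assuming data is
installed in the default location; iterating over the returned Tokens object
and the `orth_` attribute name are assumptions, not taken from this diff:

    # Hypothetical usage sketch (not part of this commit)
    from spacy.en import English

    nlp = English()                                    # 1. Load resources
    tokens = nlp(u'Some text to process.', tag=True)   # 2. Process text
    for token in tokens:                               # 3. Access annotations
        print(token.orth_)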

View File

@@ -44,32 +44,82 @@ parse_if_model_present = -1
 class English(object):
     """The English NLP pipeline.
 
-    Provides a tokenizer, lexicon, part-of-speech tagger and parser.
+    Example:
+        Load data from default directory:
+
+        >>> nlp = English()
+        >>> nlp = English(data_dir=u'')
+
+        Load data from specified directory:
+
+        >>> nlp = English(data_dir=u'path/to/data_directory')
+
+        Disable (and avoid loading) parts of the processing pipeline:
+
+        >>> nlp = English(Vectors=False, Parser=False, Tagger=False, Entity=False)
+
+        Start with nothing loaded:
+
+        >>> nlp = English(data_dir=None)
 
     Keyword args:
         data_dir (unicode):
-            A path to a directory, from which to load the pipeline.
-
-            By default, data is installed within the spaCy package directory. So
-            if no data_dir is specified, spaCy attempts to load from a
-            directory named "data" that is a sibling of the spacy/en/__init__.py
-            file. You can find the location of this file by running:
-
-                $ python -c "import spacy.en; print spacy.en.__file__"
-
-            To prevent any data files from being loaded, pass data_dir=None. This
-            is useful if you want to construct a lexicon, which you'll then save
-            for later loading.
+            A path to a directory from which to load the pipeline;
+            or '', to load the default; or None, to load nothing.
+
+        Tokenizer (bool or callable):
+            desc
+
+        Vectors (bool or callable):
+            desc
+
+        Parser (bool or callable):
+            desc
+
+        Tagger (bool or callable):
+            desc
+
+        Entity (bool or callable):
+            desc
+
+        Senser (bool or callable):
+            desc
     """
     ParserTransitionSystem = ArcEager
     EntityTransitionSystem = BiluoPushDown
 
-    def __init__(self, data_dir='', load_vectors=True):
+    def __init__(self, data_dir='', Tokenizer=True, Vectors=True, Parser=True,
+                 Tagger=True, Entity=True, Senser=True, load_vectors=True):
         if data_dir == '':
             data_dir = LOCAL_DATA_DIR
-        self._data_dir = data_dir
+        # TODO: Deprecation warning
+        if load_vectors is False:
+            Vectors = False
         self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
-                           get_lex_props=get_lex_props, load_vectors=load_vectors)
+                           get_lex_props=get_lex_props, vectors=Vectors)
+        if Tokenizer is True:
+            Tokenizer = tokenizer.Tokenizer
+        if Tagger is True:
+            Tagger = pos.EnPosTagger
+        if Parser is True:
+            transition_system = self.ParserTransitionSystem
+            # Bind via default argument so each factory keeps its own transition system
+            Parser = lambda s, d, ts=transition_system: parser.Parser(s, d, ts)
+        if Entity is True:
+            transition_system = self.EntityTransitionSystem
+            Entity = lambda s, d, ts=transition_system: parser.Parser(s, d, ts)
+        if Senser is True:
+            Senser = wsd.SuperSenseTagger
+        self.tokenizer = Tokenizer(self.vocab, data_dir) if Tokenizer else None
+        self.tagger = Tagger(self.vocab.strings, data_dir) if Tagger else None
+        self.parser = Parser(self.vocab.strings, data_dir) if Parser else None
+        self.entity = Entity(self.vocab.strings, data_dir) if Entity else None
+        self.senser = Senser(self.vocab.strings, data_dir) if Senser else None
+        self._data_dir = data_dir
 
         tag_names = list(POS_TAGS.keys())
         tag_names.sort()
         if data_dir is None:
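
The constructor in the hunk above treats each pipeline component as a
boolean-or-callable keyword: True loads the default, False or None skips it,
and a callable is used as a factory. A short sketch of what that enables; the
MyTagger class is hypothetical:

    from spacy.en import English

    # Skip loading the components you don't need:
    nlp = English(Parser=False, Entity=False, Senser=False)

    # Or supply your own factory. Per the code above, the Tokenizer factory is
    # called with (vocab, data_dir); the other components with (vocab.strings, data_dir).
    def my_tagger_factory(strings, data_dir):
        return MyTagger(strings, data_dir)  # hypothetical component

    nlp = English(Tagger=my_tagger_factory)
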
@@ -77,53 +127,22 @@ class English(object):
             prefix_re = None
             suffix_re = None
             infix_re = None
-            self.has_parser_model = False
-            self.has_tagger_model = False
-            self.has_entity_model = False
         else:
             tok_data_dir = path.join(data_dir, 'tokenizer')
             tok_rules, prefix_re, suffix_re, infix_re = read_lang_data(tok_data_dir)
             prefix_re = re.compile(prefix_re)
             suffix_re = re.compile(suffix_re)
             infix_re = re.compile(infix_re)
-            self.has_parser_model = path.exists(path.join(self._data_dir, 'deps'))
-            self.has_tagger_model = path.exists(path.join(self._data_dir, 'pos'))
-            self.has_entity_model = path.exists(path.join(self._data_dir, 'ner'))
 
         self.tokenizer = Tokenizer(self.vocab, tok_rules, prefix_re,
                                    suffix_re, infix_re,
                                    POS_TAGS, tag_names)
 
         self.mwe_merger = RegexMerger([
             ('IN', 'O', regexes.MW_PREPOSITIONS_RE),
             ('CD', 'TIME', regexes.TIME_RE),
             ('NNP', 'DATE', regexes.DAYS_RE),
             ('CD', 'MONEY', regexes.MONEY_RE)])
 
-        # These are lazy-loaded
-        self._tagger = None
-        self._parser = None
-        self._entity = None
-
-    @property
-    def tagger(self):
-        if self._tagger is None:
-            self._tagger = EnPosTagger(self.vocab.strings, self._data_dir)
-        return self._tagger
-
-    @property
-    def parser(self):
-        if self._parser is None:
-            self._parser = Parser(self.vocab.strings,
-                                  path.join(self._data_dir, 'deps'),
-                                  self.ParserTransitionSystem)
-        return self._parser
-
-    @property
-    def entity(self):
-        if self._entity is None:
-            self._entity = Parser(self.vocab.strings,
-                                  path.join(self._data_dir, 'ner'),
-                                  self.EntityTransitionSystem)
-        return self._entity
-
     def __call__(self, text, tag=True, parse=parse_if_model_present,
                  entity=parse_if_model_present, merge_mwes=False):
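
The __call__ signature kept as context at the end of this hunk is the
processing entry point; tag, parse, entity and merge_mwes control which
annotations are computed. A hedged sketch, with illustrative sentences only:

    from spacy.en import English

    nlp = English()
    tokens = nlp(u'The default flags tag, and parse if a model is present.')
    tokens = nlp(u'Tag only.', parse=False, entity=False)
    tokens = nlp(u'Also merge multi-word expressions.', merge_mwes=True)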

View File

@@ -20,8 +20,7 @@ from .tokens import Tokens


 cdef class Tokenizer:
-    def __init__(self, Vocab vocab, rules, prefix_re, suffix_re, infix_re,
-                 pos_tags, tag_names):
+    def __init__(self, Vocab vocab, rules, prefix_re, suffix_re, infix_re, pos_tags):
         self.mem = Pool()
         self._cache = PreshMap()
         self._specials = PreshMap()
@@ -29,7 +28,17 @@ cdef class Tokenizer:
         self._suffix_re = suffix_re
         self._infix_re = infix_re
         self.vocab = vocab
-        self._load_special_tokenization(rules, pos_tags, tag_names)
+        self._load_special_tokenization(rules, pos_tags)
+
+    @classmethod
+    def from_dir(cls, Vocab vocab, directory, pos_tags):
+        data_dir = path.join(directory, 'tokenizer')
+        rules, prefix_re, suffix_re, infix_re = read_lang_data(data_dir)
+        prefix_re = re.compile(prefix_re)
+        suffix_re = re.compile(suffix_re)
+        infix_re = re.compile(infix_re)
+        return cls(vocab, rules, prefix_re, suffix_re, infix_re,
+                   pos_tags)
 
     cpdef Tokens tokens_from_list(self, list strings):
         cdef int length = sum([len(s) for s in strings])
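
The new from_dir constructor bundles the read_lang_data / re.compile setup that
English.__init__ currently performs inline. A hedged usage sketch, assuming a
Vocab instance and the POS_TAGS tag map are already available (for example via
an English pipeline), and that the import paths and data directory are as
guessed here:

    from spacy.tokenizer import Tokenizer       # import path assumed
    from spacy.en import English, POS_TAGS      # POS_TAGS location assumed

    nlp = English(Parser=False, Tagger=False, Entity=False, Senser=False)
    tokenizer = Tokenizer.from_dir(nlp.vocab, u'path/to/data', POS_TAGS)
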
@@ -224,7 +233,7 @@ cdef class Tokenizer:
         match = self._suffix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0
 
-    def _load_special_tokenization(self, object rules, object tag_map, object tag_names):
+    def _load_special_tokenization(self, object rules, object tag_map):
         '''Add a special-case tokenization rule.
         '''
         cdef int i
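
_load_special_tokenization installs the special-case rules handed to the
constructor, while tokens_from_list (shown earlier in this file) accepts text
that is already split into strings. A small sketch, reusing the tokenizer built
in the previous example:

    # Bypass the prefix/suffix/infix handling for pre-split input
    tokens = tokenizer.tokens_from_list([u'Hello', u',', u'world', u'!'])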