* Begin refactor

This commit is contained in:
Matthew Honnibal 2015-07-07 14:00:07 +02:00
parent 52fd80c6c6
commit 6788c86b2f
3 changed files with 116 additions and 54 deletions

View File

@@ -1,9 +1,42 @@
===
API
===
=====
Usage
=====
Overview
--------
spaCy is a suite of natural language processing tools, arranged into
a pipeline. It is substantially more opinionated than most similar libraries,
which often give users the choice of multiple models that compute the same annotation.
spaCy's philosophy is to have just one --- the best one. In our view, redundant
options are really only useful to researchers, who need to replicate some prior
work exactly.
Being opinionated allows us to keep the library small, fast, and up-to-date. It
also makes the API much simpler. Normal usage proceeds in three steps (sketched below):
1. Loading resources;
2. Processing text;
3. Accessing annotations.
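A minimal sketch of these three steps (illustrative only; it assumes the default
English data is installed, and that each token exposes an `orth_` attribute
holding its string):

>>> from spacy.en import English
>>> nlp = English()                            # 1. Load resources
>>> tokens = nlp(u'Hello, beautiful world!')   # 2. Process text
>>> tokens[0].orth_                            # 3. Access annotations
u'Hello'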
This document is divided into three parts, to match these stages. We focus here
on the library's API. See also: Installation, Annotation Standards, Algorithmic Details,
and Benchmarks.
Loading Resources
-----------------
99% of the time, you will load spaCy's resources using a language pipeline class,
e.g. `spacy.en.English`. The pipeline class reads its data from a specified
directory on disk. By default, spaCy installs data into each language's
package directory, and loads it from there.
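If you want to check where that default data lives, the layout described in the
`English` docstring below implies a path like the following (a sketch, assuming
the standard install layout):

>>> from os import path
>>> import spacy.en
>>> print(path.join(path.dirname(spacy.en.__file__), 'data'))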
.. autoclass:: spacy.en.English
:members:
The class `spacy.en.English` is the main entry-point for the English pipeline
(other languages to come).
+------------+----------------------------------------+-------------+--------------------------+
| Attribute  | Type                                   | Attr API    | Notes                    |
@@ -24,7 +57,8 @@ API
+------------+----------------------------------------+-------------+--------------------------+
.. automethod:: spacy.en.English.__call__
.. autoclass:: spacy.en.English
:members:
.. autoclass:: spacy.tokens.Tokens
@@ -249,7 +283,7 @@ API
.. py:method:: load_morph_exceptions(self, exc: Dict[unicode, Dict])
.. py:class:: syntax.parser.GreedyParser(self, model_dir: unicode)
.. py:class:: syntax.parser.Parser(self, model_dir: unicode)
.. py:method:: __call__(self, tokens: spacy.tokens.Tokens) --> None
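The pipeline components can also be invoked one at a time. A hedged sketch (it
assumes `nlp = English()` as above, and that each component annotates the
`Tokens` object in place, which is what the `--> None` signature indicates):

>>> tokens = nlp.tokenizer(u'An example sentence.')
>>> nlp.tagger(tokens)
>>> nlp.parser(tokens)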

View File

@@ -44,32 +44,82 @@ parse_if_model_present = -1
class English(object):
"""The English NLP pipeline.
Provides a tokenizer, lexicon, part-of-speech tagger and parser.
Example:
Load data from default directory:
>>> nlp = English()
>>> nlp = English(data_dir=u'')
Load data from specified directory:
>>> nlp = English(data_dir=u'path/to/data_directory')
Disable (and avoid loading) parts of the processing pipeline:
>>> nlp = English(vectors=False, parser=False, tagger=False, entity=False)
Start with nothing loaded:
>>> nlp = English(data_dir=None)
Keyword args:
data_dir (unicode):
A path to a directory, from which to load the pipeline.
A path to a directory from which to load the pipeline;
or '', to load default; or None, to load nothing.
By default, data is installed within the spaCy package directory. So
if no data_dir is specified, spaCy attempts to load from a
directory named "data" that is a sibling of the spacy/en/__init__.py
file. You can find the location of this file by running:
Tokenizer (bool or callable):
True to load the default tokenizer; False to load none; or a callable
that takes (vocab, data_dir) and returns a tokenizer.
$ python -c "import spacy.en; print spacy.en.__file__"
Vectors (bool or callable):
True to load the packaged word vectors; False to skip loading them. The
value is passed through to the Vocab as its vectors argument.
To prevent any data files from being loaded, pass data_dir=None. This
is useful if you want to construct a lexicon, which you'll then save
for later loading.
Parser (bool or callable):
True to load the default dependency parser; False to load none; or a
callable that takes (strings, data_dir) and returns a parser.
Tagger (bool or callable):
True to load the default part-of-speech tagger; False to load none; or a
callable that takes (strings, data_dir) and returns a tagger.
Entity (bool or callable):
True to load the default named-entity recognizer; False to load none; or a
callable that takes (strings, data_dir) and returns an entity recognizer.
Senser (bool or callable):
True to load the default supersense tagger; False to load none; or a
callable that takes (strings, data_dir) and returns a supersense tagger.
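For example (an illustrative sketch only; MyTagger is a stand-in for any
user-supplied class or factory that accepts (strings, data_dir), the same
arguments the defaults receive):

>>> nlp = English(Parser=False, Senser=False)
>>> nlp = English(Tagger=MyTagger)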
"""
ParserTransitionSystem = ArcEager
EntityTransitionSystem = BiluoPushDown
def __init__(self, data_dir='', load_vectors=True):
def __init__(self, data_dir='', Tokenizer=True, Vectors=True, Parser=True,
Tagger=True, Entity=True, Senser=True, load_vectors=True):
if data_dir == '':
data_dir = LOCAL_DATA_DIR
self._data_dir = data_dir
# TODO: Deprecation warning
if load_vectors is False:
Vectors = False
self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
get_lex_props=get_lex_props, load_vectors=load_vectors)
get_lex_props=get_lex_props, vectors=Vectors)
if Tokenizer is True:
Tokenizer = tokenizer.Tokenizer
if Tagger is True:
Tagger = pos.EnPosTagger
if Parser is True:
transition_system = self.ParserTransitionSystem
Parser = lambda s, d: parser.Parser(s, d, transition_system)
if Entity is True:
transition_system = self.EntityTransitionSystem
Entity = lambda s, d: parser.Parser(s, d, transition_system)
if Senser is True:
Senser = wsd.SuperSenseTagger
self.tokenizer = Tokenizer(self.vocab, data_dir) if Tokenizer else None
self.tagger = Tagger(self.vocab.strings, data_dir) if Tagger else None
self.parser = Parser(self.vocab.strings, data_dir) if Parser else None
self.entity = Entity(self.vocab.strings, data_dir) if Entity else None
self.senser = Senser(self.vocab.strings, data_dir) if Senser else None
self._data_dir = data_dir
tag_names = list(POS_TAGS.keys())
tag_names.sort()
if data_dir is None:
@@ -77,53 +127,22 @@ class English(object):
prefix_re = None
suffix_re = None
infix_re = None
self.has_parser_model = False
self.has_tagger_model = False
self.has_entity_model = False
else:
tok_data_dir = path.join(data_dir, 'tokenizer')
tok_rules, prefix_re, suffix_re, infix_re = read_lang_data(tok_data_dir)
prefix_re = re.compile(prefix_re)
suffix_re = re.compile(suffix_re)
infix_re = re.compile(infix_re)
self.has_parser_model = path.exists(path.join(self._data_dir, 'deps'))
self.has_tagger_model = path.exists(path.join(self._data_dir, 'pos'))
self.has_entity_model = path.exists(path.join(self._data_dir, 'ner'))
self.tokenizer = Tokenizer(self.vocab, tok_rules, prefix_re,
suffix_re, infix_re,
POS_TAGS, tag_names)
self.mwe_merger = RegexMerger([
('IN', 'O', regexes.MW_PREPOSITIONS_RE),
('CD', 'TIME', regexes.TIME_RE),
('NNP', 'DATE', regexes.DAYS_RE),
('CD', 'MONEY', regexes.MONEY_RE)])
# These are lazy-loaded
self._tagger = None
self._parser = None
self._entity = None
@property
def tagger(self):
if self._tagger is None:
self._tagger = EnPosTagger(self.vocab.strings, self._data_dir)
return self._tagger
@property
def parser(self):
if self._parser is None:
self._parser = Parser(self.vocab.strings,
path.join(self._data_dir, 'deps'),
self.ParserTransitionSystem)
return self._parser
@property
def entity(self):
if self._entity is None:
self._entity = Parser(self.vocab.strings,
path.join(self._data_dir, 'ner'),
self.EntityTransitionSystem)
return self._entity
def __call__(self, text, tag=True, parse=parse_if_model_present,
entity=parse_if_model_present, merge_mwes=False):
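# A hypothetical usage sketch for the sentinel default above (assumption: the
# -1 value of parse_if_model_present means "parse only if the model files are
# present", which is how the has_*_model flags read):
#
#     tokens = nlp(u'Some text.')                # parse/entity only if models exist
#     tokens = nlp(u'Some text.', parse=True)    # require the parser
#     tokens = nlp(u'Some text.', parse=False)   # skip parsing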

View File

@@ -20,8 +20,7 @@ from .tokens import Tokens
cdef class Tokenizer:
def __init__(self, Vocab vocab, rules, prefix_re, suffix_re, infix_re,
pos_tags, tag_names):
def __init__(self, Vocab vocab, rules, prefix_re, suffix_re, infix_re, pos_tags):
self.mem = Pool()
self._cache = PreshMap()
self._specials = PreshMap()
@@ -29,7 +28,17 @@ cdef class Tokenizer:
self._suffix_re = suffix_re
self._infix_re = infix_re
self.vocab = vocab
self._load_special_tokenization(rules, pos_tags, tag_names)
self._load_special_tokenization(rules, pos_tags)
@classmethod
def from_dir(cls, Vocab vocab, directory, pos_tags=None):
# NB: relies on path, re and read_lang_data being imported at module level
# (their imports fall outside this hunk).
tok_data_dir = path.join(directory, 'tokenizer')
rules, prefix_re, suffix_re, infix_re = read_lang_data(tok_data_dir)
prefix_re = re.compile(prefix_re)
suffix_re = re.compile(suffix_re)
infix_re = re.compile(infix_re)
return cls(vocab, rules, prefix_re, suffix_re, infix_re,
pos_tags)
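# Hypothetical usage sketch (not part of this diff; POS_TAGS stands for whatever
# tag map the caller already has, e.g. the one spacy.en passes in):
#
#     tokenizer = Tokenizer.from_dir(vocab, data_dir, POS_TAGS)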
cpdef Tokens tokens_from_list(self, list strings):
cdef int length = sum([len(s) for s in strings])
@@ -224,7 +233,7 @@ cdef class Tokenizer:
match = self._suffix_re.search(string)
return (match.end() - match.start()) if match is not None else 0
def _load_special_tokenization(self, object rules, object tag_map, object tag_names):
def _load_special_tokenization(self, object rules, object tag_map):
'''Add a special-case tokenization rule.
'''
cdef int i