Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-26 01:46:28 +03:00
* Begin refactor

This commit is contained in:
parent 52fd80c6c6
commit 6788c86b2f

@@ -1,9 +1,42 @@
-===
-API
-===
+=====
+Usage
+=====
+
+Overview
+--------
+
+spaCy is a suite of natural language processing tools, arranged into a
+pipeline. It is substantially more opinionated than most similar libraries,
+which often give users the choice of multiple models that compute the same
+annotation. spaCy's philosophy is to have just one --- the best one. Our
+perspective is that the redundant options are really only useful to
+researchers, who need to replicate some prior work exactly.
+
+Being opinionated allows us to keep the library small, fast, and up-to-date.
+It also makes the API much simpler. Normal usage proceeds in three steps:
+
+1. Loading resources;
+2. Processing text;
+3. Accessing annotations.
+
+This document is divided into three parts, to match these stages. We focus
+here on the library's API. See also: Installation, Annotation Standards,
+Algorithmic Details, and Benchmarks.
+
+Loading Resources
+-----------------
+
+99% of the time, you will load spaCy's resources using a language pipeline
+class, e.g. `spacy.en.English`. The pipeline class reads its data from a
+specified directory on disk. By default, spaCy installs data into each
+language's package directory, and loads it from there.
+
+.. autoclass:: spacy.en.English
+  :members:
+
+The class `spacy.en.English` is the main entry point for the English pipeline
+(other languages to come).
+
 +------------+----------------------------------------+-------------+--------------------------+
 | Attribute  | Type                                   | Attr API    | Notes                    |
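
The three-step usage the new overview describes can be sketched end to end. This is a minimal illustration assuming the `spacy.en.English` entry point documented above; the sample text and the loop body are illustrative, not taken from this diff::

    # 1. Loading resources
    from spacy.en import English
    nlp = English()

    # 2. Processing text
    tokens = nlp(u'Hello, world. Here are two sentences.')

    # 3. Accessing annotations
    for token in tokens:
        print token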
@@ -24,7 +57,8 @@ API
 +------------+----------------------------------------+-------------+--------------------------+

-.. automethod:: spacy.en.English.__call__
+.. autoclass:: spacy.en.English
+  :members:

 .. autoclass:: spacy.tokens.Tokens
@@ -249,7 +283,7 @@ API

 .. py:method:: load_morph_exceptions(self, exc: Dict[unicode, Dict])

-.. py:class:: syntax.parser.GreedyParser(self, model_dir: unicode)
+.. py:class:: syntax.parser.Parser(self, model_dir: unicode)

 .. py:method:: __call__(self, tokens: spacy.tokens.Tokens) -> None
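
Taken together, the two directives above imply a small usage pattern: the parser is constructed from a model directory, then applied to a `Tokens` object in place, returning None. A hedged sketch; the variable names and path are illustrative only::

    parser = Parser(u'path/to/model_dir')  # syntax.parser.Parser, per the directive above
    parser(tokens)                         # annotates the Tokens object in place, returns None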
@@ -44,32 +44,82 @@ parse_if_model_present = -1
 class English(object):
     """The English NLP pipeline.

     Provides a tokenizer, lexicon, part-of-speech tagger and parser.

+    Example:
+
     Load data from default directory:

         >>> nlp = English()
+        >>> nlp = English(data_dir=u'')

     Load data from specified directory:

         >>> nlp = English(data_dir=u'path/to/data_directory')

+    Disable (and avoid loading) parts of the processing pipeline:
+
+        >>> nlp = English(vectors=False, parser=False, tagger=False, entity=False)
+
     Start with nothing loaded:

         >>> nlp = English(data_dir=None)

     Keyword args:
         data_dir (unicode):
-            A path to a directory, from which to load the pipeline.
+            A path to a directory from which to load the pipeline;
+            or '', to load the default; or None, to load nothing.

-            By default, data is installed within the spaCy package directory. So
-            if no data_dir is specified, spaCy attempts to load from a
-            directory named "data" that is a sibling of the spacy/en/__init__.py
-            file. You can find the location of this file by running:
+        Tokenizer (bool or callable):
+            desc

-                $ python -c "import spacy.en; print spacy.en.__file__"
+        Vectors (bool or callable):
+            desc

-            To prevent any data files from being loaded, pass data_dir=None. This
-            is useful if you want to construct a lexicon, which you'll then save
-            for later loading.
+        Parser (bool or callable):
+            desc

+        Tagger (bool or callable):
+            desc
+
+        Entity (bool or callable):
+            desc
+
+        Senser (bool or callable):
+            desc
     """
     ParserTransitionSystem = ArcEager
     EntityTransitionSystem = BiluoPushDown

-    def __init__(self, data_dir='', load_vectors=True):
+    def __init__(self, data_dir='', Tokenizer=True, Vectors=True, Parser=True,
+                 Tagger=True, Entity=True, Senser=True, load_vectors=True):
         if data_dir == '':
             data_dir = LOCAL_DATA_DIR
+        self._data_dir = data_dir
+        # TODO: Deprecation warning
+        if load_vectors is False:
+            Vectors = False

         self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
-                           get_lex_props=get_lex_props, load_vectors=load_vectors)
+                           get_lex_props=get_lex_props, vectors=Vectors)
+
+        if Tokenizer is True:
+            Tokenizer = tokenizer.Tokenizer
+        if Tagger is True:
+            Tagger = pos.EnPosTagger
+        if Parser is True:
+            transition_system = self.ParserTransitionSystem
+            Parser = lambda s, d: parser.Parser(s, d, transition_system)
+        if Entity is True:
+            transition_system = self.EntityTransitionSystem
+            Entity = lambda s, d: parser.Parser(s, d, transition_system)
+        if Senser is True:
+            Senser = wsd.SuperSenseTagger
+
+        self.tokenizer = Tokenizer(self.vocab, data_dir) if Tokenizer else None
+        self.tagger = Tagger(self.vocab.strings, data_dir) if Tagger else None
+        self.parser = Parser(self.vocab.strings, data_dir) if Parser else None
+        self.entity = Entity(self.vocab.strings, data_dir) if Entity else None
+        self.senser = Senser(self.vocab.strings, data_dir) if Senser else None

-        self._data_dir = data_dir
         tag_names = list(POS_TAGS.keys())
         tag_names.sort()
         if data_dir is None:
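
The rewritten constructor above introduces a consistent override pattern for every pipeline component: True selects the bundled default, False disables the component, and a callable acts as a user-supplied factory with the same (strings, data_dir) calling convention. A standalone sketch of the pattern, with hypothetical names rather than actual spaCy code::

    class DefaultTagger(object):
        def __init__(self, strings, data_dir):
            self.strings = strings
            self.data_dir = data_dir

    class Pipeline(object):
        def __init__(self, strings, data_dir, Tagger=True):
            if Tagger is True:
                Tagger = DefaultTagger  # True falls back to the bundled component
            # False leaves the slot empty; a callable builds the component
            self.tagger = Tagger(strings, data_dir) if Tagger else None

    nlp = Pipeline([], 'data')                        # default tagger
    nlp = Pipeline([], 'data', Tagger=False)          # tagger disabled
    nlp = Pipeline([], 'data', Tagger=DefaultTagger)  # explicit factory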
@@ -77,53 +127,22 @@ class English(object):
             prefix_re = None
             suffix_re = None
             infix_re = None
-            self.has_parser_model = False
-            self.has_tagger_model = False
-            self.has_entity_model = False
         else:
             tok_data_dir = path.join(data_dir, 'tokenizer')
             tok_rules, prefix_re, suffix_re, infix_re = read_lang_data(tok_data_dir)
             prefix_re = re.compile(prefix_re)
             suffix_re = re.compile(suffix_re)
             infix_re = re.compile(infix_re)
-            self.has_parser_model = path.exists(path.join(self._data_dir, 'deps'))
-            self.has_tagger_model = path.exists(path.join(self._data_dir, 'pos'))
-            self.has_entity_model = path.exists(path.join(self._data_dir, 'ner'))

-        self.tokenizer = Tokenizer(self.vocab, tok_rules, prefix_re,
-                                   suffix_re, infix_re,
-                                   POS_TAGS, tag_names)

         self.mwe_merger = RegexMerger([
             ('IN', 'O', regexes.MW_PREPOSITIONS_RE),
             ('CD', 'TIME', regexes.TIME_RE),
             ('NNP', 'DATE', regexes.DAYS_RE),
             ('CD', 'MONEY', regexes.MONEY_RE)])
-        # These are lazy-loaded
-        self._tagger = None
-        self._parser = None
-        self._entity = None

-    @property
-    def tagger(self):
-        if self._tagger is None:
-            self._tagger = EnPosTagger(self.vocab.strings, self._data_dir)
-        return self._tagger
-
-    @property
-    def parser(self):
-        if self._parser is None:
-            self._parser = Parser(self.vocab.strings,
-                                  path.join(self._data_dir, 'deps'),
-                                  self.ParserTransitionSystem)
-        return self._parser
-
-    @property
-    def entity(self):
-        if self._entity is None:
-            self._entity = Parser(self.vocab.strings,
-                                  path.join(self._data_dir, 'ner'),
-                                  self.EntityTransitionSystem)
-        return self._entity

     def __call__(self, text, tag=True, parse=parse_if_model_present,
                  entity=parse_if_model_present, merge_mwes=False):
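
The __call__ signature kept as context above defaults parse and entity to parse_if_model_present, which the hunk header shows is -1. That value acts as a tri-state sentinel: an explicit True or False from the caller wins, while the sentinel defers to whether a model is actually installed. A minimal sketch of the idea, with hypothetical names::

    parse_if_model_present = -1

    def resolve(flag, has_model):
        # True/False are explicit choices; -1 means "parse only if a model is present"
        if flag == parse_if_model_present:
            return has_model
        return bool(flag)

    assert resolve(parse_if_model_present, True) is True
    assert resolve(parse_if_model_present, False) is False
    assert resolve(False, True) is False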
@@ -20,8 +20,7 @@ from .tokens import Tokens


 cdef class Tokenizer:
-    def __init__(self, Vocab vocab, rules, prefix_re, suffix_re, infix_re,
-                 pos_tags, tag_names):
+    def __init__(self, Vocab vocab, rules, prefix_re, suffix_re, infix_re, pos_tags):
         self.mem = Pool()
         self._cache = PreshMap()
         self._specials = PreshMap()

@@ -29,7 +28,17 @@ cdef class Tokenizer:
         self._suffix_re = suffix_re
         self._infix_re = infix_re
         self.vocab = vocab
-        self._load_special_tokenization(rules, pos_tags, tag_names)
+        self._load_special_tokenization(rules, pos_tags)
+
+    @classmethod
+    def from_dir(cls, Vocab vocab, directory, pos_tags=None):
+        data_dir = path.join(directory, 'tokenizer')
+        rules, prefix_re, suffix_re, infix_re = read_lang_data(data_dir)
+        prefix_re = re.compile(prefix_re)
+        suffix_re = re.compile(suffix_re)
+        infix_re = re.compile(infix_re)
+        return cls(vocab, rules, prefix_re, suffix_re, infix_re,
+                   pos_tags)

     cpdef Tokens tokens_from_list(self, list strings):
         cdef int length = sum([len(s) for s in strings])
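
Assuming the constructor shown above, the new from_dir classmethod would be used roughly like this; a sketch only, since the surrounding imports and the pos_tags plumbing are still unfinished in this commit::

    tokenizer = Tokenizer.from_dir(vocab, data_dir)  # reads from data_dir/tokenizer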
@@ -224,7 +233,7 @@ cdef class Tokenizer:
         match = self._suffix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0

-    def _load_special_tokenization(self, object rules, object tag_map, object tag_names):
+    def _load_special_tokenization(self, object rules, object tag_map):
         '''Add a special-case tokenization rule.
         '''
         cdef int i