Mirror of https://github.com/explosion/spaCy.git

Commit 6788c86b2f (parent 52fd80c6c6)

    * Begin refactor
@@ -1,9 +1,42 @@
-===
-API
-===
+=====
+Usage
+=====
 
+Overview
+--------
+
+spaCy is a suite of natural language processing tools, arranged into
+a pipeline. It is substantially more opinionated than most similar libraries,
+which often give users the choice of multiple models that compute the same annotation.
+spaCy's philosophy is to just have one --- the best one. Our perspective is that
+the redundant options are really only useful to researchers, who need to replicate
+some prior work exactly.
+
+Being opinionated allows us to keep the library small, fast, and up-to-date. It
+also makes the API much simpler. Normal usage proceeds in three steps:
+
+1. Loading resources;
+2. Processing text;
+3. Accessing annotations.
+
+This document is divided into three parts, to match these stages. We focus here
+on the library's API. See also: Installation, Annotation Standards, Algorithmic Details,
+and Benchmarks.
+
+Loading Resources
+-----------------
+
+99\% of the time, you will load spaCy's resources using a language pipeline class,
+e.g. `spacy.en.English`. The pipeline class reads the data from disk, from a
+specified directory. By default, spaCy installs data into each language's
+package directory, and loads it from there.
+
 .. autoclass:: spacy.en.English
+    :members:
+
+The class `spacy.en.English` is the main entry-point for the English pipeline
+(other languages to come).
 
 
 +------------+----------------------------------------+-------------+--------------------------+
 | Attribute  | Type                                   | Attr API    | Notes                    |
@@ -24,7 +57,8 @@ API
 +------------+----------------------------------------+-------------+--------------------------+
 
 
-.. automethod:: spacy.en.English.__call__
+.. autoclass:: spacy.en.English
+    :members:
 
 
 .. autoclass:: spacy.tokens.Tokens
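The Usage overview above walks through three steps: load resources, process text, access annotations. A minimal sketch of that workflow (not part of this diff): the `tag`/`parse` keyword arguments appear in `English.__call__` further down, while the per-token attribute names are assumptions standing in for the attribute table.

    # Sketch of the load / process / access workflow described in the Usage page.
    # Not part of the commit; token attribute names are illustrative only.
    from spacy.en import English

    nlp = English()                      # 1. Loading resources

    tokens = nlp(u'Hello, world. Here are two sentences.',
                 tag=True, parse=False)  # 2. Processing text

    for token in tokens:                 # 3. Accessing annotations
        print(u'%s\t%s' % (token.orth_, token.pos_))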
@@ -249,7 +283,7 @@ API
 
 .. py:method:: load_morph_exceptions(self, exc: Dict[unicode, Dict])
 
-.. py:class:: syntax.parser.GreedyParser(self, model_dir: unicode)
+.. py:class:: syntax.parser.Parser(self, model_dir: unicode)
 
 .. py:method:: __call__(self, tokens: spacy.tokens.Tokens) --> None
 
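The `--> None` return annotation above means the parser annotates a `spacy.tokens.Tokens` object in place rather than returning a new one. A hedged sketch of that calling pattern (not part of this diff), using the component attributes constructed in `English.__init__` below:

    # Sketch only: in-place annotation, as the "--> None" signature implies.
    tokens = nlp.tokenizer(u'A sentence to parse.')
    nlp.tagger(tokens)   # tag first; the parser typically relies on POS features
    nlp.parser(tokens)   # annotates `tokens` in place and returns None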
@@ -44,32 +44,82 @@ parse_if_model_present = -1
 class English(object):
     """The English NLP pipeline.
 
-    Provides a tokenizer, lexicon, part-of-speech tagger and parser.
+    Example:
 
+        Load data from default directory:
+
+            >>> nlp = English()
+            >>> nlp = English(data_dir=u'')
+
+        Load data from specified directory:
+
+            >>> nlp = English(data_dir=u'path/to/data_directory')
+
+        Disable (and avoid loading) parts of the processing pipeline:
+
+            >>> nlp = English(vectors=False, parser=False, tagger=False, entity=False)
+
+        Start with nothing loaded:
+
+            >>> nlp = English(data_dir=None)
+
     Keyword args:
         data_dir (unicode):
-            A path to a directory, from which to load the pipeline.
-            By default, data is installed within the spaCy package directory. So
-            if no data_dir is specified, spaCy attempts to load from a
-            directory named "data" that is a sibling of the spacy/en/__init__.py
-            file. You can find the location of this file by running:
-
-            $ python -c "import spacy.en; print spacy.en.__file__"
-
-            To prevent any data files from being loaded, pass data_dir=None. This
-            is useful if you want to construct a lexicon, which you'll then save
-            for later loading.
+            A path to a directory from which to load the pipeline;
+            or '', to load default; or None, to load nothing.
+
+        Tokenizer (bool or callable):
+            desc
+
+        Vectors (bool or callable):
+            desc
+
+        Parser (bool or callable):
+            desc
+
+        Tagger (bool or callable):
+            desc
+
+        Entity (bool or callable):
+            desc
+
+        Senser (bool or callable):
+            desc
     """
     ParserTransitionSystem = ArcEager
     EntityTransitionSystem = BiluoPushDown
 
-    def __init__(self, data_dir='', load_vectors=True):
+    def __init__(self, data_dir='', Tokenizer=True, Vectors=True, Parser=True,
+                 Tagger=True, Entity=True, Senser=True, load_vectors=True):
         if data_dir == '':
             data_dir = LOCAL_DATA_DIR
-        self._data_dir = data_dir
+        # TODO: Deprecation warning
+        if load_vectors is False:
+            vectors = False
+
         self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
-                           get_lex_props=get_lex_props, load_vectors=load_vectors)
+                           get_lex_props=get_lex_props, vectors=Vectors)
 
+        if Tokenizer is True:
+            Tokenizer = tokenizer.Tokenizer
+        if Tagger is True:
+            Tagger = pos.EnPosTagger
+        if Parser is True:
+            transition_system = self.ParserTransitionSystem
+            Parser = lambda s, d: parser.Parser(s, d, transition_system)
+        if Entity is True:
+            transition_system = self.EntityTransitionSystem
+            Entity = lambda s, d: parser.Parser(s, d, transition_system)
+        if Senser is True:
+            Senser = wsd.SuperSenseTagger
+
+        self.tokenizer = Tokenizer(self.vocab, data_dir) if Tokenizer else None
+        self.tagger = Tagger(self.vocab.strings, data_dir) if Tagger else None
+        self.parser = Parser(self.vocab.strings, data_dir) if Parser else None
+        self.entity = Entity(self.vocab.strings, data_dir) if Entity else None
+        self.senser = Senser(self.vocab.strings, data_dir) if Senser else None
+
+        self._data_dir = data_dir
         tag_names = list(POS_TAGS.keys())
         tag_names.sort()
         if data_dir is None:
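Each of the new keyword arguments follows the same bool-or-callable convention: `True` selects the default implementation, `False` (or `None`) skips loading that component and leaves the corresponding attribute as `None`, and any other callable is used as a factory, invoked as `factory(strings, data_dir)` (the tokenizer factory receives the vocab instead). A hedged usage sketch, where `MyTagger` is a hypothetical user-supplied class and the path is a placeholder:

    # Sketch of the bool-or-callable keyword arguments introduced above.
    # MyTagger is hypothetical; any callable accepting (strings, data_dir) works.
    nlp = English(
        data_dir=u'path/to/data_directory',
        Tagger=lambda strings, data_dir: MyTagger(strings, data_dir),
        Parser=False,    # don't build the dependency parser
        Entity=False,    # don't build the entity recognizer
        Senser=False,    # don't build the supersense tagger
    )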
@@ -77,53 +127,22 @@ class English(object):
             prefix_re = None
             suffix_re = None
             infix_re = None
-            self.has_parser_model = False
-            self.has_tagger_model = False
-            self.has_entity_model = False
         else:
             tok_data_dir = path.join(data_dir, 'tokenizer')
             tok_rules, prefix_re, suffix_re, infix_re = read_lang_data(tok_data_dir)
             prefix_re = re.compile(prefix_re)
             suffix_re = re.compile(suffix_re)
             infix_re = re.compile(infix_re)
-            self.has_parser_model = path.exists(path.join(self._data_dir, 'deps'))
-            self.has_tagger_model = path.exists(path.join(self._data_dir, 'pos'))
-            self.has_entity_model = path.exists(path.join(self._data_dir, 'ner'))
 
         self.tokenizer = Tokenizer(self.vocab, tok_rules, prefix_re,
                                    suffix_re, infix_re,
                                    POS_TAGS, tag_names)
 
         self.mwe_merger = RegexMerger([
             ('IN', 'O', regexes.MW_PREPOSITIONS_RE),
             ('CD', 'TIME', regexes.TIME_RE),
             ('NNP', 'DATE', regexes.DAYS_RE),
             ('CD', 'MONEY', regexes.MONEY_RE)])
-        # These are lazy-loaded
-        self._tagger = None
-        self._parser = None
-        self._entity = None
-
-    @property
-    def tagger(self):
-        if self._tagger is None:
-            self._tagger = EnPosTagger(self.vocab.strings, self._data_dir)
-        return self._tagger
-
-    @property
-    def parser(self):
-        if self._parser is None:
-            self._parser = Parser(self.vocab.strings,
-                                  path.join(self._data_dir, 'deps'),
-                                  self.ParserTransitionSystem)
-        return self._parser
-
-    @property
-    def entity(self):
-        if self._entity is None:
-            self._entity = Parser(self.vocab.strings,
-                                  path.join(self._data_dir, 'ner'),
-                                  self.EntityTransitionSystem)
-        return self._entity
 
     def __call__(self, text, tag=True, parse=parse_if_model_present,
                  entity=parse_if_model_present, merge_mwes=False):
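With the lazy `tagger`/`parser`/`entity` properties removed, components are now built (or skipped) up front in `__init__`, and `__call__` keeps `parse_if_model_present` (defined as -1 in the hunk context above) as the default for `parse` and `entity`. The body of `__call__` is outside this diff; the following is only a sketch of how such a sentinel is typically resolved:

    # Sketch only: resolving a "parse if a model is present" sentinel.
    # The actual body of English.__call__ is not shown in this diff.
    parse_if_model_present = -1

    def resolve_flag(flag, model_available):
        if flag == parse_if_model_present:   # -1: decide from availability
            return model_available
        return bool(flag)                    # explicit True/False wins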
@@ -20,8 +20,7 @@ from .tokens import Tokens
 
 
 cdef class Tokenizer:
-    def __init__(self, Vocab vocab, rules, prefix_re, suffix_re, infix_re,
-                 pos_tags, tag_names):
+    def __init__(self, Vocab vocab, rules, prefix_re, suffix_re, infix_re, pos_tags):
         self.mem = Pool()
         self._cache = PreshMap()
         self._specials = PreshMap()
@@ -29,7 +28,17 @@ cdef class Tokenizer:
         self._suffix_re = suffix_re
         self._infix_re = infix_re
         self.vocab = vocab
-        self._load_special_tokenization(rules, pos_tags, tag_names)
+        self._load_special_tokenization(rules, pos_tags)
 
+    @classmethod
+    def from_dir(cls, Vocab vocab, directory):
+        data_dir = path.join(directory, 'tokenizer')
+        rules, prefix_re, suffix_re, infix_re = read_lang_data(data_dir)
+        prefix_re = re.compile(prefix_re)
+        suffix_re = re.compile(suffix_re)
+        infix_re = re.compile(infix_re)
+        return cls(vocab, rules, prefix_re, suffix_re, infix_re,
+                   pos_tags)
+
     cpdef Tokens tokens_from_list(self, list strings):
         cdef int length = sum([len(s) for s in strings])
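The new `from_dir` classmethod is an alternate constructor: rather than handing `Tokenizer.__init__` pre-compiled rules and regexes, callers point it at a data directory and it performs the `read_lang_data` and `re.compile` steps itself. A hedged usage sketch (not part of this diff) with a placeholder `vocab` and path:

    # Sketch only: building a Tokenizer from a data directory via the new
    # alternate constructor, then tokenizing raw text.
    tokenizer = Tokenizer.from_dir(vocab, u'path/to/data_directory')
    tokens = tokenizer(u'Some raw text to tokenize.')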
@@ -224,7 +233,7 @@ cdef class Tokenizer:
         match = self._suffix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0
 
-    def _load_special_tokenization(self, object rules, object tag_map, object tag_names):
+    def _load_special_tokenization(self, object rules, object tag_map):
         '''Add a special-case tokenization rule.
         '''
         cdef int i