Mirror of https://github.com/explosion/spaCy.git
Rename "English" section to "Language"
This commit is contained in:
parent
e16e78a737
commit
f8322a69e7
@@ -1,150 +1,134 @@
 //- ----------------------------------
-//- 💫 DOCS > API > ENGLISH
+//- 💫 DOCS > API > LANGUAGE
 //- ----------------------------------

-+section("english")
-    +h(2, "english", "https://github.com/" + SOCIAL.github + "/spaCy/blob/master/spacy/language.py")
-        | #[+tag class] English(Language)
++section("language")
+    +h(2, "language", "https://github.com/" + SOCIAL.github + "/spaCy/blob/master/spacy/language.py")
+        | #[+tag class] Language

     p.
-        The English analysis pipeline. Usually you"ll load this once per process,
-        and pass the instance around your program.
+        A pipeline that transforms text strings into annotated spaCy Doc objects.
+        Usually you'll load the Language pipeline once and pass the instance
+        around your program.

     +code("python", "Overview").
         class Language:
             lang = None
-            def __init__(self, data_dir=None, tokenizer=None, tagger=None, parser=None, entity=None, matcher=None):
-                return self
+            Defaults = BaseDefaults

-            def __call__(self, text, tag=True, parse=True, entity=True):
-                return Doc()
+            def __init__(self, path=True, **overrides):
+                self.vocab = Vocab()
+                self.tokenizer = Tokenizer()
+                self.tagger = Tagger()
+                self.parser = DependencyParser()
+                self.entity = EntityRecognizer()
+                self.make_doc = lambda text: Doc()
+                self.pipeline = [self.tagger, self.parser, self.entity]

-            def pipe(self, texts_iterator, batch_size=1000, n_threads=2):
-                yield Doc()
+            def __call__(self, text, **toggle):
+                doc = self.make_doc(text)
+                for process in self.pipeline:
+                    if toggle.get(process.name, True):
+                        process(doc)
+                return doc

-            def end_training(self, data_dir=None):
+            def pipe(self, texts_iterator, batch_size=1000, n_threads=2, **toggle):
+                docs = (self.make_doc(text) for text in texts_iterator)
+                for process in self.pipeline:
+                    if toggle.get(process.name, True):
+                        docs = process.pipe(docs, batch_size=batch_size, n_threads=n_threads)
+                for doc in docs:
+                    yield doc
+
+            def end_training(self, path=None):
                 return None

-        class English(Language):
-            lang = "en"
+        class English(Language):
+            class Defaults(BaseDefaults):
+                pass

-        class German(Language):
-            lang = "de"
+        class German(Language):
+            class Defaults(BaseDefaults):
+                pass
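
A minimal usage sketch of the API documented in the Overview above (not part of the diff; assumes a spaCy 1.x-era install, where English subclasses Language):

    from spacy.en import English

    nlp = English()                       # slow: loads models; do this once per process
    doc = nlp(u'Some text to annotate.')  # fast: returns an annotated Doc
    for token in doc:
        print(token.orth_, token.tag_, token.dep_)
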
 +section("english-init")
     +h(3, "english-init")
-        | #[+tag method] English.__init__
+        | #[+tag method] Language.__init__

     p
-        | Load the pipeline. Each component can be passed
-        | as an argument, or left as #[code None], in which case it will be loaded
-        | from a classmethod, named e.g. #[code default_vocab()].
+        | Load the pipeline. You can disable components by passing None as a value,
+        | e.g. pass parser=None, vectors=None to save memory if you're not using
+        | those components. You can also pass an object as the value.
+        | Pass a function create_pipeline to use a custom pipeline --- see
+        | the custom pipeline tutorial.

     +aside("Efficiency").
         Loading takes 10-20 seconds, and the instance consumes 2 to 3
         gigabytes of memory. Intended use is for one instance to be
         created for each language per process, but you can create more
-        if you"re doing something unusual. You may wish to make the
+        if you're doing something unusual. You may wish to make the
         instance a global variable or "singleton".

     +table(["Example", "Description"])
         +row
-            +cell #[code.lang-python nlp = English()]
-            +cell Load everything, from default package
+            +cell #[code nlp = English()]
+            +cell Load everything, from default path.

         +row
-            +cell #[code.lang-python nlp = English(data_dir='my_data')]
-            +cell Load everything, from specified dir
+            +cell #[code nlp = English(path='my_data')]
+            +cell Load everything, from specified path

         +row
-            +cell #[code.lang-python nlp = English(parser=False)]
-            +cell Load everything except the parser.
+            +cell #[code nlp = English(path=path_obj)]
+            +cell Load everything, from an object that follows the #[code pathlib.Path] protocol.

         +row
-            +cell #[code.lang-python nlp = English(parser=False, tagger=False)]
-            +cell Load everything except the parser and tagger.
+            +cell #[code nlp = English(parser=False, vectors=False)]
+            +cell Load everything except the parser and the word vectors.

         +row
-            +cell #[code.lang-python nlp = English(parser=MyParser())]
-            +cell Supply your own parser
+            +cell #[code nlp = English(parser=my_parser)]
+            +cell Load everything, and use a custom parser.

+        +row
+            +cell #[code nlp = English(create_pipeline=my_pipeline)]
+            +cell Load everything, and use a custom pipeline.
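
The override patterns from the table above, sketched as runnable code (not part of the diff; my_parser and my_pipeline are hypothetical user-supplied callables):

    from spacy.en import English

    # Skip loading components you don't need, to save memory and load time.
    nlp_light = English(parser=False, vectors=False)

    # create_pipeline receives the Language instance and returns the sequence
    # of processes that __call__ applies; here, tagging plus entity recognition.
    def my_pipeline(nlp):
        return [nlp.tagger, nlp.entity]

    nlp_custom = English(create_pipeline=my_pipeline)
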
|
||||
+code("python", "Definition").
|
||||
def __init__(self, data_dir=None, tokenizer=None, tagger=None, parser=None, entity=None, matcher=None):
|
||||
return self
|
||||
def __init__(self, path=True, **overrides):
|
||||
D = self.Defaults
|
||||
self.vocab = Vocab(path=path, parent=self, **D.vocab) \
|
||||
if 'vocab' not in overrides \
|
||||
else overrides['vocab']
|
||||
self.tokenizer = Tokenizer(self.vocab, path=path, **D.tokenizer) \
|
||||
if 'tokenizer' not in overrides \
|
||||
else overrides['tokenizer']
|
||||
self.tagger = Tagger(self.vocab, path=path, **D.tagger) \
|
||||
if 'tagger' not in overrides \
|
||||
else overrides['tagger']
|
||||
self.parser = DependencyParser(self.vocab, path=path, **D.parser) \
|
||||
if 'parser' not in overrides \
|
||||
else overrides['parser']
|
||||
self.entity = EntityRecognizer(self.vocab, path=path, **D.entity) \
|
||||
if 'entity' not in overrides \
|
||||
else overrides['entity']
|
||||
self.matcher = Matcher(self.vocab, path=path, **D.matcher) \
|
||||
if 'matcher' not in overrides \
|
||||
else overrides['matcher']
|
||||
|
||||
+table(["Arg", "Type", "Description"])
|
||||
+row
|
||||
+cell data_dir
|
||||
+cell str
|
||||
+cell.
|
||||
The data directory. If None, value is obtained via the
|
||||
#[code default_data_dir()] method.
|
||||
if 'make_doc' in overrides:
|
||||
self.make_doc = overrides['make_doc']
|
||||
elif 'create_make_doc' in overrides:
|
||||
self.make_doc = overrides['create_make_doc'](self)
|
||||
else:
|
||||
self.make_doc = lambda text: self.tokenizer(text)
|
||||
if 'pipeline' in overrides:
|
||||
self.pipeline = overrides['pipeline']
|
||||
elif 'create_pipeline' in overrides:
|
||||
self.pipeline = overrides['create_pipeline'](self)
|
||||
else:
|
||||
self.pipeline = [self.tagger, self.parser, self.matcher, self.entity]
|
||||
|
||||
+row
|
||||
+cell vocab
|
||||
+cell #[code Vocab]
|
||||
+cell.
|
||||
The vocab object, which should be an instance of class
|
||||
#[code spacy.vocab.Vocab]. If #[code None], the object is
|
||||
obtained from the #[code default_vocab()] class method. The
|
||||
vocab object manages all of the language specific rules and
|
||||
definitions, maintains the cache of lexical types, and manages
|
||||
the word vectors. Because the vocab owns this important data,
|
||||
most objects hold a reference to the vocab.
|
||||
|
||||
+row
|
||||
+cell tokenizer
|
||||
+cell #[code Tokenizer]
|
||||
+cell.
|
||||
The tokenizer, which should be a callable that accepts a
|
||||
unicode string, and returns a #[code Doc] object. If set to
|
||||
#[code None], the default tokenizer is constructed from the
|
||||
#[code default_tokenizer()] method.
|
||||
|
||||
+row
|
||||
+cell tagger
|
||||
+cell #[code Tagger]
|
||||
+cell.
|
||||
The part-of-speech tagger, which should be a callable that
|
||||
accepts a #[code Doc] object, and sets the part-of-speech
|
||||
tags in-place. If set to None, the default tagger is constructed
|
||||
from the #[code default_tagger()] method.
|
||||
|
||||
+row
|
||||
+cell parser
|
||||
+cell #[code Parser]
|
||||
+cell.
|
||||
The dependency parser, which should be a callable that accepts
|
||||
a #[code Doc] object, and sets the sentence boundaries,
|
||||
syntactic heads and dependency labels in-place.
|
||||
If set to #[code None], the default parser is
|
||||
constructed from the #[code default_parser()] method. To disable
|
||||
the parser and prevent it from being loaded, pass #[code parser=False].
|
||||
|
||||
+row
|
||||
+cell entity
|
||||
+cell #[code Parser]
|
||||
+cell.
|
||||
The named entity recognizer, which should be a callable that
|
||||
accepts a #[code Doc] object, and sets the named entity annotations
|
||||
in-place. If set to None, the default entity recognizer is
|
||||
constructed from the #[code default_entity()] method. To disable
|
||||
the entity recognizer and prevent it from being loaded, pass
|
||||
#[code entity=False].
|
||||
|
||||
+row
|
||||
+cell matcher
|
||||
+cell #[code Matcher]
|
||||
+cell.
|
||||
The pattern matcher, which should be a callable that accepts
|
||||
a #[code Doc] object, and sets named entity annotations in-place
|
||||
using token-based rules. If set
|
||||
to None, the default matcher is constructed from the
|
||||
#[code default_matcher()] method.
|
||||
|
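
A sketch of the make_doc hooks from the Definition above (not part of the diff; the whitespace-stripping step is purely illustrative):

    from spacy.en import English

    def create_make_doc(nlp):
        # Receives the Language instance; returns the callable used as nlp.make_doc.
        def make_doc(text):
            return nlp.tokenizer(text.strip())  # illustrative pre-processing
        return make_doc

    nlp = English(create_make_doc=create_make_doc)
    doc = nlp(u'  Some text with stray whitespace.  ')
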
-+section("english-call")
-    +h(3, "english-call")
-        | #[+tag method] English.__call__
++section("language-call")
+    +h(3, "language-call")
+        | #[+tag method] Language.__call__

     p
         | The main entry point to spaCy. Takes raw unicode text, and returns

@@ -152,30 +136,30 @@
         | and #[code Span] objects.

     +aside("Efficiency").
-        spaCy"s algorithms are all linear-time, so you can supply
+        spaCy's algorithms are all linear-time, so you can supply
         documents of arbitrary length, e.g. whole novels.
     +table(["Example", "Description"], "code")
         +row
-            +cell #[code.lang-python doc = nlp(u'Some text.')]
+            +cell #[code doc = nlp(u'Some text.')]
             +cell Apply the full pipeline.
         +row
-            +cell #[code.lang-python doc = nlp(u'Some text.', parse=False)]
+            +cell #[code doc = nlp(u'Some text.', parse=False)]
             +cell Applies tagger and entity, not parser
         +row
-            +cell #[code.lang-python doc = nlp(u'Some text.', entity=False)]
+            +cell #[code doc = nlp(u'Some text.', entity=False)]
             +cell Applies tagger and parser, not entity.
         +row
-            +cell #[code.lang-python doc = nlp(u'Some text.', tag=False)]
+            +cell #[code doc = nlp(u'Some text.', tag=False)]
             +cell Does not apply tagger, entity or parser
         +row
-            +cell #[code.lang-python doc = nlp(u'')]
+            +cell #[code doc = nlp(u'')]
             +cell Zero-length tokens, not an error
         +row
-            +cell #[code.lang-python doc = nlp(b'Some text')]
+            +cell #[code doc = nlp(b'Some text')]
             +cell Error: need unicode
         +row
-            +cell #[code.lang-python doc = nlp(b'Some text'.decode('utf8'))]
+            +cell #[code doc = nlp(b'Some text'.decode('utf8'))]
             +cell Decode bytes into unicode first.
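
The per-call toggles from the table above, as a sketch (not part of the diff; assumes the tag/parse/entity keywords behave as documented for spaCy 1.x):

    doc = nlp(u'An example sentence.', parse=False)  # tagger and entity run; parser is skipped
    assert not doc.is_parsed

    # pipe() streams documents through the pipeline with minibatching and threads.
    for doc in nlp.pipe([u'First doc.', u'Second doc.'], batch_size=50, n_threads=4):
        print(doc[0].tag_)
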
     +code("python", "Definition").
@@ -8,7 +8,7 @@
             ["Usage Examples", "#examples", "examples"]
         ],
         "API": [
-            ["English", "#english", "english"],
+            ["Language", "#language", "language"],
             ["Doc", "#doc", "doc"],
             ["Token", "#token", "token"],
             ["Span", "#span", "span"],
@@ -13,7 +13,7 @@ include _quickstart-examples

 +h(2, "api") API

-include _api-english
+include _api-language
 include _api-doc
 include _api-token
 include _api-span