mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
Rename "English" section to "Language"
This commit is contained in:
parent
e16e78a737
commit
f8322a69e7
|
@ -1,150 +1,134 @@
|
||||||
//- ----------------------------------
|
//- ----------------------------------
|
||||||
//- 💫 DOCS > API > ENGLISH
|
//- 💫 DOCS > API > LANGUAGE
|
||||||
//- ----------------------------------
|
//- ----------------------------------
|
||||||
|
|
||||||
+section("english")
|
+section("language")
|
||||||
+h(2, "english", "https://github.com/" + SOCIAL.github + "/spaCy/blob/master/spacy/language.py")
|
+h(2, "language", "https://github.com/" + SOCIAL.github + "/spaCy/blob/master/spacy/language.py")
|
||||||
| #[+tag class] English(Language)
|
| #[+tag class] Language
|
||||||
|
|
||||||
p.
|
p.
|
||||||
The English analysis pipeline. Usually you"ll load this once per process,
|
A pipeline that transforms text strings into annotated spaCy Doc objects. Usually you'll load the Language pipeline once and pass the instance around your program.
|
||||||
and pass the instance around your program.
|
|
||||||
|
|
||||||
+code("python", "Overview").
|
+code("python", "Overview").
|
||||||
class Language:
|
class Language:
|
||||||
lang = None
|
Defaults = BaseDefaults
|
||||||
def __init__(self, data_dir=None, tokenizer=None, tagger=None, parser=None, entity=None, matcher=None):
|
|
||||||
return self
|
|
||||||
|
|
||||||
def __call__(self, text, tag=True, parse=True, entity=True):
|
def __init__(self, path=True, **overrides):
|
||||||
return Doc()
|
self.vocab = Vocab()
|
||||||
|
self.tokenizer = Tokenizer()
|
||||||
|
self.tagger = Tagger()
|
||||||
|
self.parser = DependencyParser()
|
||||||
|
self.entity = EntityRecognizer()
|
||||||
|
self.make_doc = lambda text: Doc()
|
||||||
|
self.pipeline = [self.tagger, self.parser, self.entity]
|
||||||
|
|
||||||
def pipe(self, texts_iterator, batch_size=1000, n_threads=2):
|
def __call__(self, text, **toggle):
|
||||||
yield Doc()
|
doc = self.make_doc(text)
|
||||||
|
for process in self.pipeline:
|
||||||
|
if toggle.get(process.name, True):
|
||||||
|
process(doc)
|
||||||
|
return doc
|
||||||
|
|
||||||
def end_training(self, data_dir=None):
|
def pipe(self, texts_iterator, batch_size=1000, n_threads=2, **toggle):
|
||||||
|
docs = (self.make_doc(text) for text in texts_iterator)
|
||||||
|
for process in self.pipeline:
|
||||||
|
if toggle.get(process.name, True):
|
||||||
|
docs = process.pipe(docs, batch_size=batch_size, n_threads=n_threads)
|
||||||
|
for doc in docs:
|
||||||
|
yield doc
|
||||||
|
|
||||||
|
def end_training(self, path=None):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
class English(Language):
|
class English(Language):
|
||||||
lang = "en"
|
class Defaults(BaseDefaults):
|
||||||
|
pass
|
||||||
|
|
||||||
class German(Language):
|
class German(Language):
|
||||||
lang = "de"
|
class Defaults(BaseDefaults):
|
||||||
|
pass
|
||||||
|
|
||||||
+section("english-init")
|
+section("english-init")
|
||||||
+h(3, "english-init")
|
+h(3, "english-init")
|
||||||
| #[+tag method] English.__init__
|
| #[+tag method] Language.__init__
|
||||||
|
|
||||||
p
|
p
|
||||||
| Load the pipeline. Each component can be passed
|
| Load the pipeline. You can disable components by passing None as a value,
|
||||||
| as an argument, or left as #[code None], in which case it will be loaded
|
| e.g. pass parser=None, vectors=None to save memory if you're not using
|
||||||
| from a classmethod, named e.g. #[code default_vocab()].
|
| those components. You can also pass an object as the value.
|
||||||
|
| Pass a function create_pipeline to use a custom pipeline --- see
|
||||||
|
| the custom pipeline tutorial.
|
||||||
|
|
||||||
+aside("Efficiency").
|
+aside("Efficiency").
|
||||||
Loading takes 10-20 seconds, and the instance consumes 2 to 3
|
Loading takes 10-20 seconds, and the instance consumes 2 to 3
|
||||||
gigabytes of memory. Intended use is for one instance to be
|
gigabytes of memory. Intended use is for one instance to be
|
||||||
created for each language per process, but you can create more
|
created for each language per process, but you can create more
|
||||||
if you"re doing something unusual. You may wish to make the
|
if you're doing something unusual. You may wish to make the
|
||||||
instance a global variable or "singleton".
|
instance a global variable or "singleton".
|
||||||
|
|
||||||
+table(["Example", "Description"])
|
+table(["Example", "Description"])
|
||||||
+row
|
+row
|
||||||
+cell #[code.lang-python nlp = English()]
|
+cell #[code nlp = English()]
|
||||||
+cell Load everything, from default package
|
+cell Load everything, from default path.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code.lang-python nlp = English(data_dir='my_data')]
|
+cell #[code nlp = English(path='my_data')]
|
||||||
+cell Load everything, from specified dir
|
+cell Load everything, from specified path
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code.lang-python nlp = English(parser=False)]
|
+cell #[code nlp = English(path=path_obj)]
|
||||||
+cell Load everything except the parser.
|
+cell Load everything, from an object that follows the #[code pathlib.Path] protocol.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code.lang-python nlp = English(parser=False, tagger=False)]
|
+cell #[code nlp = English(parser=False, vectors=False)]
|
||||||
+cell Load everything except the parser and tagger.
|
+cell Load everything except the parser and the word vectors.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code.lang-python nlp = English(parser=MyParser())]
|
+cell #[code nlp = English(parser=my_parser)]
|
||||||
+cell Supply your own parser
|
+cell Load everything, and use a custom parser.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code nlp = English(create_pipeline=my_pipeline)]
|
||||||
|
+cell Load everything, and use a custom pipeline.
|
||||||
|
|
||||||
+code("python", "Definition").
|
+code("python", "Definition").
|
||||||
def __init__(self, data_dir=None, tokenizer=None, tagger=None, parser=None, entity=None, matcher=None):
|
def __init__(self, path=True, **overrides):
|
||||||
return self
|
D = self.Defaults
|
||||||
|
self.vocab = Vocab(path=path, parent=self, **D.vocab) \
|
||||||
|
if 'vocab' not in overrides \
|
||||||
|
else overrides['vocab']
|
||||||
|
self.tokenizer = Tokenizer(self.vocab, path=path, **D.tokenizer) \
|
||||||
|
if 'tokenizer' not in overrides \
|
||||||
|
else overrides['tokenizer']
|
||||||
|
self.tagger = Tagger(self.vocab, path=path, **D.tagger) \
|
||||||
|
if 'tagger' not in overrides \
|
||||||
|
else overrides['tagger']
|
||||||
|
self.parser = DependencyParser(self.vocab, path=path, **D.parser) \
|
||||||
|
if 'parser' not in overrides \
|
||||||
|
else overrides['parser']
|
||||||
|
self.entity = EntityRecognizer(self.vocab, path=path, **D.entity) \
|
||||||
|
if 'entity' not in overrides \
|
||||||
|
else overrides['entity']
|
||||||
|
self.matcher = Matcher(self.vocab, path=path, **D.matcher) \
|
||||||
|
if 'matcher' not in overrides \
|
||||||
|
else overrides['matcher']
|
||||||
|
|
||||||
+table(["Arg", "Type", "Description"])
|
if 'make_doc' in overrides:
|
||||||
+row
|
self.make_doc = overrides['make_doc']
|
||||||
+cell data_dir
|
elif 'create_make_doc' in overrides:
|
||||||
+cell str
|
self.make_doc = overrides['create_make_doc'](self)
|
||||||
+cell.
|
else:
|
||||||
The data directory. If None, value is obtained via the
|
self.make_doc = lambda text: self.tokenizer(text)
|
||||||
#[code default_data_dir()] method.
|
if 'pipeline' in overrides:
|
||||||
|
self.pipeline = overrides['pipeline']
|
||||||
|
elif 'create_pipeline' in overrides:
|
||||||
|
self.pipeline = overrides['create_pipeline'](self)
|
||||||
|
else:
|
||||||
|
self.pipeline = [self.tagger, self.parser, self.matcher, self.entity]
|
||||||
|
|
||||||
+row
|
+section("language-call")
|
||||||
+cell vocab
|
+h(3, "language-call")
|
||||||
+cell #[code Vocab]
|
| #[+tag method] Language.__call__
|
||||||
+cell.
|
|
||||||
The vocab object, which should be an instance of class
|
|
||||||
#[code spacy.vocab.Vocab]. If #[code None], the object is
|
|
||||||
obtained from the #[code default_vocab()] class method. The
|
|
||||||
vocab object manages all of the language specific rules and
|
|
||||||
definitions, maintains the cache of lexical types, and manages
|
|
||||||
the word vectors. Because the vocab owns this important data,
|
|
||||||
most objects hold a reference to the vocab.
|
|
||||||
|
|
||||||
+row
|
|
||||||
+cell tokenizer
|
|
||||||
+cell #[code Tokenizer]
|
|
||||||
+cell.
|
|
||||||
The tokenizer, which should be a callable that accepts a
|
|
||||||
unicode string, and returns a #[code Doc] object. If set to
|
|
||||||
#[code None], the default tokenizer is constructed from the
|
|
||||||
#[code default_tokenizer()] method.
|
|
||||||
|
|
||||||
+row
|
|
||||||
+cell tagger
|
|
||||||
+cell #[code Tagger]
|
|
||||||
+cell.
|
|
||||||
The part-of-speech tagger, which should be a callable that
|
|
||||||
accepts a #[code Doc] object, and sets the part-of-speech
|
|
||||||
tags in-place. If set to None, the default tagger is constructed
|
|
||||||
from the #[code default_tagger()] method.
|
|
||||||
|
|
||||||
+row
|
|
||||||
+cell parser
|
|
||||||
+cell #[code Parser]
|
|
||||||
+cell.
|
|
||||||
The dependency parser, which should be a callable that accepts
|
|
||||||
a #[code Doc] object, and sets the sentence boundaries,
|
|
||||||
syntactic heads and dependency labels in-place.
|
|
||||||
If set to #[code None], the default parser is
|
|
||||||
constructed from the #[code default_parser()] method. To disable
|
|
||||||
the parser and prevent it from being loaded, pass #[code parser=False].
|
|
||||||
|
|
||||||
+row
|
|
||||||
+cell entity
|
|
||||||
+cell #[code Parser]
|
|
||||||
+cell.
|
|
||||||
The named entity recognizer, which should be a callable that
|
|
||||||
accepts a #[code Doc] object, and sets the named entity annotations
|
|
||||||
in-place. If set to None, the default entity recognizer is
|
|
||||||
constructed from the #[code default_entity()] method. To disable
|
|
||||||
the entity recognizer and prevent it from being loaded, pass
|
|
||||||
#[code entity=False].
|
|
||||||
|
|
||||||
+row
|
|
||||||
+cell matcher
|
|
||||||
+cell #[code Matcher]
|
|
||||||
+cell.
|
|
||||||
The pattern matcher, which should be a callable that accepts
|
|
||||||
a #[code Doc] object, and sets named entity annotations in-place
|
|
||||||
using token-based rules. If set
|
|
||||||
to None, the default matcher is constructed from the
|
|
||||||
#[code default_matcher()] method.
|
|
||||||
|
|
||||||
+section("english-call")
|
|
||||||
+h(3, "english-call")
|
|
||||||
| #[+tag method] English.__call__
|
|
||||||
|
|
||||||
p
|
p
|
||||||
| The main entry point to spaCy. Takes raw unicode text, and returns
|
| The main entry point to spaCy. Takes raw unicode text, and returns
|
||||||
|
@ -152,30 +136,30 @@
|
||||||
| and #[code Span] objects.
|
| and #[code Span] objects.
|
||||||
|
|
||||||
+aside("Efficiency").
|
+aside("Efficiency").
|
||||||
spaCy"s algorithms are all linear-time, so you can supply
|
spaCy's algorithms are all linear-time, so you can supply
|
||||||
documents of arbitrary length, e.g. whole novels.
|
documents of arbitrary length, e.g. whole novels.
|
||||||
|
|
||||||
+table(["Example", "Description"], "code")
|
+table(["Example", "Description"], "code")
|
||||||
+row
|
+row
|
||||||
+cell #[code.lang-python doc = nlp(u'Some text.')]
|
+cell #[code doc = nlp(u'Some text.')]
|
||||||
+cell Apply the full pipeline.
|
+cell Apply the full pipeline.
|
||||||
+row
|
+row
|
||||||
+cell #[code.lang-python doc = nlp(u'Some text.', parse=False)]
|
+cell #[code doc = nlp(u'Some text.', parse=False)]
|
||||||
+cell Applies tagger and entity, not parser
|
+cell Applies tagger and entity, not parser
|
||||||
+row
|
+row
|
||||||
+cell #[code.lang-python doc = nlp(u'Some text.', entity=False)]
|
+cell #[code doc = nlp(u'Some text.', entity=False)]
|
||||||
+cell Applies tagger and parser, not entity.
|
+cell Applies tagger and parser, not entity.
|
||||||
+row
|
+row
|
||||||
+cell #[code.lang-python doc = nlp(u'Some text.', tag=False)]
|
+cell #[code doc = nlp(u'Some text.', tag=False)]
|
||||||
+cell Does not apply tagger, entity or parser
|
+cell Does not apply tagger, entity or parser
|
||||||
+row
|
+row
|
||||||
+cell #[code.lang-python doc = nlp(u'')]
|
+cell #[code doc = nlp(u'')]
|
||||||
+cell Zero-length tokens, not an error
|
+cell Zero-length tokens, not an error
|
||||||
+row
|
+row
|
||||||
+cell #[code.lang-python doc = nlp(b'Some text')]
|
+cell #[code doc = nlp(b'Some text')]
|
||||||
+cell Error: need unicode
|
+cell Error: need unicode
|
||||||
+row
|
+row
|
||||||
+cell #[code.lang-python doc = nlp(b'Some text'.decode('utf8'))]
|
+cell #[code doc = nlp(b'Some text'.decode('utf8'))]
|
||||||
+cell Decode bytes into unicode first.
|
+cell Decode bytes into unicode first.
|
||||||
|
|
||||||
+code("python", "Definition").
|
+code("python", "Definition").
|
|
@ -8,7 +8,7 @@
|
||||||
["Usage Examples", "#examples", "examples"]
|
["Usage Examples", "#examples", "examples"]
|
||||||
],
|
],
|
||||||
"API": [
|
"API": [
|
||||||
["English", "#english", "english"],
|
["Language", "#language", "language"],
|
||||||
["Doc", "#doc", "doc"],
|
["Doc", "#doc", "doc"],
|
||||||
["Token", "#token", "token"],
|
["Token", "#token", "token"],
|
||||||
["Span", "#span", "span"],
|
["Span", "#span", "span"],
|
||||||
|
|
|
@ -13,7 +13,7 @@ include _quickstart-examples
|
||||||
|
|
||||||
+h(2, "api") API
|
+h(2, "api") API
|
||||||
|
|
||||||
include _api-english
|
include _api-language
|
||||||
include _api-doc
|
include _api-doc
|
||||||
include _api-token
|
include _api-token
|
||||||
include _api-span
|
include _api-span
|
||||||
|
|
Loading…
Reference in New Issue
Block a user