Mirror of https://github.com/explosion/spaCy.git
Rename "English" section to "Language"
This commit is contained in:
parent
e16e78a737
commit
f8322a69e7
@@ -1,150 +1,134 @@
 //- ----------------------------------
-//- 💫 DOCS > API > ENGLISH
+//- 💫 DOCS > API > LANGUAGE
 //- ----------------------------------

-+section("english")
-    +h(2, "english", "https://github.com/" + SOCIAL.github + "/spaCy/blob/master/spacy/language.py")
-        | #[+tag class] English(Language)
++section("language")
+    +h(2, "language", "https://github.com/" + SOCIAL.github + "/spaCy/blob/master/spacy/language.py")
+        | #[+tag class] Language

     p.
-        The English analysis pipeline. Usually you"ll load this once per process,
-        and pass the instance around your program.
+        A pipeline that transforms text strings into annotated spaCy Doc objects.
+        Usually you'll load the Language pipeline once and pass the instance
+        around your program.

     +code("python", "Overview").
         class Language:
             lang = None
-            def __init__(self, data_dir=None, tokenizer=None, tagger=None, parser=None, entity=None, matcher=None):
-                return self
+            Defaults = BaseDefaults

-            def __call__(self, text, tag=True, parse=True, entity=True):
-                return Doc()
+            def __init__(self, path=True, **overrides):
+                self.vocab = Vocab()
+                self.tokenizer = Tokenizer()
+                self.tagger = Tagger()
+                self.parser = DependencyParser()
+                self.entity = EntityRecognizer()
+                self.make_doc = lambda text: Doc()
+                self.pipeline = [self.tagger, self.parser, self.entity]

-            def pipe(self, texts_iterator, batch_size=1000, n_threads=2):
-                yield Doc()
+            def __call__(self, text, **toggle):
+                doc = self.make_doc(text)
+                for process in self.pipeline:
+                    if toggle.get(process.name, True):
+                        process(doc)
+                return doc

-            def end_training(self, data_dir=None):
+            def pipe(self, texts_iterator, batch_size=1000, n_threads=2, **toggle):
+                docs = (self.make_doc(text) for text in texts_iterator)
+                for process in self.pipeline:
+                    if toggle.get(process.name, True):
+                        docs = process.pipe(docs, batch_size=batch_size, n_threads=n_threads)
+                for doc in docs:
+                    yield doc
+
+            def end_training(self, path=None):
                 return None

-        class English(Language):
-            lang = "en"
+        class English(Language):
+            class Defaults(BaseDefaults):
+                pass

-        class German(Language):
-            lang = "de"
+        class German(Language):
+            class Defaults(BaseDefaults):
+                pass
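
A minimal usage sketch of the API documented in the Overview above (not part of the diff; assumes a spaCy 1.x-era install, where English subclasses Language):

    from spacy.en import English

    nlp = English()                       # slow: loads models; do this once per process
    doc = nlp(u'Some text to annotate.')  # fast: returns an annotated Doc
    for token in doc:
        print(token.orth_, token.tag_, token.dep_)
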
 +section("english-init")
     +h(3, "english-init")
-        | #[+tag method] English.__init__
+        | #[+tag method] Language.__init__

     p
-        | Load the pipeline. Each component can be passed
-        | as an argument, or left as #[code None], in which case it will be loaded
-        | from a classmethod, named e.g. #[code default_vocab()].
+        | Load the pipeline. You can disable components by passing None as a value,
+        | e.g. pass parser=None, vectors=None to save memory if you're not using
+        | those components. You can also pass an object as the value.
+        | Pass a function create_pipeline to use a custom pipeline --- see
+        | the custom pipeline tutorial.

     +aside("Efficiency").
         Loading takes 10-20 seconds, and the instance consumes 2 to 3
         gigabytes of memory. Intended use is for one instance to be
         created for each language per process, but you can create more
-        if you"re doing something unusual. You may wish to make the
+        if you're doing something unusual. You may wish to make the
         instance a global variable or "singleton".

     +table(["Example", "Description"])
         +row
-            +cell #[code.lang-python nlp = English()]
-            +cell Load everything, from default package
+            +cell #[code nlp = English()]
+            +cell Load everything, from default path.

         +row
-            +cell #[code.lang-python nlp = English(data_dir='my_data')]
-            +cell Load everything, from specified dir
+            +cell #[code nlp = English(path='my_data')]
+            +cell Load everything, from specified path

         +row
-            +cell #[code.lang-python nlp = English(parser=False)]
-            +cell Load everything except the parser.
+            +cell #[code nlp = English(path=path_obj)]
+            +cell Load everything, from an object that follows the #[code pathlib.Path] protocol.

         +row
-            +cell #[code.lang-python nlp = English(parser=False, tagger=False)]
-            +cell Load everything except the parser and tagger.
+            +cell #[code nlp = English(parser=False, vectors=False)]
+            +cell Load everything except the parser and the word vectors.

         +row
-            +cell #[code.lang-python nlp = English(parser=MyParser())]
-            +cell Supply your own parser
+            +cell #[code nlp = English(parser=my_parser)]
+            +cell Load everything, and use a custom parser.

+        +row
+            +cell #[code nlp = English(create_pipeline=my_pipeline)]
+            +cell Load everything, and use a custom pipeline.
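
The override patterns from the table above, sketched as runnable code (not part of the diff; my_parser and my_pipeline are hypothetical user-supplied callables):

    from spacy.en import English

    # Skip loading components you don't need, to save memory and load time.
    nlp_light = English(parser=False, vectors=False)

    # create_pipeline receives the Language instance and returns the sequence
    # of processes that __call__ applies; here, tagging plus entity recognition.
    def my_pipeline(nlp):
        return [nlp.tagger, nlp.entity]

    nlp_custom = English(create_pipeline=my_pipeline)
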
|
||||
+code("python", "Definition").
|
||||
def __init__(self, data_dir=None, tokenizer=None, tagger=None, parser=None, entity=None, matcher=None):
|
||||
return self
|
||||
def __init__(self, path=True, **overrides):
|
||||
D = self.Defaults
|
||||
self.vocab = Vocab(path=path, parent=self, **D.vocab) \
|
||||
if 'vocab' not in overrides \
|
||||
else overrides['vocab']
|
||||
self.tokenizer = Tokenizer(self.vocab, path=path, **D.tokenizer) \
|
||||
if 'tokenizer' not in overrides \
|
||||
else overrides['tokenizer']
|
||||
self.tagger = Tagger(self.vocab, path=path, **D.tagger) \
|
||||
if 'tagger' not in overrides \
|
||||
else overrides['tagger']
|
||||
self.parser = DependencyParser(self.vocab, path=path, **D.parser) \
|
||||
if 'parser' not in overrides \
|
||||
else overrides['parser']
|
||||
self.entity = EntityRecognizer(self.vocab, path=path, **D.entity) \
|
||||
if 'entity' not in overrides \
|
||||
else overrides['entity']
|
||||
self.matcher = Matcher(self.vocab, path=path, **D.matcher) \
|
||||
if 'matcher' not in overrides \
|
||||
else overrides['matcher']
|
||||
|
||||
+table(["Arg", "Type", "Description"])
|
||||
+row
|
||||
+cell data_dir
|
||||
+cell str
|
||||
+cell.
|
||||
The data directory. If None, value is obtained via the
|
||||
#[code default_data_dir()] method.
|
||||
if 'make_doc' in overrides:
|
||||
self.make_doc = overrides['make_doc']
|
||||
elif 'create_make_doc' in overrides:
|
||||
self.make_doc = overrides['create_make_doc'](self)
|
||||
else:
|
||||
self.make_doc = lambda text: self.tokenizer(text)
|
||||
if 'pipeline' in overrides:
|
||||
self.pipeline = overrides['pipeline']
|
||||
elif 'create_pipeline' in overrides:
|
||||
self.pipeline = overrides['create_pipeline'](self)
|
||||
else:
|
||||
self.pipeline = [self.tagger, self.parser, self.matcher, self.entity]
|
||||
|
||||
+row
|
||||
+cell vocab
|
||||
+cell #[code Vocab]
|
||||
+cell.
|
||||
The vocab object, which should be an instance of class
|
||||
#[code spacy.vocab.Vocab]. If #[code None], the object is
|
||||
obtained from the #[code default_vocab()] class method. The
|
||||
vocab object manages all of the language specific rules and
|
||||
definitions, maintains the cache of lexical types, and manages
|
||||
the word vectors. Because the vocab owns this important data,
|
||||
most objects hold a reference to the vocab.
|
||||
|
||||
+row
|
||||
+cell tokenizer
|
||||
+cell #[code Tokenizer]
|
||||
+cell.
|
||||
The tokenizer, which should be a callable that accepts a
|
||||
unicode string, and returns a #[code Doc] object. If set to
|
||||
#[code None], the default tokenizer is constructed from the
|
||||
#[code default_tokenizer()] method.
|
||||
|
||||
+row
|
||||
+cell tagger
|
||||
+cell #[code Tagger]
|
||||
+cell.
|
||||
The part-of-speech tagger, which should be a callable that
|
||||
accepts a #[code Doc] object, and sets the part-of-speech
|
||||
tags in-place. If set to None, the default tagger is constructed
|
||||
from the #[code default_tagger()] method.
|
||||
|
||||
+row
|
||||
+cell parser
|
||||
+cell #[code Parser]
|
||||
+cell.
|
||||
The dependency parser, which should be a callable that accepts
|
||||
a #[code Doc] object, and sets the sentence boundaries,
|
||||
syntactic heads and dependency labels in-place.
|
||||
If set to #[code None], the default parser is
|
||||
constructed from the #[code default_parser()] method. To disable
|
||||
the parser and prevent it from being loaded, pass #[code parser=False].
|
||||
|
||||
+row
|
||||
+cell entity
|
||||
+cell #[code Parser]
|
||||
+cell.
|
||||
The named entity recognizer, which should be a callable that
|
||||
accepts a #[code Doc] object, and sets the named entity annotations
|
||||
in-place. If set to None, the default entity recognizer is
|
||||
constructed from the #[code default_entity()] method. To disable
|
||||
the entity recognizer and prevent it from being loaded, pass
|
||||
#[code entity=False].
|
||||
|
||||
+row
|
||||
+cell matcher
|
||||
+cell #[code Matcher]
|
||||
+cell.
|
||||
The pattern matcher, which should be a callable that accepts
|
||||
a #[code Doc] object, and sets named entity annotations in-place
|
||||
using token-based rules. If set
|
||||
to None, the default matcher is constructed from the
|
||||
#[code default_matcher()] method.
|
||||
|
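
A sketch of the make_doc hooks from the Definition above (not part of the diff; the whitespace-stripping step is purely illustrative):

    from spacy.en import English

    def create_make_doc(nlp):
        # Receives the Language instance; returns the callable used as nlp.make_doc.
        def make_doc(text):
            return nlp.tokenizer(text.strip())  # illustrative pre-processing
        return make_doc

    nlp = English(create_make_doc=create_make_doc)
    doc = nlp(u'  Some text with stray whitespace.  ')
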
-+section("english-call")
-    +h(3, "english-call")
-        | #[+tag method] English.__call__
++section("language-call")
+    +h(3, "language-call")
+        | #[+tag method] Language.__call__

     p
         | The main entry point to spaCy. Takes raw unicode text, and returns

@@ -152,30 +136,30 @@
         | and #[code Span] objects.

     +aside("Efficiency").
-        spaCy"s algorithms are all linear-time, so you can supply
+        spaCy's algorithms are all linear-time, so you can supply
         documents of arbitrary length, e.g. whole novels.
     +table(["Example", "Description"], "code")
         +row
-            +cell #[code.lang-python doc = nlp(u'Some text.')]
+            +cell #[code doc = nlp(u'Some text.')]
             +cell Apply the full pipeline.
         +row
-            +cell #[code.lang-python doc = nlp(u'Some text.', parse=False)]
+            +cell #[code doc = nlp(u'Some text.', parse=False)]
             +cell Applies tagger and entity, not parser
         +row
-            +cell #[code.lang-python doc = nlp(u'Some text.', entity=False)]
+            +cell #[code doc = nlp(u'Some text.', entity=False)]
             +cell Applies tagger and parser, not entity.
         +row
-            +cell #[code.lang-python doc = nlp(u'Some text.', tag=False)]
+            +cell #[code doc = nlp(u'Some text.', tag=False)]
             +cell Does not apply tagger, entity or parser
         +row
-            +cell #[code.lang-python doc = nlp(u'')]
+            +cell #[code doc = nlp(u'')]
             +cell Zero-length tokens, not an error
         +row
-            +cell #[code.lang-python doc = nlp(b'Some text')]
+            +cell #[code doc = nlp(b'Some text')]
             +cell Error: need unicode
         +row
-            +cell #[code.lang-python doc = nlp(b'Some text'.decode('utf8'))]
+            +cell #[code doc = nlp(b'Some text'.decode('utf8'))]
             +cell Decode bytes into unicode first.
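
The per-call toggles from the table above, as a sketch (not part of the diff; assumes the tag/parse/entity keywords behave as documented for spaCy 1.x):

    doc = nlp(u'An example sentence.', parse=False)  # tagger and entity run; parser is skipped
    assert not doc.is_parsed

    # pipe() streams documents through the pipeline with minibatching and threads.
    for doc in nlp.pipe([u'First doc.', u'Second doc.'], batch_size=50, n_threads=4):
        print(doc[0].tag_)
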
     +code("python", "Definition").
@@ -8,7 +8,7 @@
             ["Usage Examples", "#examples", "examples"]
         ],
         "API": [
-            ["English", "#english", "english"],
+            ["Language", "#language", "language"],
             ["Doc", "#doc", "doc"],
             ["Token", "#token", "token"],
             ["Span", "#span", "span"],
@@ -13,7 +13,7 @@ include _quickstart-examples

 +h(2, "api") API

-include _api-english
+include _api-language
 include _api-doc
 include _api-token
 include _api-span