From f8322a69e77719c185b3a97efcab322fd1207636 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Fri, 21 Oct 2016 00:58:24 +0200
Subject: [PATCH] Rename "English" section to "Language"

---
 .../{_api-english.jade => _api-language.jade} | 214 ++++++++----------
 website/docs/_data.json                       |   2 +-
 website/docs/index.jade                       |   2 +-
 3 files changed, 101 insertions(+), 117 deletions(-)
 rename website/docs/{_api-english.jade => _api-language.jade} (51%)

diff --git a/website/docs/_api-english.jade b/website/docs/_api-language.jade
similarity index 51%
rename from website/docs/_api-english.jade
rename to website/docs/_api-language.jade
index 2a951a8a4..fae3916b2 100644
--- a/website/docs/_api-english.jade
+++ b/website/docs/_api-language.jade
@@ -1,150 +1,134 @@
 //- ----------------------------------
-//- 💫 DOCS > API > ENGLISH
+//- 💫 DOCS > API > LANGUAGE
 //- ----------------------------------

-+section("english")
-    +h(2, "english", "https://github.com/" + SOCIAL.github + "/spaCy/blob/master/spacy/language.py")
-        | #[+tag class] English(Language)
++section("language")
+    +h(2, "language", "https://github.com/" + SOCIAL.github + "/spaCy/blob/master/spacy/language.py")
+        | #[+tag class] Language

     p.
-        The English analysis pipeline. Usually you"ll load this once per process,
-        and pass the instance around your program.
+        A pipeline that transforms text strings into annotated spaCy Doc objects.
+        Usually you'll load the Language pipeline once and pass the instance
+        around your program.

     +code("python", "Overview").
         class Language:
-            lang = None
-            def __init__(self, data_dir=None, tokenizer=None, tagger=None, parser=None, entity=None, matcher=None):
-                return self
+            Defaults = BaseDefaults

-            def __call__(self, text, tag=True, parse=True, entity=True):
-                return Doc()
+            def __init__(self, path=True, **overrides):
+                self.vocab = Vocab()
+                self.tokenizer = Tokenizer()
+                self.tagger = Tagger()
+                self.parser = DependencyParser()
+                self.entity = EntityRecognizer()
+                self.make_doc = lambda text: Doc()
+                self.pipeline = [self.tagger, self.parser, self.entity]

-            def pipe(self, texts_iterator, batch_size=1000, n_threads=2):
-                yield Doc()
+            def __call__(self, text, **toggle):
+                doc = self.make_doc(text)
+                for process in self.pipeline:
+                    if toggle.get(process.name, True):
+                        process(doc)
+                return doc

-            def end_training(self, data_dir=None):
+            def pipe(self, texts_iterator, batch_size=1000, n_threads=2, **toggle):
+                docs = (self.make_doc(text) for text in texts_iterator)
+                for process in self.pipeline:
+                    if toggle.get(process.name, True):
+                        docs = process.pipe(docs, batch_size=batch_size, n_threads=n_threads)
+                for doc in docs:
+                    yield doc
+
+            def end_training(self, path=None):
                 return None

-        class English(Language):
-            lang = "en"
+        class English(Language):
+            class Defaults(BaseDefaults):
+                pass

-        class German(Language):
-            lang = "de"
+        class German(Language):
+            class Defaults(BaseDefaults):
+                pass

     +section("english-init")
         +h(3, "english-init")
-            | #[+tag method] English.__init__
+            | #[+tag method] Language.__init__

         p
-            | Load the pipeline. Each component can be passed
-            | as an argument, or left as #[code None], in which case it will be loaded
-            | from a classmethod, named e.g. #[code default_vocab()].
+            | Load the pipeline. You can disable components by passing False as a value,
+            | e.g. pass parser=False, vectors=False to save memory if you're not using
+            | those components. You can also pass an object as the value.
+            | Pass a function create_pipeline to use a custom pipeline --- see
+            | the custom pipeline tutorial.

         +aside("Efficiency").
             Loading takes 10-20 seconds, and the instance consumes 2 to 3
             gigabytes of memory. Intended use is for one instance to be
             created for each language per process, but you can create more
-            if you"re doing something unusual. You may wish to make the
+            if you're doing something unusual. You may wish to make the
             instance a global variable or "singleton".

         +table(["Example", "Description"])
             +row
-                +cell #[code.lang-python nlp = English()]
-                +cell Load everything, from default package
+                +cell #[code nlp = English()]
+                +cell Load everything, from default path.

             +row
-                +cell #[code.lang-python nlp = English(data_dir='my_data')]
-                +cell Load everything, from specified dir
+                +cell #[code nlp = English(path='my_data')]
+                +cell Load everything, from specified path

             +row
-                +cell #[code.lang-python nlp = English(parser=False)]
-                +cell Load everything except the parser.
+                +cell #[code nlp = English(path=path_obj)]
+                +cell Load everything, from an object that follows the #[code pathlib.Path] protocol.

             +row
-                +cell #[code.lang-python nlp = English(parser=False, tagger=False)]
-                +cell Load everything except the parser and tagger.
+                +cell #[code nlp = English(parser=False, vectors=False)]
+                +cell Load everything except the parser and the word vectors.

             +row
-                +cell #[code.lang-python nlp = English(parser=MyParser())]
-                +cell Supply your own parser
+                +cell #[code nlp = English(parser=my_parser)]
+                +cell Load everything, and use a custom parser.
+
+            +row
+                +cell #[code nlp = English(create_pipeline=my_pipeline)]
+                +cell Load everything, and use a custom pipeline.

         +code("python", "Definition").
-            def __init__(self, data_dir=None, tokenizer=None, tagger=None, parser=None, entity=None, matcher=None):
-                return self
+            def __init__(self, path=True, **overrides):
+                D = self.Defaults
+                self.vocab = Vocab(path=path, parent=self, **D.vocab) \
+                             if 'vocab' not in overrides \
+                             else overrides['vocab']
+                self.tokenizer = Tokenizer(self.vocab, path=path, **D.tokenizer) \
+                                 if 'tokenizer' not in overrides \
+                                 else overrides['tokenizer']
+                self.tagger = Tagger(self.vocab, path=path, **D.tagger) \
+                              if 'tagger' not in overrides \
+                              else overrides['tagger']
+                self.parser = DependencyParser(self.vocab, path=path, **D.parser) \
+                              if 'parser' not in overrides \
+                              else overrides['parser']
+                self.entity = EntityRecognizer(self.vocab, path=path, **D.entity) \
+                              if 'entity' not in overrides \
+                              else overrides['entity']
+                self.matcher = Matcher(self.vocab, path=path, **D.matcher) \
+                               if 'matcher' not in overrides \
+                               else overrides['matcher']

-        +table(["Arg", "Type", "Description"])
-            +row
-                +cell data_dir
-                +cell str
-                +cell.
-                    The data directory. If None, value is obtained via the
-                    #[code default_data_dir()] method.
+                if 'make_doc' in overrides:
+                    self.make_doc = overrides['make_doc']
+                elif 'create_make_doc' in overrides:
+                    self.make_doc = overrides['create_make_doc'](self)
+                else:
+                    self.make_doc = lambda text: self.tokenizer(text)
+                if 'pipeline' in overrides:
+                    self.pipeline = overrides['pipeline']
+                elif 'create_pipeline' in overrides:
+                    self.pipeline = overrides['create_pipeline'](self)
+                else:
+                    self.pipeline = [self.tagger, self.parser, self.matcher, self.entity]

-            +row
-                +cell vocab
-                +cell #[code Vocab]
-                +cell.
-                    The vocab object, which should be an instance of class
-                    #[code spacy.vocab.Vocab]. If #[code None], the object is
-                    obtained from the #[code default_vocab()] class method.
-                    The vocab object manages all of the language specific rules and
-                    definitions, maintains the cache of lexical types, and manages
-                    the word vectors. Because the vocab owns this important data,
-                    most objects hold a reference to the vocab.
-
-            +row
-                +cell tokenizer
-                +cell #[code Tokenizer]
-                +cell.
-                    The tokenizer, which should be a callable that accepts a
-                    unicode string, and returns a #[code Doc] object. If set to
-                    #[code None], the default tokenizer is constructed from the
-                    #[code default_tokenizer()] method.
-
-            +row
-                +cell tagger
-                +cell #[code Tagger]
-                +cell.
-                    The part-of-speech tagger, which should be a callable that
-                    accepts a #[code Doc] object, and sets the part-of-speech
-                    tags in-place. If set to None, the default tagger is constructed
-                    from the #[code default_tagger()] method.
-
-            +row
-                +cell parser
-                +cell #[code Parser]
-                +cell.
-                    The dependency parser, which should be a callable that accepts
-                    a #[code Doc] object, and sets the sentence boundaries,
-                    syntactic heads and dependency labels in-place.
-                    If set to #[code None], the default parser is
-                    constructed from the #[code default_parser()] method. To disable
-                    the parser and prevent it from being loaded, pass #[code parser=False].
-
-            +row
-                +cell entity
-                +cell #[code Parser]
-                +cell.
-                    The named entity recognizer, which should be a callable that
-                    accepts a #[code Doc] object, and sets the named entity annotations
-                    in-place. If set to None, the default entity recognizer is
-                    constructed from the #[code default_entity()] method. To disable
-                    the entity recognizer and prevent it from being loaded, pass
-                    #[code entity=False].
-
-            +row
-                +cell matcher
-                +cell #[code Matcher]
-                +cell.
-                    The pattern matcher, which should be a callable that accepts
-                    a #[code Doc] object, and sets named entity annotations in-place
-                    using token-based rules. If set
-                    to None, the default matcher is constructed from the
-                    #[code default_matcher()] method.
-
-    +section("english-call")
-        +h(3, "english-call")
-            | #[+tag method] English.__call__
+    +section("language-call")
+        +h(3, "language-call")
+            | #[+tag method] Language.__call__

         p
             | The main entry point to spaCy. Takes raw unicode text, and returns
@@ -152,30 +136,30 @@
             | and #[code Span] objects.

         +aside("Efficiency").
-            spaCy"s algorithms are all linear-time, so you can supply
+            spaCy's algorithms are all linear-time, so you can supply
             documents of arbitrary length, e.g. whole novels.

         +table(["Example", "Description"], "code")
             +row
-                +cell #[code.lang-python doc = nlp(u'Some text.')]
+                +cell #[code doc = nlp(u'Some text.')]
                 +cell Apply the full pipeline.
             +row
-                +cell #[code.lang-python doc = nlp(u'Some text.', parse=False)]
+                +cell #[code doc = nlp(u'Some text.', parse=False)]
                 +cell Applies tagger and entity, not parser
             +row
-                +cell #[code.lang-python doc = nlp(u'Some text.', entity=False)]
+                +cell #[code doc = nlp(u'Some text.', entity=False)]
                 +cell Applies tagger and parser, not entity.
             +row
-                +cell #[code.lang-python doc = nlp(u'Some text.', tag=False)]
+                +cell #[code doc = nlp(u'Some text.', tag=False)]
                 +cell Does not apply tagger, entity or parser
             +row
-                +cell #[code.lang-python doc = nlp(u'')]
+                +cell #[code doc = nlp(u'')]
                 +cell Zero-length tokens, not an error
             +row
-                +cell #[code.lang-python doc = nlp(b'Some text')]
+                +cell #[code doc = nlp(b'Some text')]
                 +cell Error: need unicode
             +row
-                +cell #[code.lang-python doc = nlp(b'Some text'.decode('utf8'))]
+                +cell #[code doc = nlp(b'Some text'.decode('utf8'))]
                 +cell Decode bytes into unicode first.

         +code("python", "Definition").
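
For reference, a minimal usage sketch of the loading API documented in the _api-language.jade hunk above. It is illustrative only and not part of the patch: it assumes the spaCy 1.x-era spacy.en.English entry point, and the parser/vectors toggles and the create_pipeline hook follow the documentation above, so they may not hold for other versions.

    # Sketch only: assumes the spaCy 1.x-era API described in the docs above.
    from spacy.en import English

    # Load everything from the default data path.
    nlp = English()

    # Skip components you don't need to save memory (per the examples table above).
    nlp_light = English(parser=False, vectors=False)

    # Custom pipeline: the callable receives the Language instance and returns
    # the sequence of processing steps applied to each Doc.
    def my_pipeline(nlp):
        return [nlp.tagger, nlp.entity]

    nlp_custom = English(create_pipeline=my_pipeline)
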
diff --git a/website/docs/_data.json b/website/docs/_data.json
index cde95e48b..37cafbc20 100644
--- a/website/docs/_data.json
+++ b/website/docs/_data.json
@@ -8,7 +8,7 @@
             ["Usage Examples", "#examples", "examples"]
         ],
         "API": [
-            ["English", "#english", "english"],
+            ["Language", "#language", "language"],
             ["Doc", "#doc", "doc"],
             ["Token", "#token", "token"],
             ["Span", "#span", "span"],
diff --git a/website/docs/index.jade b/website/docs/index.jade
index 043021193..9d745777e 100644
--- a/website/docs/index.jade
+++ b/website/docs/index.jade
@@ -13,7 +13,7 @@ include _quickstart-examples
 
 +h(2, "api") API
 
-include _api-english
+include _api-language
 include _api-doc
 include _api-token
 include _api-span
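
Likewise, a hedged sketch of calling the pipeline and streaming texts with pipe(), following the __call__ and pipe signatures documented above; the per-call toggles (tag, parse, entity) and the batch_size/n_threads arguments are taken from those docs and are assumed to apply only to this spaCy generation.

    # Sketch only: __call__ and pipe() as documented above (spaCy 1.x era).
    from spacy.en import English

    nlp = English()

    # __call__ takes unicode text and returns a Doc; steps can be toggled per call.
    doc = nlp(u'Some text to analyse.')
    doc_no_parse = nlp(u'Some text to analyse.', parse=False)

    # pipe() streams many texts through the pipeline with minibatching and threads.
    texts = [u'First document.', u'Second document.', u'Third document.']
    for doc in nlp.pipe(texts, batch_size=1000, n_threads=2):
        print(doc[0].tag_)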