diff --git a/website/docs/_api-english.jade b/website/docs/_api-language.jade
similarity index 51%
rename from website/docs/_api-english.jade
rename to website/docs/_api-language.jade
index 2a951a8a4..fae3916b2 100644
--- a/website/docs/_api-english.jade
+++ b/website/docs/_api-language.jade
@@ -1,150 +1,134 @@
 //- ----------------------------------
-//- 💫 DOCS > API > ENGLISH
+//- 💫 DOCS > API > LANGUAGE
 //- ----------------------------------

-+section("english")
-    +h(2, "english", "https://github.com/" + SOCIAL.github + "/spaCy/blob/master/spacy/language.py")
-        | #[+tag class] English(Language)
++section("language")
+    +h(2, "language", "https://github.com/" + SOCIAL.github + "/spaCy/blob/master/spacy/language.py")
+        | #[+tag class] Language

     p.
-        The English analysis pipeline. Usually you"ll load this once per process,
-        and pass the instance around your program.
+        A pipeline that transforms text strings into annotated spaCy Doc
+        objects. Usually you'll load the Language pipeline once and pass the
+        instance around your program.

     +code("python", "Overview").
         class Language:
-            lang = None
-
-            def __init__(self, data_dir=None, tokenizer=None, tagger=None, parser=None, entity=None, matcher=None):
-                return self
+            Defaults = BaseDefaults

-            def __call__(self, text, tag=True, parse=True, entity=True):
-                return Doc()
+            def __init__(self, path=True, **overrides):
+                self.vocab = Vocab()
+                self.tokenizer = Tokenizer()
+                self.tagger = Tagger()
+                self.parser = DependencyParser()
+                self.entity = EntityRecognizer()
+                self.make_doc = lambda text: Doc()
+                self.pipeline = [self.tagger, self.parser, self.entity]

-            def pipe(self, texts_iterator, batch_size=1000, n_threads=2):
-                yield Doc()
+            def __call__(self, text, **toggle):
+                doc = self.make_doc(text)
+                for process in self.pipeline:
+                    if toggle.get(process.name, True):
+                        process(doc)
+                return doc

-            def end_training(self, data_dir=None):
+            def pipe(self, texts_iterator, batch_size=1000, n_threads=2, **toggle):
+                docs = (self.make_doc(text) for text in texts_iterator)
+                for process in self.pipeline:
+                    if toggle.get(process.name, True):
+                        docs = process.pipe(docs, batch_size=batch_size, n_threads=n_threads)
+                for doc in docs:
+                    yield doc
+
+            def end_training(self, path=None):
                 return None

-        class English(Language):
-            lang = "en"
+        class English(Language):
+            class Defaults(BaseDefaults):
+                pass

-        class German(Language):
-            lang = "de"
+        class German(Language):
+            class Defaults(BaseDefaults):
+                pass

     +section("english-init")
         +h(3, "english-init")
-            | #[+tag method] English.__init__
+            | #[+tag method] Language.__init__

         p
-            | Load the pipeline. Each component can be passed
-            | as an argument, or left as #[code None], in which case it will be loaded
-            | from a classmethod, named e.g. #[code default_vocab()].
+            | Load the pipeline. You can disable components by passing #[code None]
+            | as a value, e.g. pass #[code parser=None, vectors=None] to save memory
+            | if you're not using those components. You can also pass an object as
+            | the value. To use a custom pipeline, pass a function as the
+            | #[code create_pipeline] keyword argument (see the custom pipeline
+            | tutorial).

         +aside("Efficiency").
             Loading takes 10-20 seconds, and the instance consumes 2 to 3
             gigabytes of memory. Intended use is for one instance to be
             created for each language per process, but you can create more
-            if you"re doing something unusual. You may wish to make the
+            if you're doing something unusual. You may wish to make the
             instance a global variable or "singleton".
+table(["Example", "Description"]) +row - +cell #[code.lang-python nlp = English()] - +cell Load everything, from default package + +cell #[code nlp = English()] + +cell Load everything, from default path. +row - +cell #[code.lang-python nlp = English(data_dir='my_data')] - +cell Load everything, from specified dir + +cell #[code nlp = English(path='my_data')] + +cell Load everything, from specified path +row - +cell #[code.lang-python nlp = English(parser=False)] - +cell Load everything except the parser. + +cell #[code nlp = English(path=path_obj)] + +cell Load everything, from an object that follows the #[code pathlib.Path] protocol. +row - +cell #[code.lang-python nlp = English(parser=False, tagger=False)] - +cell Load everything except the parser and tagger. + +cell #[code nlp = English(parser=False, vectors=False)] + +cell Load everything except the parser and the word vectors. +row - +cell #[code.lang-python nlp = English(parser=MyParser())] - +cell Supply your own parser + +cell #[code nlp = English(parser=my_parser)] + +cell Load everything, and use a custom parser. + + +row + +cell #[code nlp = English(create_pipeline=my_pipeline)] + +cell Load everything, and use a custom pipeline. +code("python", "Definition"). - def __init__(self, data_dir=None, tokenizer=None, tagger=None, parser=None, entity=None, matcher=None): - return self + def __init__(self, path=True, **overrides): + D = self.Defaults + self.vocab = Vocab(path=path, parent=self, **D.vocab) \ + if 'vocab' not in overrides \ + else overrides['vocab'] + self.tokenizer = Tokenizer(self.vocab, path=path, **D.tokenizer) \ + if 'tokenizer' not in overrides \ + else overrides['tokenizer'] + self.tagger = Tagger(self.vocab, path=path, **D.tagger) \ + if 'tagger' not in overrides \ + else overrides['tagger'] + self.parser = DependencyParser(self.vocab, path=path, **D.parser) \ + if 'parser' not in overrides \ + else overrides['parser'] + self.entity = EntityRecognizer(self.vocab, path=path, **D.entity) \ + if 'entity' not in overrides \ + else overrides['entity'] + self.matcher = Matcher(self.vocab, path=path, **D.matcher) \ + if 'matcher' not in overrides \ + else overrides['matcher'] - +table(["Arg", "Type", "Description"]) - +row - +cell data_dir - +cell str - +cell. - The data directory. If None, value is obtained via the - #[code default_data_dir()] method. + if 'make_doc' in overrides: + self.make_doc = overrides['make_doc'] + elif 'create_make_doc' in overrides: + self.make_doc = overrides['create_make_doc'](self) + else: + self.make_doc = lambda text: self.tokenizer(text) + if 'pipeline' in overrides: + self.pipeline = overrides['pipeline'] + elif 'create_pipeline' in overrides: + self.pipeline = overrides['create_pipeline'](self) + else: + self.pipeline = [self.tagger, self.parser, self.matcher, self.entity] - +row - +cell vocab - +cell #[code Vocab] - +cell. - The vocab object, which should be an instance of class - #[code spacy.vocab.Vocab]. If #[code None], the object is - obtained from the #[code default_vocab()] class method. The - vocab object manages all of the language specific rules and - definitions, maintains the cache of lexical types, and manages - the word vectors. Because the vocab owns this important data, - most objects hold a reference to the vocab. - - +row - +cell tokenizer - +cell #[code Tokenizer] - +cell. - The tokenizer, which should be a callable that accepts a - unicode string, and returns a #[code Doc] object. 
-                    If set to
-                    #[code None], the default tokenizer is constructed from the
-                    #[code default_tokenizer()] method.
-
-            +row
-                +cell tagger
-                +cell #[code Tagger]
-                +cell.
-                    The part-of-speech tagger, which should be a callable that
-                    accepts a #[code Doc] object, and sets the part-of-speech
-                    tags in-place. If set to None, the default tagger is constructed
-                    from the #[code default_tagger()] method.
-
-            +row
-                +cell parser
-                +cell #[code Parser]
-                +cell.
-                    The dependency parser, which should be a callable that accepts
-                    a #[code Doc] object, and sets the sentence boundaries,
-                    syntactic heads and dependency labels in-place.
-                    If set to #[code None], the default parser is
-                    constructed from the #[code default_parser()] method. To disable
-                    the parser and prevent it from being loaded, pass #[code parser=False].
-
-            +row
-                +cell entity
-                +cell #[code Parser]
-                +cell.
-                    The named entity recognizer, which should be a callable that
-                    accepts a #[code Doc] object, and sets the named entity annotations
-                    in-place. If set to None, the default entity recognizer is
-                    constructed from the #[code default_entity()] method. To disable
-                    the entity recognizer and prevent it from being loaded, pass
-                    #[code entity=False].
-
-            +row
-                +cell matcher
-                +cell #[code Matcher]
-                +cell.
-                    The pattern matcher, which should be a callable that accepts
-                    a #[code Doc] object, and sets named entity annotations in-place
-                    using token-based rules. If set
-                    to None, the default matcher is constructed from the
-                    #[code default_matcher()] method.
-
-    +section("english-call")
-        +h(3, "english-call")
-            | #[+tag method] English.__call__
+    +section("language-call")
+        +h(3, "language-call")
+            | #[+tag method] Language.__call__

         p
             | The main entry point to spaCy. Takes raw unicode text, and returns
@@ -152,30 +136,30 @@
             | and #[code Span] objects.

         +aside("Efficiency").
-            spaCy"s algorithms are all linear-time, so you can supply
+            spaCy's algorithms are all linear-time, so you can supply
             documents of arbitrary length, e.g. whole novels.

         +table(["Example", "Description"], "code")
             +row
-                +cell #[code.lang-python doc = nlp(u'Some text.')]
+                +cell #[code doc = nlp(u'Some text.')]
                 +cell Apply the full pipeline.
             +row
-                +cell #[code.lang-python doc = nlp(u'Some text.', parse=False)]
+                +cell #[code doc = nlp(u'Some text.', parse=False)]
                 +cell Applies tagger and entity, not parser
             +row
-                +cell #[code.lang-python doc = nlp(u'Some text.', entity=False)]
+                +cell #[code doc = nlp(u'Some text.', entity=False)]
                 +cell Applies tagger and parser, not entity.
             +row
-                +cell #[code.lang-python doc = nlp(u'Some text.', tag=False)]
+                +cell #[code doc = nlp(u'Some text.', tag=False)]
                 +cell Does not apply tagger, entity or parser
             +row
-                +cell #[code.lang-python doc = nlp(u'')]
+                +cell #[code doc = nlp(u'')]
                 +cell Zero-length tokens, not an error
             +row
-                +cell #[code.lang-python doc = nlp(b'Some text')]
+                +cell #[code doc = nlp(b'Some text')]
                 +cell Error: need unicode
             +row
-                +cell #[code.lang-python doc = nlp(b'Some text'.decode('utf8'))]
+                +cell #[code doc = nlp(b'Some text'.decode('utf8'))]
                 +cell Decode bytes into unicode first.

         +code("python", "Definition").
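Reviewer note: a minimal usage sketch of the per-call toggles and streaming API documented in the diff above, based only on the signatures it shows. It assumes the English model data is installed; the example texts are placeholders, not from the docs.

# Sketch of the toggling and pipe() behaviour described in the new docs.
from spacy.en import English

nlp = English()                                    # load the full pipeline once
doc = nlp(u'Some text to annotate.')               # apply every pipeline process
doc = nlp(u'Some text to annotate.', parse=False)  # skip the dependency parser

# Stream many documents through the pipeline; per the pipe() definition above,
# each process minibatches and multi-threads internally via its own .pipe().
texts = [u'First document.', u'Second document.']
for doc in nlp.pipe(texts, batch_size=1000, n_threads=2):
    print(doc[0].tag_)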
diff --git a/website/docs/_data.json b/website/docs/_data.json
index cde95e48b..37cafbc20 100644
--- a/website/docs/_data.json
+++ b/website/docs/_data.json
@@ -8,7 +8,7 @@
             ["Usage Examples", "#examples", "examples"]
         ],
         "API": [
-            ["English", "#english", "english"],
+            ["Language", "#language", "language"],
             ["Doc", "#doc", "doc"],
             ["Token", "#token", "token"],
             ["Span", "#span", "span"],
diff --git a/website/docs/index.jade b/website/docs/index.jade
index 043021193..9d745777e 100644
--- a/website/docs/index.jade
+++ b/website/docs/index.jade
@@ -13,7 +13,7 @@ include _quickstart-examples

 +h(2, "api") API

-include _api-english
+include _api-language
 include _api-doc
 include _api-token
 include _api-span
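Similarly, a hedged sketch of the create_pipeline hook mentioned in the __init__ docs: per the Definition shown above, the callback receives the Language instance and its return value becomes nlp.pipeline. The function name my_pipeline and the choice of components are illustrative only.

# Illustrative only: assemble a custom pipeline via create_pipeline.
from spacy.en import English

def my_pipeline(nlp):
    # Return the sequence of processes to apply, in order.
    # Here the matcher and entity recognizer are left out entirely.
    return [nlp.tagger, nlp.parser]

nlp = English(create_pipeline=my_pipeline)
doc = nlp(u'Only the tagger and parser run on this text.')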