Update API docs and fix typos

2025-10-27 22:21:08 +03:00 · 2017-05-26 12:43:16 +02:00 · 2017-05-26 12:43:16 +02:00 · d48530835a
commit d48530835a
parent ea9474f71c
6 changed files with 153 additions and 68 deletions
--- a/website/docs/api/doc.jade
+++ b/website/docs/api/doc.jade
@ -278,7 +278,8 @@ p Loads state from a directory. Modifies the object in place and returns it.

 +aside-code("Example").
    from spacy.tokens import Doc
-    doc = Doc().from_disk('/path/to/doc')
+    from spacy.vocab import Vocab
+    doc = Doc(Vocab()).from_disk('/path/to/doc')

 +table(["Name", "Type", "Description"])
    +row
--- a/website/docs/api/lexeme.jade
+++ b/website/docs/api/lexeme.jade
@ -212,62 +212,74 @@ p The L2 norm of the lexeme's vector representation.
    +row
        +cell #[code is_alpha]
        +cell bool
-        +cell Equivalent to #[code word.orth_.isalpha()].
+        +cell
+            |  Does the lexeme consist of alphabetic characters? Equivalent to
+            |  #[code lexeme.text.isalpha()].

    +row
        +cell #[code is_ascii]
        +cell bool
-        +cell Equivalent to #[code [any(ord(c) >= 128 for c in word.orth_)]].
+        +cell
+            |  Does the lexeme consist of ASCII characters? Equivalent to
+            |  #[code not any(ord(c) >= 128 for c in lexeme.text)].

    +row
        +cell #[code is_digit]
        +cell bool
-        +cell Equivalent to #[code word.orth_.isdigit()].
+        +cell
+            |  Does the lexeme consist of digits? Equivalent to
+            |  #[code lexeme.text.isdigit()].

    +row
        +cell #[code is_lower]
        +cell bool
-        +cell Equivalent to #[code word.orth_.islower()].
+        +cell
+            |  Is the lexeme in lowercase? Equivalent to
+            |  #[code lexeme.text.islower()].

    +row
        +cell #[code is_title]
        +cell bool
-        +cell Equivalent to #[code word.orth_.istitle()].
+        +cell
+            |  Is the lexeme in titlecase? Equivalent to
+            |  #[code lexeme.text.istitle()].

    +row
        +cell #[code is_punct]
        +cell bool
-        +cell Equivalent to #[code word.orth_.ispunct()].
+        +cell Is the lexeme punctuation?

    +row
        +cell #[code is_space]
        +cell bool
-        +cell Equivalent to #[code word.orth_.isspace()].
+        +cell
+            |  Does the lexeme consist of whitespace characters? Equivalent to
+            |  #[code lexeme.text.isspace()].

    +row
        +cell #[code like_url]
        +cell bool
-        +cell Does the word resemble a URL?
+        +cell Does the lexeme resemble a URL?

    +row
        +cell #[code like_num]
        +cell bool
-        +cell Does the word represent a number? e.g. “10.9”, “10”, “ten”, etc.
+        +cell Does the lexeme represent a number? e.g. "10.9", "10", "ten", etc.

    +row
        +cell #[code like_email]
        +cell bool
-        +cell Does the word resemble an email address?
+        +cell Does the lexeme resemble an email address?

    +row
        +cell #[code is_oov]
        +cell bool
-        +cell Is the word out-of-vocabulary?
+        +cell Is the lexeme out-of-vocabulary?

    +row
        +cell #[code is_stop]
        +cell bool
-        +cell Is the word part of a "stop list"?
+        +cell Is the lexeme part of a "stop list"?

    +row
        +cell #[code lang]
--- a/website/docs/api/matcher.jade
+++ b/website/docs/api/matcher.jade
@ -5,13 +5,14 @@ include ../../_includes/_mixins
 p Match sequences of tokens, based on pattern rules.

 +infobox("⚠️ Deprecation note")
-    |  As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
-    |  are deprecated and have been replaced with a simpler
-    |  #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
-    |  patterns and a callback for a given match ID. #[code Matcher.get_entity]
-    |  is now called #[+api("matcher#get") #[code matcher.get]].
-    |  #[code Matcher.load] (not useful, as it didn't allow specifying callbacks),
-    |  and #[code Matcher.has_entity] (now redundant) have been removed.
+    .o-block
+        |  As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
+        |  are deprecated and have been replaced with a simpler
+        |  #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
+        |  patterns and a callback for a given match ID. #[code Matcher.get_entity]
+        |  is now called #[+api("matcher#get") #[code Matcher.get]].
+        |  #[code Matcher.load] (not useful, as it didn't allow specifying callbacks),
+        |  and #[code Matcher.has_entity] (now redundant) have been removed.

 +h(2, "init") Matcher.__init__
    +tag method
@ -56,17 +57,6 @@ p Find all token sequences matching the supplied patterns on the #[code Doc].
    doc = nlp(u'hello world!')
    matches = matcher(doc)

-+infobox("Important note")
-    |  By default, the matcher #[strong does not perform any action] on matches,
-    |  like tagging matched phrases with entity types. Instead, actions need to
-    |  be specified when #[strong adding patterns or entities], by
-    |  passing in a callback function as the #[code on_match] argument on
-    |  #[+api("matcher#add") #[code add]]. This allows you to define custom
-    |  actions per pattern within the same matcher. For example, you might only
-    |  want to merge some entity types, and set custom flags for other matched
-    |  patterns. For more details and examples, see the usage workflow on
-    |  #[+a("/docs/usage/rule-based-matching") rule-based matching].
-
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code doc]
@ -81,6 +71,17 @@ p Find all token sequences matching the supplied patterns on the #[code Doc].
            |  matches. A match tuple describes a span #[code doc[start:end]].
            |  The #[code match_id] is the ID of the added match pattern.

+infobox("Important note")
+    |  By default, the matcher #[strong does not perform any action] on matches,
+    |  like tagging matched phrases with entity types. Instead, actions need to
+    |  be specified when #[strong adding patterns or entities], by
+    |  passing in a callback function as the #[code on_match] argument on
+    |  #[+api("matcher#add") #[code add]]. This allows you to define custom
+    |  actions per pattern within the same matcher. For example, you might only
+    |  want to merge some entity types, and set custom flags for other matched
+    |  patterns. For more details and examples, see the usage workflow on
+    |  #[+a("/docs/usage/rule-based-matching") rule-based matching].
+
 +h(2, "pipe") Matcher.pipe
    +tag method

@ -201,6 +202,20 @@ p
            |  Match pattern. A pattern consists of a list of dicts, where each
            |  dict describes a token.

+infobox("⚠️ Deprecation note")
+    .o-block
+        |  As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
+        |  are deprecated and have been replaced with a simpler
+        |  #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
+        |  patterns and a callback for a given match ID.
+
+    +code-new.
+        matcher.add('GoogleNow', merge_phrases, [{ORTH: 'Google'}, {ORTH: 'Now'}])
+
+    +code-old.
+        matcher.add_entity('GoogleNow', on_match=merge_phrases)
+        matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])
+
 +h(2, "remove") Matcher.remove
    +tag method
    +tag-new(2)
--- a/website/docs/api/spacy.jade
+++ b/website/docs/api/spacy.jade
@ -20,12 +20,7 @@ p
    nlp = spacy.load('/path/to/en') # unicode path
    nlp = spacy.load(Path('/path/to/en')) # pathlib Path

-+infobox("⚠️ Deprecation note")
-    |  As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
-    |  will also raise an error if no model could be loaded and never just
-    |  return an empty #[code Language] object. If you need a blank language,
-    |  you need to import it explicitly (#[code from spacy.lang.en import English])
-    |  or use #[+api("util#get_lang_class") #[code util.get_lang_class]].
+    nlp = spacy.load('en', disable=['parser', 'tagger'])

 +table(["Name", "Type", "Description"])
    +row
@ -34,15 +29,28 @@ p
        +cell Model to load, i.e. shortcut link, package name or path.

    +row
-        +cell #[code **overrides]
-        +cell -
-        +cell Override or disable components.
+        +cell #[code disable]
+        +cell list
+        +cell
+            |  Names of pipeline components to
+            |  #[+a("/docs/usage/language-processing-pipeline#disabling") disable].

    +footrow
        +cell returns
        +cell #[code Language]
        +cell A #[code Language] object with the loaded model.

+infobox("⚠️ Deprecation note")
+    .o-block
+        |  As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
+        |  will also raise an error if no model could be loaded and never just
+        |  return an empty #[code Language] object. If you need a blank language,
+        |  you need to import it explicitly (#[code from spacy.lang.en import English])
+        |  or use #[+api("util#get_lang_class") #[code util.get_lang_class]].
+
+    +code-new nlp = spacy.load('/model')
+    +code-old nlp = spacy.load('en', path='/model')
+
 +h(2, "info") spacy.info
    +tag function

@ -98,3 +106,37 @@ p
        +cell returns
        +cell unicode
        +cell The explanation, or #[code None] if not found in the glossary.
+
+h(2, "set_factory") spacy.set_factory
+    +tag function
+    +tag-new(2)
+
+p
+    |  Set a factory that returns a custom
+    |  #[+a("/docs/usage/language-processing-pipeline") processing pipeline]
+    |  component. Factories are useful for creating stateful components, especially ones which depend on shared data.
+
+aside-code("Example").
+    def my_factory(vocab):
+        def my_component(doc):
+            return doc
+        return my_component
+
+    spacy.set_factory('my_factory', my_factory)
+    nlp = Language(pipeline=['my_factory'])
+
+table(["Name", "Type", "Description"])
+    +row
+        +cell #[code factory_id]
+        +cell unicode
+        +cell
+            |  Unique name of factory. If added to a new pipeline, spaCy will
+            |  look up the factory for this ID and use it to create the
+            |  component.
+
+    +row
+        +cell #[code factory]
+        +cell callable
+        +cell
+            |  Callable that takes a #[code Vocab] object and returns a pipeline
+            |  component.
--- a/website/docs/api/stringstore.jade
+++ b/website/docs/api/stringstore.jade
@ -119,7 +119,7 @@ p Save the current state to a directory.
            |  A path to a directory, which will be created if it doesn't exist.
            |  Paths may be either strings or #[code Path]-like objects.

-+h(2, "from_disk") Tokenizer.from_disk
+h(2, "from_disk") StringStore.from_disk
    +tag method
    +tag-new(2)

@ -139,10 +139,10 @@ p Loads state from a directory. Modifies the object in place and returns it.

    +footrow
        +cell returns
-        +cell #[code Tokenizer]
-        +cell The modified #[code Tokenizer] object.
+        +cell #[code StringStore]
+        +cell The modified #[code StringStore] object.

-+h(2, "to_bytes") Tokenizer.to_bytes
+h(2, "to_bytes") StringStore.to_bytes
    +tag method

 p Serialize the current state to a binary string.
@ -159,9 +159,9 @@ p Serialize the current state to a binary string.
    +footrow
        +cell returns
        +cell bytes
-        +cell The serialized form of the #[code Tokenizer] object.
+        +cell The serialized form of the #[code StringStore] object.

-+h(2, "from_bytes") Tokenizer.from_bytes
+h(2, "from_bytes") StringStore.from_bytes
    +tag method

 p Load state from a binary string.
--- a/website/docs/api/token.jade
+++ b/website/docs/api/token.jade
@ -370,116 +370,131 @@ p The L2 norm of the token's vector representation.
        +cell #[code lemma]
        +cell int
        +cell
-            |  Base form of the word, with no inflectional suffixes.
+            |  Base form of the token, with no inflectional suffixes.

    +row
        +cell #[code lemma_]
        +cell unicode
-        +cell Base form of the word, with no inflectional suffixes.
+        +cell Base form of the token, with no inflectional suffixes.

    +row
        +cell #[code lower]
        +cell int
-        +cell Lower-case form of the word.
+        +cell Lower-case form of the token.

    +row
        +cell #[code lower_]
        +cell unicode
-        +cell Lower-case form of the word.
+        +cell Lower-case form of the token.

    +row
        +cell #[code shape]
        +cell int
-        +cell Transform of the word's string, to show orthographic features.
+        +cell
+            |  Transform of the token's string, to show orthographic features.
+            |  For example, "Xxxx" or "dd".

    +row
        +cell #[code shape_]
        +cell unicode
-        +cell A transform of the word's string, to show orthographic features.
+        +cell
+            |  Transform of the token's string, to show orthographic features.
+            |  For example, "Xxxx" or "dd".

    +row
        +cell #[code prefix]
        +cell int
        +cell Integer ID of a length-N substring from the start of the
-            |  word. Defaults to #[code N=1].
+            |  token. Defaults to #[code N=1].

    +row
        +cell #[code prefix_]
        +cell unicode
        +cell
-            |  A length-N substring from the start of the word. Defaults to
+            |  A length-N substring from the start of the token. Defaults to
            |  #[code N=1].

    +row
        +cell #[code suffix]
        +cell int
        +cell
-            |  Length-N substring from the end of the word. Defaults to #[code N=3].
+            |  Length-N substring from the end of the token. Defaults to #[code N=3].

    +row
        +cell #[code suffix_]
        +cell unicode
-        +cell Length-N substring from the end of the word. Defaults to #[code N=3].
+        +cell Length-N substring from the end of the token. Defaults to #[code N=3].

    +row
        +cell #[code is_alpha]
        +cell bool
-        +cell Equivalent to #[code word.orth_.isalpha()].
+        +cell
+            |  Does the token consist of alphabetic characters? Equivalent to
+            |  #[code token.text.isalpha()].

    +row
        +cell #[code is_ascii]
        +cell bool
-        +cell Equivalent to #[code [any(ord(c) >= 128 for c in word.orth_)]].
+        +cell
+            |  Does the token consist of ASCII characters? Equivalent to
+            |  #[code not any(ord(c) >= 128 for c in token.text)].

    +row
        +cell #[code is_digit]
        +cell bool
-        +cell Equivalent to #[code word.orth_.isdigit()].
+        +cell
+            |  Does the token consist of digits? Equivalent to
+            |  #[code token.text.isdigit()].

    +row
        +cell #[code is_lower]
        +cell bool
-        +cell Equivalent to #[code word.orth_.islower()].
+        +cell
+            |  Is the token in lowercase? Equivalent to
+            |  #[code token.text.islower()].

    +row
        +cell #[code is_title]
        +cell bool
-        +cell Equivalent to #[code word.orth_.istitle()].
+        +cell
+            |  Is the token in titlecase? Equivalent to
+            |  #[code token.text.istitle()].

    +row
        +cell #[code is_punct]
        +cell bool
-        +cell Equivalent to #[code word.orth_.ispunct()].
+        +cell Is the token punctuation?

    +row
        +cell #[code is_space]
        +cell bool
-        +cell Equivalent to #[code word.orth_.isspace()].
+        +cell
+            |  Does the token consist of whitespace characters? Equivalent to
+            |  #[code token.text.isspace()].

    +row
        +cell #[code like_url]
        +cell bool
-        +cell Does the word resemble a URL?
+        +cell Does the token resemble a URL?

    +row
        +cell #[code like_num]
        +cell bool
-        +cell Does the word represent a number? e.g. “10.9”, “10”, “ten”, etc.
+        +cell Does the token represent a number? e.g. "10.9", "10", "ten", etc.

    +row
        +cell #[code like_email]
        +cell bool
-        +cell Does the word resemble an email address?
+        +cell Does the token resemble an email address?

    +row
        +cell #[code is_oov]
        +cell bool
-        +cell Is the word out-of-vocabulary?
+        +cell Is the token out-of-vocabulary?

    +row
        +cell #[code is_stop]
        +cell bool
-        +cell Is the word part of a "stop list"?
+        +cell Is the token part of a "stop list"?

    +row
        +cell #[code pos]