mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Update API docs and fix typos
This commit is contained in:
parent
ea9474f71c
commit
d48530835a
|
@ -278,7 +278,8 @@ p Loads state from a directory. Modifies the object in place and returns it.
|
|||
|
||||
+aside-code("Example").
|
||||
from spacy.tokens import Doc
|
||||
doc = Doc().from_disk('/path/to/doc')
|
||||
from spacy.vocab import Vocab
|
||||
doc = Doc(Vocab()).from_disk('/path/to/doc')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
|
|
|
@ -212,62 +212,74 @@ p The L2 norm of the lexeme's vector representation.
|
|||
+row
|
||||
+cell #[code is_alpha]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.isalpha()].
|
||||
+cell
|
||||
| Does the lexeme consist of alphabetic characters? Equivalent to
|
||||
| #[code lexeme.text.isalpha()].
|
||||
|
||||
+row
|
||||
+cell #[code is_ascii]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code [any(ord(c) >= 128 for c in word.orth_)]].
|
||||
+cell
|
||||
| Does the lexeme consist of ASCII characters? Equivalent to
|
||||
| #[code not any(ord(c) >= 128 for c in lexeme.text)].
|
||||
|
||||
+row
|
||||
+cell #[code is_digit]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.isdigit()].
|
||||
+cell
|
||||
| Does the lexeme consist of digits? Equivalent to
|
||||
| #[code lexeme.text.isdigit()].
|
||||
|
||||
+row
|
||||
+cell #[code is_lower]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.islower()].
|
||||
+cell
|
||||
| Is the lexeme in lowercase? Equivalent to
|
||||
| #[code lexeme.text.islower()].
|
||||
|
||||
+row
|
||||
+cell #[code is_title]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.istitle()].
|
||||
+cell
|
||||
| Is the lexeme in titlecase? Equivalent to
|
||||
| #[code lexeme.text.istitle()].
|
||||
|
||||
+row
|
||||
+cell #[code is_punct]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.ispunct()].
|
||||
+cell Is the lexeme punctuation?
|
||||
|
||||
+row
|
||||
+cell #[code is_space]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.isspace()].
|
||||
+cell
|
||||
| Does the lexeme consist of whitespace characters? Equivalent to
|
||||
| #[code lexeme.text.isspace()].
|
||||
|
||||
+row
|
||||
+cell #[code like_url]
|
||||
+cell bool
|
||||
+cell Does the word resemble a URL?
|
||||
+cell Does the lexeme resemble a URL?
|
||||
|
||||
+row
|
||||
+cell #[code like_num]
|
||||
+cell bool
|
||||
+cell Does the word represent a number? e.g. “10.9”, “10”, “ten”, etc.
|
||||
+cell Does the lexeme represent a number? e.g. "10.9", "10", "ten", etc.
|
||||
|
||||
+row
|
||||
+cell #[code like_email]
|
||||
+cell bool
|
||||
+cell Does the word resemble an email address?
|
||||
+cell Does the lexeme resemble an email address?
|
||||
|
||||
+row
|
||||
+cell #[code is_oov]
|
||||
+cell bool
|
||||
+cell Is the word out-of-vocabulary?
|
||||
+cell Is the lexeme out-of-vocabulary?
|
||||
|
||||
+row
|
||||
+cell #[code is_stop]
|
||||
+cell bool
|
||||
+cell Is the word part of a "stop list"?
|
||||
+cell Is the lexeme part of a "stop list"?
|
||||
|
||||
+row
|
||||
+cell #[code lang]
|
||||
|
|
|
@ -5,13 +5,14 @@ include ../../_includes/_mixins
|
|||
p Match sequences of tokens, based on pattern rules.
|
||||
|
||||
+infobox("⚠️ Deprecation note")
|
||||
| As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
|
||||
| are deprecated and have been replaced with a simpler
|
||||
| #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
|
||||
| patterns and a callback for a given match ID. #[code Matcher.get_entity]
|
||||
| is now called #[+api("matcher#get") #[code matcher.get]].
|
||||
| #[code Matcher.load] (not useful, as it didn't allow specifying callbacks),
|
||||
| and #[code Matcher.has_entity] (now redundant) have been removed.
|
||||
.o-block
|
||||
| As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
|
||||
| are deprecated and have been replaced with a simpler
|
||||
| #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
|
||||
| patterns and a callback for a given match ID. #[code Matcher.get_entity]
|
||||
| is now called #[+api("matcher#get") #[code Matcher.get]].
|
||||
| #[code Matcher.load] (not useful, as it didn't allow specifying callbacks),
|
||||
| and #[code Matcher.has_entity] (now redundant) have been removed.
|
||||
|
||||
+h(2, "init") Matcher.__init__
|
||||
+tag method
|
||||
|
@ -56,17 +57,6 @@ p Find all token sequences matching the supplied patterns on the #[code Doc].
|
|||
doc = nlp(u'hello world!')
|
||||
matches = matcher(doc)
|
||||
|
||||
+infobox("Important note")
|
||||
| By default, the matcher #[strong does not perform any action] on matches,
|
||||
| like tagging matched phrases with entity types. Instead, actions need to
|
||||
| be specified when #[strong adding patterns or entities], by
|
||||
| passing in a callback function as the #[code on_match] argument on
|
||||
| #[+api("matcher#add") #[code add]]. This allows you to define custom
|
||||
| actions per pattern within the same matcher. For example, you might only
|
||||
| want to merge some entity types, and set custom flags for other matched
|
||||
| patterns. For more details and examples, see the usage workflow on
|
||||
| #[+a("/docs/usage/rule-based-matching") rule-based matching].
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code doc]
|
||||
|
@ -81,6 +71,17 @@ p Find all token sequences matching the supplied patterns on the #[code Doc].
|
|||
| matches. A match tuple describes a span #[code doc[start:end]].
|
||||
| The #[code match_id] is the ID of the added match pattern.
|
||||
|
||||
+infobox("Important note")
|
||||
| By default, the matcher #[strong does not perform any action] on matches,
|
||||
| like tagging matched phrases with entity types. Instead, actions need to
|
||||
| be specified when #[strong adding patterns or entities], by
|
||||
| passing in a callback function as the #[code on_match] argument on
|
||||
| #[+api("matcher#add") #[code add]]. This allows you to define custom
|
||||
| actions per pattern within the same matcher. For example, you might only
|
||||
| want to merge some entity types, and set custom flags for other matched
|
||||
| patterns. For more details and examples, see the usage workflow on
|
||||
| #[+a("/docs/usage/rule-based-matching") rule-based matching].
|
||||
|
||||
+h(2, "pipe") Matcher.pipe
|
||||
+tag method
|
||||
|
||||
|
@ -201,6 +202,20 @@ p
|
|||
| Match pattern. A pattern consists of a list of dicts, where each
|
||||
| dict describes a token.
|
||||
|
||||
+infobox("⚠️ Deprecation note")
|
||||
.o-block
|
||||
| As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
|
||||
| are deprecated and have been replaced with a simpler
|
||||
| #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
|
||||
| patterns and a callback for a given match ID.
|
||||
|
||||
+code-new.
|
||||
matcher.add('GoogleNow', merge_phrases, [{ORTH: 'Google'}, {ORTH: 'Now'}])
|
||||
|
||||
+code-old.
|
||||
matcher.add_entity('GoogleNow', on_match=merge_phrases)
|
||||
matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])
|
||||
|
||||
+h(2, "remove") Matcher.remove
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
|
|
@ -20,12 +20,7 @@ p
|
|||
nlp = spacy.load('/path/to/en') # unicode path
|
||||
nlp = spacy.load(Path('/path/to/en')) # pathlib Path
|
||||
|
||||
+infobox("⚠️ Deprecation note")
|
||||
| As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
|
||||
| will also raise an error if no model could be loaded and never just
|
||||
| return an empty #[code Language] object. If you need a blank language,
|
||||
| you need to import it explicitly (#[code from spacy.lang.en import English])
|
||||
| or use #[+api("util#get_lang_class") #[code util.get_lang_class]].
|
||||
nlp = spacy.load('en', disable=['parser', 'tagger'])
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
|
@ -34,15 +29,28 @@ p
|
|||
+cell Model to load, i.e. shortcut link, package name or path.
|
||||
|
||||
+row
|
||||
+cell #[code **overrides]
|
||||
+cell -
|
||||
+cell Override or disable components.
|
||||
+cell #[code disable]
|
||||
+cell list
|
||||
+cell
|
||||
| Names of pipeline components to
|
||||
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Language]
|
||||
+cell A #[code Language] object with the loaded model.
|
||||
|
||||
+infobox("⚠️ Deprecation note")
|
||||
.o-block
|
||||
| As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
|
||||
| will also raise an error if no model could be loaded and never just
|
||||
| return an empty #[code Language] object. If you need a blank language,
|
||||
| you need to import it explicitly (#[code from spacy.lang.en import English])
|
||||
| or use #[+api("util#get_lang_class") #[code util.get_lang_class]].
|
||||
|
||||
+code-new nlp = spacy.load('/model')
|
||||
+code-old nlp = spacy.load('en', path='/model')
|
||||
|
||||
+h(2, "info") spacy.info
|
||||
+tag function
|
||||
|
||||
|
@ -98,3 +106,37 @@ p
|
|||
+cell returns
|
||||
+cell unicode
|
||||
+cell The explanation, or #[code None] if not found in the glossary.
|
||||
|
||||
+h(2, "set_factory") spacy.set_factory
|
||||
+tag function
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Set a factory that returns a custom
|
||||
| #[+a("/docs/usage/language-processing-pipeline") processing pipeline]
|
||||
| component. Factories are useful for creating stateful components, especially ones which depend on shared data.
|
||||
|
||||
+aside-code("Example").
|
||||
def my_factory(vocab):
|
||||
def my_component(doc):
|
||||
return doc
|
||||
return my_component
|
||||
|
||||
spacy.set_factory('my_factory', my_factory)
|
||||
nlp = Language(pipeline=['my_factory'])
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code factory_id]
|
||||
+cell unicode
|
||||
+cell
|
||||
| Unique name of factory. If added to a new pipeline, spaCy will
|
||||
| look up the factory for this ID and use it to create the
|
||||
| component.
|
||||
|
||||
+row
|
||||
+cell #[code factory]
|
||||
+cell callable
|
||||
+cell
|
||||
| Callable that takes a #[code Vocab] object and returns a pipeline
|
||||
| component.
|
||||
|
|
|
@ -119,7 +119,7 @@ p Save the current state to a directory.
|
|||
| A path to a directory, which will be created if it doesn't exist.
|
||||
| Paths may be either strings or #[code Path]-like objects.
|
||||
|
||||
+h(2, "from_disk") Tokenizer.from_disk
|
||||
+h(2, "from_disk") StringStore.from_disk
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
|
@ -139,10 +139,10 @@ p Loads state from a directory. Modifies the object in place and returns it.
|
|||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Tokenizer]
|
||||
+cell The modified #[code Tokenizer] object.
|
||||
+cell #[code StringStore]
|
||||
+cell The modified #[code StringStore] object.
|
||||
|
||||
+h(2, "to_bytes") Tokenizer.to_bytes
|
||||
+h(2, "to_bytes") StringStore.to_bytes
|
||||
+tag method
|
||||
|
||||
p Serialize the current state to a binary string.
|
||||
|
@ -159,9 +159,9 @@ p Serialize the current state to a binary string.
|
|||
+footrow
|
||||
+cell returns
|
||||
+cell bytes
|
||||
+cell The serialized form of the #[code Tokenizer] object.
|
||||
+cell The serialized form of the #[code StringStore] object.
|
||||
|
||||
+h(2, "from_bytes") Tokenizer.from_bytes
|
||||
+h(2, "from_bytes") StringStore.from_bytes
|
||||
+tag method
|
||||
|
||||
p Load state from a binary string.
|
||||
|
|
|
@ -370,116 +370,131 @@ p The L2 norm of the token's vector representation.
|
|||
+cell #[code lemma]
|
||||
+cell int
|
||||
+cell
|
||||
| Base form of the word, with no inflectional suffixes.
|
||||
| Base form of the token, with no inflectional suffixes.
|
||||
|
||||
+row
|
||||
+cell #[code lemma_]
|
||||
+cell unicode
|
||||
+cell Base form of the word, with no inflectional suffixes.
|
||||
+cell Base form of the token, with no inflectional suffixes.
|
||||
|
||||
+row
|
||||
+cell #[code lower]
|
||||
+cell int
|
||||
+cell Lower-case form of the word.
|
||||
+cell Lower-case form of the token.
|
||||
|
||||
+row
|
||||
+cell #[code lower_]
|
||||
+cell unicode
|
||||
+cell Lower-case form of the word.
|
||||
+cell Lower-case form of the token.
|
||||
|
||||
+row
|
||||
+cell #[code shape]
|
||||
+cell int
|
||||
+cell Transform of the word's string, to show orthographic features.
|
||||
+cell
|
||||
| Transform of the token's string, to show orthographic features.
|
||||
| For example, "Xxxx" or "dd".
|
||||
|
||||
+row
|
||||
+cell #[code shape_]
|
||||
+cell unicode
|
||||
+cell A transform of the word's string, to show orthographic features.
|
||||
+cell
| Transform of the token's string, to show orthographic features.
|
||||
| For example, "Xxxx" or "dd".
|
||||
|
||||
+row
|
||||
+cell #[code prefix]
|
||||
+cell int
|
||||
+cell Integer ID of a length-N substring from the start of the
|
||||
| word. Defaults to #[code N=1].
|
||||
| token. Defaults to #[code N=1].
|
||||
|
||||
+row
|
||||
+cell #[code prefix_]
|
||||
+cell unicode
|
||||
+cell
|
||||
| A length-N substring from the start of the word. Defaults to
|
||||
| A length-N substring from the start of the token. Defaults to
|
||||
| #[code N=1].
|
||||
|
||||
+row
|
||||
+cell #[code suffix]
|
||||
+cell int
|
||||
+cell
|
||||
| Length-N substring from the end of the word. Defaults to #[code N=3].
|
||||
| Integer ID of a length-N substring from the end of the token. Defaults to #[code N=3].
|
||||
|
||||
+row
|
||||
+cell #[code suffix_]
|
||||
+cell unicode
|
||||
+cell Length-N substring from the end of the word. Defaults to #[code N=3].
|
||||
+cell Length-N substring from the end of the token. Defaults to #[code N=3].
|
||||
|
||||
+row
|
||||
+cell #[code is_alpha]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.isalpha()].
|
||||
+cell
|
||||
| Does the token consist of alphabetic characters? Equivalent to
|
||||
| #[code token.text.isalpha()].
|
||||
|
||||
+row
|
||||
+cell #[code is_ascii]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code [any(ord(c) >= 128 for c in word.orth_)]].
|
||||
+cell
|
||||
| Does the token consist of ASCII characters? Equivalent to
|
||||
| #[code not any(ord(c) >= 128 for c in token.text)].
|
||||
|
||||
+row
|
||||
+cell #[code is_digit]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.isdigit()].
|
||||
+cell
|
||||
| Does the token consist of digits? Equivalent to
|
||||
| #[code token.text.isdigit()].
|
||||
|
||||
+row
|
||||
+cell #[code is_lower]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.islower()].
|
||||
+cell
|
||||
| Is the token in lowercase? Equivalent to
|
||||
| #[code token.text.islower()].
|
||||
|
||||
+row
|
||||
+cell #[code is_title]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.istitle()].
|
||||
+cell
|
||||
| Is the token in titlecase? Equivalent to
|
||||
| #[code token.text.istitle()].
|
||||
|
||||
+row
|
||||
+cell #[code is_punct]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.ispunct()].
|
||||
+cell Is the token punctuation?
|
||||
|
||||
+row
|
||||
+cell #[code is_space]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.isspace()].
|
||||
+cell
|
||||
| Does the token consist of whitespace characters? Equivalent to
|
||||
| #[code token.text.isspace()].
|
||||
|
||||
+row
|
||||
+cell #[code like_url]
|
||||
+cell bool
|
||||
+cell Does the word resemble a URL?
|
||||
+cell Does the token resemble a URL?
|
||||
|
||||
+row
|
||||
+cell #[code like_num]
|
||||
+cell bool
|
||||
+cell Does the word represent a number? e.g. “10.9”, “10”, “ten”, etc.
|
||||
+cell Does the token represent a number? e.g. "10.9", "10", "ten", etc.
|
||||
|
||||
+row
|
||||
+cell #[code like_email]
|
||||
+cell bool
|
||||
+cell Does the word resemble an email address?
|
||||
+cell Does the token resemble an email address?
|
||||
|
||||
+row
|
||||
+cell #[code is_oov]
|
||||
+cell bool
|
||||
+cell Is the word out-of-vocabulary?
|
||||
+cell Is the token out-of-vocabulary?
|
||||
|
||||
+row
|
||||
+cell #[code is_stop]
|
||||
+cell bool
|
||||
+cell Is the word part of a "stop list"?
|
||||
+cell Is the token part of a "stop list"?
|
||||
|
||||
+row
|
||||
+cell #[code pos]
|
||||
|
|
Loading…
Reference in New Issue
Block a user