diff --git a/website/docs/api/doc.jade b/website/docs/api/doc.jade
index bb56331f7..9b8392fcb 100644
--- a/website/docs/api/doc.jade
+++ b/website/docs/api/doc.jade
@@ -278,7 +278,8 @@ p Loads state from a directory. Modifies the object in place and returns it.
 
 +aside-code("Example").
     from spacy.tokens import Doc
-    doc = Doc().from_disk('/path/to/doc')
+    from spacy.vocab import Vocab
+    doc = Doc(Vocab()).from_disk('/path/to/doc')
 
 +table(["Name", "Type", "Description"])
     +row
diff --git a/website/docs/api/lexeme.jade b/website/docs/api/lexeme.jade
index dba6fdf59..a0487be9b 100644
--- a/website/docs/api/lexeme.jade
+++ b/website/docs/api/lexeme.jade
@@ -212,62 +212,74 @@ p The L2 norm of the lexeme's vector representation.
     +row
         +cell #[code is_alpha]
         +cell bool
-        +cell Equivalent to #[code word.orth_.isalpha()].
+        +cell
+            | Does the lexeme consist of alphabetic characters? Equivalent to
+            | #[code lexeme.text.isalpha()].
 
     +row
         +cell #[code is_ascii]
         +cell bool
-        +cell Equivalent to #[code [any(ord(c) >= 128 for c in word.orth_)]].
+        +cell
+            | Does the lexeme consist of ASCII characters? Equivalent to
+            | #[code all(ord(c) < 128 for c in lexeme.text)].
 
     +row
         +cell #[code is_digit]
         +cell bool
-        +cell Equivalent to #[code word.orth_.isdigit()].
+        +cell
+            | Does the lexeme consist of digits? Equivalent to
+            | #[code lexeme.text.isdigit()].
 
     +row
         +cell #[code is_lower]
         +cell bool
-        +cell Equivalent to #[code word.orth_.islower()].
+        +cell
+            | Is the lexeme in lowercase? Equivalent to
+            | #[code lexeme.text.islower()].
 
     +row
         +cell #[code is_title]
         +cell bool
-        +cell Equivalent to #[code word.orth_.istitle()].
+        +cell
+            | Is the lexeme in titlecase? Equivalent to
+            | #[code lexeme.text.istitle()].
 
     +row
         +cell #[code is_punct]
         +cell bool
-        +cell Equivalent to #[code word.orth_.ispunct()].
+        +cell Is the lexeme punctuation?
 
     +row
         +cell #[code is_space]
         +cell bool
-        +cell Equivalent to #[code word.orth_.isspace()].
+        +cell
+            | Does the lexeme consist of whitespace characters? Equivalent to
+            | #[code lexeme.text.isspace()].
 
     +row
         +cell #[code like_url]
         +cell bool
-        +cell Does the word resemble a URL?
+        +cell Does the lexeme resemble a URL?
 
     +row
         +cell #[code like_num]
         +cell bool
-        +cell Does the word represent a number? e.g. “10.9”, “10”, “ten”, etc.
+        +cell Does the lexeme represent a number? e.g. "10.9", "10", "ten", etc.
 
     +row
         +cell #[code like_email]
         +cell bool
-        +cell Does the word resemble an email address?
+        +cell Does the lexeme resemble an email address?
 
     +row
         +cell #[code is_oov]
         +cell bool
-        +cell Is the word out-of-vocabulary?
+        +cell Is the lexeme out-of-vocabulary?
 
     +row
         +cell #[code is_stop]
         +cell bool
-        +cell Is the word part of a "stop list"?
+        +cell Is the lexeme part of a "stop list"?
 
     +row
         +cell #[code lang]
diff --git a/website/docs/api/matcher.jade b/website/docs/api/matcher.jade
index 541cceeda..e2972fdc0 100644
--- a/website/docs/api/matcher.jade
+++ b/website/docs/api/matcher.jade
@@ -5,13 +5,14 @@ include ../../_includes/_mixins
 p Match sequences of tokens, based on pattern rules.
 
 +infobox("⚠️ Deprecation note")
-    | As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
-    | are deprecated and have been replaced with a simpler
-    | #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
-    | patterns and a callback for a given match ID. #[code Matcher.get_entity]
-    | is now called #[+api("matcher#get") #[code matcher.get]].
-    | #[code Matcher.load] (not useful, as it didn't allow specifying callbacks),
-    | and #[code Matcher.has_entity] (now redundant) have been removed.
+    .o-block
+        | As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
+        | are deprecated and have been replaced with a simpler
+        | #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
+        | patterns and a callback for a given match ID. #[code Matcher.get_entity]
+        | is now called #[+api("matcher#get") #[code matcher.get]].
+        | #[code Matcher.load] (not useful, as it didn't allow specifying callbacks),
+        | and #[code Matcher.has_entity] (now redundant) have been removed.
 
 +h(2, "init") Matcher.__init__
     +tag method
@@ -56,17 +57,6 @@ p Find all token sequences matching the supplied patterns on the #[code Doc].
     doc = nlp(u'hello world!')
     matches = matcher(doc)
 
-+infobox("Important note")
-    | By default, the matcher #[strong does not perform any action] on matches,
-    | like tagging matched phrases with entity types. Instead, actions need to
-    | be specified when #[strong adding patterns or entities], by
-    | passing in a callback function as the #[code on_match] argument on
-    | #[+api("matcher#add") #[code add]]. This allows you to define custom
-    | actions per pattern within the same matcher. For example, you might only
-    | want to merge some entity types, and set custom flags for other matched
-    | patterns. For more details and examples, see the usage workflow on
-    | #[+a("/docs/usage/rule-based-matching") rule-based matching].
-
 +table(["Name", "Type", "Description"])
     +row
         +cell #[code doc]
@@ -81,6 +71,17 @@ p Find all token sequences matching the supplied patterns on the #[code Doc].
         | matches. A match tuple describes a span #[code doc[start:end]].
         | The #[code match_id] is the ID of the added match pattern.
 
++infobox("Important note")
+    | By default, the matcher #[strong does not perform any action] on matches,
+    | like tagging matched phrases with entity types. Instead, actions need to
+    | be specified when #[strong adding patterns or entities], by
+    | passing in a callback function as the #[code on_match] argument on
+    | #[+api("matcher#add") #[code add]]. This allows you to define custom
+    | actions per pattern within the same matcher. For example, you might only
+    | want to merge some entity types, and set custom flags for other matched
+    | patterns. For more details and examples, see the usage workflow on
+    | #[+a("/docs/usage/rule-based-matching") rule-based matching].
+
 +h(2, "pipe") Matcher.pipe
     +tag method
 
@@ -201,6 +202,20 @@ p
         | Match pattern. A pattern consists of a list of dicts, where each
         | dict describes a token.
 
++infobox("⚠️ Deprecation note")
+    .o-block
+        | As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
+        | are deprecated and have been replaced with a simpler
+        | #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
+        | patterns and a callback for a given match ID.
+
+    +code-new.
+        matcher.add('GoogleNow', merge_phrases, [{ORTH: 'Google'}, {ORTH: 'Now'}])
+
+    +code-old.
+        matcher.add_entity('GoogleNow', on_match=merge_phrases)
+        matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])
+
 +h(2, "remove") Matcher.remove
     +tag method
     +tag-new(2)
diff --git a/website/docs/api/spacy.jade b/website/docs/api/spacy.jade
index 6ad88c1a8..f2fcfde2c 100644
--- a/website/docs/api/spacy.jade
+++ b/website/docs/api/spacy.jade
@@ -20,12 +20,7 @@ p
     nlp = spacy.load('/path/to/en') # unicode path
     nlp = spacy.load(Path('/path/to/en')) # pathlib Path
 
-+infobox("⚠️ Deprecation note")
-    | As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
-    | will also raise an error if no model could be loaded and never just
-    | return an empty #[code Language] object. If you need a blank language,
-    | you need to import it explicitly (#[code from spacy.lang.en import English])
-    | or use #[+api("util#get_lang_class") #[code util.get_lang_class]].
+    nlp = spacy.load('en', disable=['parser', 'tagger'])
 
 +table(["Name", "Type", "Description"])
     +row
@@ -34,15 +29,28 @@ p
         +cell Model to load, i.e. shortcut link, package name or path.
 
     +row
-        +cell #[code **overrides]
-        +cell -
-        +cell Override or disable components.
+        +cell #[code disable]
+        +cell list
+        +cell
+            | Names of pipeline components to
+            | #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
 
     +footrow
         +cell returns
         +cell #[code Language]
         +cell A #[code Language] object with the loaded model.
 
++infobox("⚠️ Deprecation note")
+    .o-block
+        | As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
+        | will also raise an error if no model could be loaded and never just
+        | return an empty #[code Language] object. If you need a blank language,
+        | you need to import it explicitly (#[code from spacy.lang.en import English])
+        | or use #[+api("util#get_lang_class") #[code util.get_lang_class]].
+
+    +code-new nlp = spacy.load('/model')
+    +code-old nlp = spacy.load('en', path='/model')
+
 +h(2, "info") spacy.info
     +tag function
 
@@ -98,3 +106,37 @@ p
     +footrow
         +cell returns
         +cell unicode
         +cell The explanation, or #[code None] if not found in the glossary.
+
++h(2, "set_factory") spacy.set_factory
+    +tag function
+    +tag-new(2)
+
+p
+    | Set a factory that returns a custom
+    | #[+a("/docs/usage/language-processing-pipeline") processing pipeline]
+    | component. Factories are useful for creating stateful components, especially ones which depend on shared data.
+
++aside-code("Example").
+    def my_factory(vocab):
+        def my_component(doc):
+            return doc
+        return my_component
+
+    spacy.set_factory('my_factory', my_factory)
+    nlp = Language(pipeline=['my_factory'])
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code factory_id]
+        +cell unicode
+        +cell
+            | Unique name of factory. If added to a new pipeline, spaCy will
+            | look up the factory for this ID and use it to create the
+            | component.
+
+    +row
+        +cell #[code factory]
+        +cell callable
+        +cell
+            | Callable that takes a #[code Vocab] object and returns a pipeline
+            | component.
diff --git a/website/docs/api/stringstore.jade b/website/docs/api/stringstore.jade
index f684d48ad..f09352c79 100644
--- a/website/docs/api/stringstore.jade
+++ b/website/docs/api/stringstore.jade
@@ -119,7 +119,7 @@ p Save the current state to a directory.
         | A path to a directory, which will be created if it doesn't exist.
         | Paths may be either strings or #[code Path]-like objects.
 
-+h(2, "from_disk") Tokenizer.from_disk
++h(2, "from_disk") StringStore.from_disk
     +tag method
     +tag-new(2)
 
@@ -139,10 +139,10 @@ p Loads state from a directory. Modifies the object in place and returns it.
 
     +footrow
         +cell returns
-        +cell #[code Tokenizer]
-        +cell The modified #[code Tokenizer] object.
+        +cell #[code StringStore]
+        +cell The modified #[code StringStore] object.
 
-+h(2, "to_bytes") Tokenizer.to_bytes
++h(2, "to_bytes") StringStore.to_bytes
     +tag method
 
 p Serialize the current state to a binary string.
@@ -159,9 +159,9 @@ p Serialize the current state to a binary string.
     +footrow
         +cell returns
         +cell bytes
-        +cell The serialized form of the #[code Tokenizer] object.
+        +cell The serialized form of the #[code StringStore] object.
 
-+h(2, "from_bytes") Tokenizer.from_bytes
++h(2, "from_bytes") StringStore.from_bytes
     +tag method
 
 p Load state from a binary string.
diff --git a/website/docs/api/token.jade b/website/docs/api/token.jade
index 744446ec2..ee989047c 100644
--- a/website/docs/api/token.jade
+++ b/website/docs/api/token.jade
@@ -370,116 +370,132 @@ p The L2 norm of the token's vector representation.
         +cell #[code lemma]
         +cell int
         +cell
-            | Base form of the word, with no inflectional suffixes.
+            | Base form of the token, with no inflectional suffixes.
 
     +row
         +cell #[code lemma_]
         +cell unicode
-        +cell Base form of the word, with no inflectional suffixes.
+        +cell Base form of the token, with no inflectional suffixes.
 
     +row
         +cell #[code lower]
         +cell int
-        +cell Lower-case form of the word.
+        +cell Lower-case form of the token.
 
     +row
         +cell #[code lower_]
         +cell unicode
-        +cell Lower-case form of the word.
+        +cell Lower-case form of the token.
 
     +row
         +cell #[code shape]
         +cell int
-        +cell Transform of the word's string, to show orthographic features.
+        +cell
+            | Transform of the token's string, to show orthographic features.
+            | For example, "Xxxx" or "dd".
 
     +row
         +cell #[code shape_]
         +cell unicode
-        +cell A transform of the word's string, to show orthographic features.
+        +cell
+            | Transform of the token's string, to show orthographic features.
+            | For example, "Xxxx" or "dd".
 
     +row
         +cell #[code prefix]
         +cell int
         +cell Integer ID of a length-N substring from the start of the
-            | word. Defaults to #[code N=1].
+            | token. Defaults to #[code N=1].
 
     +row
         +cell #[code prefix_]
         +cell unicode
         +cell
-            | A length-N substring from the start of the word. Defaults to
+            | A length-N substring from the start of the token. Defaults to
             | #[code N=1].
 
     +row
         +cell #[code suffix]
         +cell int
         +cell
-            | Length-N substring from the end of the word. Defaults to #[code N=3].
+            | Length-N substring from the end of the token. Defaults to #[code N=3].
 
     +row
         +cell #[code suffix_]
         +cell unicode
-        +cell Length-N substring from the end of the word. Defaults to #[code N=3].
+        +cell Length-N substring from the end of the token. Defaults to #[code N=3].
 
     +row
         +cell #[code is_alpha]
         +cell bool
-        +cell Equivalent to #[code word.orth_.isalpha()].
+        +cell
+            | Does the token consist of alphabetic characters? Equivalent to
+            | #[code token.text.isalpha()].
 
     +row
         +cell #[code is_ascii]
         +cell bool
-        +cell Equivalent to #[code [any(ord(c) >= 128 for c in word.orth_)]].
+        +cell
+            | Does the token consist of ASCII characters? Equivalent to
+            | #[code all(ord(c) < 128 for c in token.text)].
 
     +row
         +cell #[code is_digit]
         +cell bool
-        +cell Equivalent to #[code word.orth_.isdigit()].
+        +cell
+            | Does the token consist of digits? Equivalent to
+            | #[code token.text.isdigit()].
 
     +row
         +cell #[code is_lower]
         +cell bool
-        +cell Equivalent to #[code word.orth_.islower()].
+        +cell
+            | Is the token in lowercase? Equivalent to
+            | #[code token.text.islower()].
 
     +row
         +cell #[code is_title]
         +cell bool
-        +cell Equivalent to #[code word.orth_.istitle()].
+        +cell
+            | Is the token in titlecase? Equivalent to
+            | #[code token.text.istitle()].
 
     +row
         +cell #[code is_punct]
         +cell bool
-        +cell Equivalent to #[code word.orth_.ispunct()].
+        +cell Is the token punctuation?
 
     +row
         +cell #[code is_space]
         +cell bool
-        +cell Equivalent to #[code word.orth_.isspace()].
+        +cell
+            | Does the token consist of whitespace characters? Equivalent to
+            | #[code token.text.isspace()].
 
     +row
         +cell #[code like_url]
         +cell bool
-        +cell Does the word resemble a URL?
+        +cell Does the token resemble a URL?
 
     +row
         +cell #[code like_num]
         +cell bool
-        +cell Does the word represent a number? e.g. “10.9”, “10”, “ten”, etc.
+        +cell Does the token represent a number? e.g. "10.9", "10", "ten", etc.
 
     +row
         +cell #[code like_email]
         +cell bool
-        +cell Does the word resemble an email address?
+        +cell Does the token resemble an email address?
 
     +row
         +cell #[code is_oov]
         +cell bool
-        +cell Is the word out-of-vocabulary?
+        +cell Is the token out-of-vocabulary?
 
     +row
         +cell #[code is_stop]
         +cell bool
-        +cell Is the word part of a "stop list"?
+        +cell Is the token part of a "stop list"?
 
     +row
         +cell #[code pos]
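
A minimal end-to-end sketch of the v2 APIs documented above: `spacy.load` with the new `disable` keyword, `Matcher.add` with an `on_match` callback, and the token flag equivalences. It assumes spaCy 2.0 with the `en` model installed; `merge_phrases` is named in the docs above, but its body here is illustrative, not taken from them.

    import spacy
    from spacy.matcher import Matcher
    from spacy.attrs import ORTH

    # Load the model, disabling pipeline components we don't need here
    # (the `disable` keyword replaces the old `**overrides`).
    nlp = spacy.load('en', disable=['parser', 'tagger'])

    def merge_phrases(matcher, doc, i, matches):
        # on_match callbacks receive the matcher, the doc, the index of
        # the current match and the full list of (match_id, start, end)
        # tuples, so they can act on the matched span.
        match_id, start, end = matches[i]
        doc[start:end].merge()  # collapse the matched span into one token

    matcher = Matcher(nlp.vocab)
    # One call registers both the pattern and the callback for this ID.
    matcher.add('GoogleNow', merge_phrases, [{ORTH: 'Google'}, {ORTH: 'Now'}])

    doc = nlp(u'Google Now is assistant software.')
    matcher(doc)  # fires merge_phrases; 'Google Now' becomes one token

    # The boolean flags are documented as str-method equivalents:
    token = doc[0]
    assert token.is_alpha == token.text.isalpha()
    assert token.is_ascii == all(ord(c) < 128 for c in token.text)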