mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-15 03:56:23 +03:00
220 lines
7.6 KiB
Plaintext
220 lines
7.6 KiB
Plaintext
//- 💫 DOCS > USAGE > WHAT'S NEW IN V2.0
|
|
|
|
include ../../_includes/_mixins
|
|
|
|
p
|
|
| We also re-wrote a large part of the documentation and usage workflows,
|
|
| and added more examples.
|
|
|
|
+h(2, "features") New features
|
|
|
|
+h(3, "features-displacy") displaCy visualizer with Jupyter support
|
|
|
|
+aside-code("Example").
|
|
from spacy import displacy
|
|
doc = nlp(u'This is a sentence about Facebook.')
|
|
displacy.serve(doc, style='dep') # run the web server
|
|
html = displacy.render(doc, style='ent') # generate HTML
|
|
|
|
p
|
|
| Our popular dependency and named entity visualizers are now an official
|
|
| part of the spaCy library! displaCy can run a simple web server, or
|
|
| generate raw HTML markup or SVG files to be exported. You can pass in one
|
|
| or more docs, and customise the style. displaCy also auto-detects whether
|
|
| you're running #[+a("https://jupyter.org") Jupyter] and will render the
|
|
| visualizations in your notebook.
|
|
|
|
+infobox
|
|
| #[strong API:] #[+api("displacy") #[code displacy]]
|
|
| #[strong Usage:] #[+a("/docs/usage/visualizers") Visualizing spaCy]
|
|
|
|
+h(3, "features-loading") Loading
|
|
|
|
+aside-code("Example").
|
|
nlp = spacy.load('en') # shortcut link
|
|
nlp = spacy.load('en_core_web_sm') # package
|
|
nlp = spacy.load('/path/to/en') # unicode path
|
|
nlp = spacy.load(Path('/path/to/en')) # pathlib Path
|
|
|
|
p
|
|
| The improved #[code spacy.load] makes loading models easier and more
|
|
| transparent. You can load a model by supplying its
|
|
| #[+a("/docs/usage/models#usage") shortcut link], the name of an installed
|
|
| #[+a("/docs/usage/saving-loading#generating") model package], a unicode
|
|
| path or a #[code Path]-like object. spaCy will try resolving the load
|
|
| argument in this order. The #[code path] keyword argument is now deprecated.
|
|
|
|
p
|
|
| The #[code Language] class to initialise will be determined based on the
|
|
| model's settings. If no model is found, spaCy will let you know and won't
|
|
| just return an empty #[code Language] object anymore. If you want a blank
|
|
| language, you can always import the class directly, e.g.
|
|
| #[code from spacy.lang.en import English].
|
|
|
|
+infobox
|
|
| #[strong API:] #[+api("spacy#load") #[code spacy.load]]
|
|
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
|
|
|
|
+h(3, "features-language") Improved language data and processing pipelines
|
|
|
|
+aside-code("Example").
|
|
from spacy.language import Language
|
|
nlp = Language(pipeline=['token_vectors', 'tags',
|
|
'dependencies'])
|
|
|
|
+infobox
|
|
| #[strong API:] #[+api("language") #[code Language]]
|
|
| #[strong Usage:] #[+a("/docs/usage/adding-languages") Adding languages]
|
|
|
|
+h(3, "features-lemmatizer") Simple lookup-based lemmatization
|
|
|
|
+aside-code("Example").
|
|
LOOKUP = {
|
|
"aba": "abar",
|
|
"ababa": "abar",
|
|
"ababais": "abar",
|
|
"ababan": "abar",
|
|
"ababanes": "ababán"
|
|
}
|
|
|
|
p
|
|
| spaCy now supports simple lookup-based lemmatization. The data is stored
|
|
| in a dictionary mapping a string to its lemma. To determine a token's
|
|
| lemma, spaCy simply looks it up in the table. The lookup lemmatizer can
|
|
| be imported from #[code spacy.lemmatizerlookup]. It's initialised with
|
|
| the lookup table, and should be returned by the #[code create_lemmatizer]
|
|
| classmethod of the language's defaults.
|
|
|
|
+infobox
|
|
| #[strong API:] #[+api("language") #[code Language]]
|
|
| #[strong Usage:] #[+a("/docs/usage/adding-languages") Adding languages]
|
|
|
|
+h(3, "features-matcher") Revised matcher API
|
|
|
|
+aside-code("Example").
|
|
from spacy.matcher import Matcher
|
|
from spacy.attrs import LOWER, IS_PUNCT
|
|
matcher = Matcher(nlp.vocab)
|
|
matcher.add('HelloWorld', None,
|
|
[{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}],
|
|
[{LOWER: 'hello'}, {LOWER: 'world'}])
|
|
assert len(matcher) == 1
|
|
assert 'HelloWorld' in matcher
|
|
|
|
p
|
|
| Patterns can now be added to the matcher by calling
|
|
| #[+api("matcher#add") #[code matcher.add()]] with a match ID, an optional
|
|
| callback function to be invoked on each match, and one or more patterns.
|
|
| This allows you to write powerful, pattern-specific logic using only one
|
|
| matcher. For example, you might only want to merge some entity types,
|
|
| and set custom flags for other matched patterns.
|
|
|
|
+infobox
|
|
| #[strong API:] #[+api("matcher") #[code Matcher]]
|
|
| #[strong Usage:] #[+a("/docs/usage/rule-based-matching") Rule-based matching]
|
|
|
|
+h(3, "features-serializer") Serialization
|
|
|
|
+infobox
|
|
| #[strong API:] #[+api("serializer") #[code Serializer]]
|
|
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
|
|
|
|
+h(3, "features-models") Neural network models for English, German, French and Spanish
|
|
|
|
+infobox
|
|
| #[strong Details:] #[+src(gh("spacy-models")) spacy-models]
|
|
| #[strong Usage:] #[+a("/docs/usage/models") Models]
|
|
|
|
+h(2, "incompat") Backwards incompatibilities
|
|
|
|
+table(["Old", "New"])
|
|
+row
|
|
+cell #[code Language.save_to_directory]
|
|
+cell #[+api("language#to_disk") #[code Language.to_disk]]
|
|
|
|
+row
|
|
+cell #[code Tokenizer.load]
|
|
+cell
|
|
| #[+api("tokenizer#from_disk") #[code Tokenizer.from_disk]]
|
|
| #[+api("tokenizer#from_bytes") #[code Tokenizer.from_bytes]]
|
|
|
|
+row
|
|
+cell #[code Tagger.load]
|
|
+cell
|
|
| #[+api("tagger#from_disk") #[code Tagger.from_disk]]
|
|
| #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
|
|
|
|
+row
|
|
+cell #[code DependencyParser.load]
|
|
+cell
|
|
| #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
|
|
| #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
|
|
|
|
+row
|
|
+cell #[code EntityRecognizer.load]
|
|
+cell
|
|
| #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
|
|
| #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
|
|
|
|
+row
|
|
+cell
|
|
| #[code Vocab.load]
|
|
| #[code Vocab.load_lexemes]
|
|
| #[code Vocab.load_vectors]
|
|
| #[code Vocab.load_vectors_from_bin_loc]
|
|
+cell
|
|
| #[+api("vocab#from_disk") #[code Vocab.from_disk]]
|
|
| #[+api("vocab#from_bytes") #[code Vocab.from_bytes]]
|
|
|
|
+row
|
|
+cell
|
|
| #[code Vocab.dump]
|
|
| #[code Vocab.dump_vectors]
|
|
+cell
|
|
| #[+api("vocab#to_disk") #[code Vocab.to_disk]]
|
|
| #[+api("vocab#to_bytes") #[code Vocab.to_bytes]]
|
|
|
|
+row
|
|
+cell
|
|
| #[code StringStore.load]
|
|
+cell
|
|
| #[+api("stringstore#from_disk") #[code StringStore.from_disk]]
|
|
| #[+api("stringstore#from_bytes") #[code StringStore.from_bytes]]
|
|
|
|
+row
|
|
+cell
|
|
| #[code StringStore.dump]
|
|
+cell
|
|
| #[+api("stringstore#to_disk") #[code StringStore.to_disk]]
|
|
| #[+api("stringstore#to_bytes") #[code StringStore.to_bytes]]
|
|
|
|
+row
|
|
+cell #[code Matcher.load]
|
|
+cell -
|
|
|
|
+row
|
|
+cell
|
|
| #[code Matcher.add_pattern]
|
|
| #[code Matcher.add_entity]
|
|
+cell #[+api("matcher#add") #[code Matcher.add]]
|
|
|
|
+row
|
|
+cell #[code Matcher.get_entity]
|
|
+cell #[+api("matcher#get") #[code Matcher.get]]
|
|
|
|
+row
|
|
+cell #[code Matcher.has_entity]
|
|
+cell #[+api("matcher#contains") #[code Matcher.__contains__]]
|
|
|
|
+row
|
|
+cell #[code Doc.read_bytes]
|
|
+cell
|
|
|
|
+row
|
|
+cell #[code Token.is_ancestor_of]
|
|
+cell #[+api("token#is_ancestor") #[code Token.is_ancestor]]
|
|
|
|
|
|
|
|
+h(2, "migrating") Migrating from spaCy 1.x
|