mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 10:46:29 +03:00
Tidy up and merge usage pages
This commit is contained in:
parent
990a70732a
commit
10afb3c796
|
@ -1,14 +0,0 @@
|
||||||
//- 💫 DOCS > API > PHILOSOPHY
|
|
||||||
|
|
||||||
include ../../_includes/_mixins
|
|
||||||
|
|
||||||
p Every product needs to know why it exists. Here's what we're trying to do with spaCy and why it's different from other NLP libraries.
|
|
||||||
|
|
||||||
+h(2) 1. No job too big.
|
|
||||||
p Most programs get cheaper to run over time, but NLP programs often get more expensive. The data often grows faster than the hardware improves. For web-scale tasks, Moore's law can't save us — so if we want to read the web, we have to sweat performance.
|
|
||||||
|
|
||||||
+h(2) 2. Take a stand.
|
|
||||||
p Most NLP toolkits position themselves as platforms, rather than libraries. They offer a pluggable architecture, and leave it to the user to arrange the components they offer into a useful system. This is fine for researchers, but for production users, this does too little. Components go out of date quickly, and configuring a good system takes very detailed knowledge. Compatibility problems can be extremely subtle. spaCy is therefore extremely opinionated. The API does not expose any algorithmic details. You're free to configure another pipeline, but the core library eliminates redundancy, and only offers one choice of each component.
|
|
||||||
|
|
||||||
+h(2) 3. Stay current.
|
|
||||||
p There's often significant improvement in NLP models year-on-year. This has been especially true recently, given the success of deep learning models. With spaCy, you should be able to build things you couldn't build yesterday. To deliver on that promise, we need to be giving you the latest stuff.
|
|
|
@ -5,26 +5,23 @@
|
||||||
"Models": "models",
|
"Models": "models",
|
||||||
"spaCy 101": "spacy-101",
|
"spaCy 101": "spacy-101",
|
||||||
"Lightning tour": "lightning-tour",
|
"Lightning tour": "lightning-tour",
|
||||||
"Visualizers": "visualizers",
|
|
||||||
"Troubleshooting": "troubleshooting",
|
|
||||||
"What's new in v2.0": "v2"
|
"What's new in v2.0": "v2"
|
||||||
},
|
},
|
||||||
"Workflows": {
|
"Workflows": {
|
||||||
"Loading the pipeline": "language-processing-pipeline",
|
|
||||||
"Processing text": "processing-text",
|
|
||||||
"spaCy's data model": "data-model",
|
|
||||||
"POS tagging": "pos-tagging",
|
"POS tagging": "pos-tagging",
|
||||||
"Using the parse": "dependency-parse",
|
"Using the parse": "dependency-parse",
|
||||||
"Entity recognition": "entity-recognition",
|
"Entity recognition": "entity-recognition",
|
||||||
"Custom pipelines": "customizing-pipeline",
|
|
||||||
"Rule-based matching": "rule-based-matching",
|
|
||||||
"Word vectors": "word-vectors-similarities",
|
"Word vectors": "word-vectors-similarities",
|
||||||
"Deep learning": "deep-learning",
|
|
||||||
"Custom tokenization": "customizing-tokenizer",
|
"Custom tokenization": "customizing-tokenizer",
|
||||||
|
"Rule-based matching": "rule-based-matching",
|
||||||
"Adding languages": "adding-languages",
|
"Adding languages": "adding-languages",
|
||||||
|
"Processing text": "processing-text",
|
||||||
|
"NLP pipelines": "language-processing-pipeline",
|
||||||
|
"Deep learning": "deep-learning",
|
||||||
"Training": "training",
|
"Training": "training",
|
||||||
"Training NER": "training-ner",
|
"Training NER": "training-ner",
|
||||||
"Saving & loading": "saving-loading"
|
"Saving & loading": "saving-loading",
|
||||||
|
"Visualizers": "visualizers"
|
||||||
},
|
},
|
||||||
"Examples": {
|
"Examples": {
|
||||||
"Tutorials": "tutorials",
|
"Tutorials": "tutorials",
|
||||||
|
@ -38,10 +35,6 @@
|
||||||
"quickstart": true
|
"quickstart": true
|
||||||
},
|
},
|
||||||
|
|
||||||
"v2": {
|
|
||||||
"title": "What's new in v2.0"
|
|
||||||
},
|
|
||||||
|
|
||||||
"models": {
|
"models": {
|
||||||
"title": "Models",
|
"title": "Models",
|
||||||
"next": "spacy-101",
|
"next": "spacy-101",
|
||||||
|
@ -67,27 +60,13 @@
|
||||||
"next": "resources"
|
"next": "resources"
|
||||||
},
|
},
|
||||||
|
|
||||||
"resources": {
|
"v2": {
|
||||||
"title": "Resources"
|
"title": "What's new in v2.0"
|
||||||
},
|
},
|
||||||
|
|
||||||
"language-processing-pipeline": {
|
"pos-tagging": {
|
||||||
"title": "Loading a language processing pipeline",
|
"title": "Part-of-speech tagging",
|
||||||
"next": "processing-text"
|
"next": "dependency-parse"
|
||||||
},
|
|
||||||
|
|
||||||
"customizing-pipeline": {
|
|
||||||
"title": "Customizing the pipeline",
|
|
||||||
"next": "customizing-tokenizer"
|
|
||||||
},
|
|
||||||
|
|
||||||
"processing-text": {
|
|
||||||
"title": "Processing text",
|
|
||||||
"next": "data-model"
|
|
||||||
},
|
|
||||||
|
|
||||||
"data-model": {
|
|
||||||
"title": "Understanding spaCy's data model"
|
|
||||||
},
|
},
|
||||||
|
|
||||||
"dependency-parse": {
|
"dependency-parse": {
|
||||||
|
@ -97,26 +76,44 @@
|
||||||
|
|
||||||
"entity-recognition": {
|
"entity-recognition": {
|
||||||
"title": "Named Entity Recognition",
|
"title": "Named Entity Recognition",
|
||||||
"next": "rule-based-matching"
|
"next": "training-ner"
|
||||||
},
|
|
||||||
|
|
||||||
"rule-based-matching": {
|
|
||||||
"title": "Rule-based matching"
|
|
||||||
},
|
},
|
||||||
|
|
||||||
"word-vectors-similarities": {
|
"word-vectors-similarities": {
|
||||||
"title": "Using word vectors and semantic similarities"
|
"title": "Using word vectors and semantic similarities",
|
||||||
},
|
"next": "customizing-tokenizer"
|
||||||
|
|
||||||
"deep-learning": {
|
|
||||||
"title": "Hooking a deep learning model into spaCy"
|
|
||||||
},
|
},
|
||||||
|
|
||||||
"customizing-tokenizer": {
|
"customizing-tokenizer": {
|
||||||
"title": "Customizing the tokenizer",
|
"title": "Customizing the tokenizer",
|
||||||
|
"next": "rule-based-matching"
|
||||||
|
},
|
||||||
|
|
||||||
|
"rule-based-matching": {
|
||||||
|
"title": "Rule-based matching",
|
||||||
"next": "adding-languages"
|
"next": "adding-languages"
|
||||||
},
|
},
|
||||||
|
|
||||||
|
"adding-languages": {
|
||||||
|
"title": "Adding languages",
|
||||||
|
"next": "training"
|
||||||
|
},
|
||||||
|
|
||||||
|
"processing-text": {
|
||||||
|
"title": "Processing text",
|
||||||
|
"next": "language-processing-pipeline"
|
||||||
|
},
|
||||||
|
|
||||||
|
"language-processing-pipeline": {
|
||||||
|
"title": "Natural language processing pipelines",
|
||||||
|
"next": "deep-learning"
|
||||||
|
},
|
||||||
|
|
||||||
|
"deep-learning": {
|
||||||
|
"title": "Hooking a deep learning model into spaCy",
|
||||||
|
"next": "training"
|
||||||
|
},
|
||||||
|
|
||||||
"training": {
|
"training": {
|
||||||
"title": "Training spaCy's statistical models",
|
"title": "Training spaCy's statistical models",
|
||||||
"next": "saving-loading"
|
"next": "saving-loading"
|
||||||
|
@ -131,16 +128,6 @@
|
||||||
"title": "Saving and loading models"
|
"title": "Saving and loading models"
|
||||||
},
|
},
|
||||||
|
|
||||||
"pos-tagging": {
|
|
||||||
"title": "Part-of-speech tagging",
|
|
||||||
"next": "dependency-parse"
|
|
||||||
},
|
|
||||||
|
|
||||||
"adding-languages": {
|
|
||||||
"title": "Adding languages",
|
|
||||||
"next": "training"
|
|
||||||
},
|
|
||||||
|
|
||||||
"showcase": {
|
"showcase": {
|
||||||
"title": "Showcase",
|
"title": "Showcase",
|
||||||
|
|
||||||
|
|
|
@ -104,6 +104,9 @@ p
|
||||||
|
|
||||||
+image
|
+image
|
||||||
include ../../assets/img/docs/language_data.svg
|
include ../../assets/img/docs/language_data.svg
|
||||||
|
.u-text-right
|
||||||
|
+button("/assets/img/docs/language_data.svg", false, "secondary").u-text-tag View large graphic
|
||||||
|
|
||||||
|
|
||||||
+table(["File name", "Variables", "Description"])
|
+table(["File name", "Variables", "Description"])
|
||||||
+row
|
+row
|
||||||
|
|
|
@ -11,18 +11,56 @@ p
|
||||||
| #[code spaces] booleans, which allow you to maintain alignment of the
|
| #[code spaces] booleans, which allow you to maintain alignment of the
|
||||||
| tokens into the original string.
|
| tokens into the original string.
|
||||||
|
|
||||||
+aside("See Also")
|
+aside("spaCy's data model")
|
||||||
| If you haven't read up on spaCy's #[+a("data-model") data model] yet,
|
| The main point to keep in mind is that spaCy's #[code Doc] doesn't
|
||||||
| you should probably have a look. The main point to keep in mind is that
|
| copy or refer to the original string. The string is reconstructed from
|
||||||
| spaCy's #[code Doc] doesn't copy or refer to the original string. The
|
| the tokens when required.
|
||||||
| string is reconstructed from the tokens when required.
|
|
||||||
|
|
||||||
|
+h(2, "101") Tokenizer 101
|
||||||
|
|
||||||
|
include _spacy-101/_tokenization
|
||||||
|
|
||||||
|
|
||||||
|
+h(3, "101-data") Tokenizer data
|
||||||
|
|
||||||
|
p
|
||||||
|
| #[strong Global] and #[strong language-specific] tokenizer data is
|
||||||
|
| supplied via the language data in #[+src(gh("spaCy", "spacy/lang")) spacy/lang].
|
||||||
|
| The tokenizer exceptions define special cases like "don't" in English,
|
||||||
|
| which needs to be split into two tokens: #[code {ORTH: "do"}] and
|
||||||
|
| #[code {ORTH: "n't", LEMMA: "not"}]. The prefixes, suffixes and infixes
|
||||||
|
| mostly define punctuation rules – for example, when to split off periods
|
||||||
|
| (at the end of a sentence), and when to leave tokens containing periods
|
||||||
|
| intact (abbreviations like "U.S.").
|
||||||
|
|
||||||
|
+image
|
||||||
|
include ../../assets/img/docs/language_data.svg
|
||||||
|
.u-text-right
|
||||||
|
+button("/assets/img/docs/language_data.svg", false, "secondary").u-text-tag View large graphic
|
||||||
|
|
||||||
|
+infobox
|
||||||
|
| For more details on the language-specific data, see the
|
||||||
|
| usage workflow on #[+a("/docs/usage/adding-languages") adding languages].
|
||||||
|
|
||||||
+h(2, "special-cases") Adding special case tokenization rules
|
+h(2, "special-cases") Adding special case tokenization rules
|
||||||
|
|
||||||
p
|
p
|
||||||
| Most domains have at least some idiosyncrasies that require custom
|
| Most domains have at least some idiosyncrasies that require custom
|
||||||
| tokenization rules. Here's how to add a special case rule to an existing
|
| tokenization rules. This could be very specific expressions, or
|
||||||
|
| abbreviations only used in this specific field.
|
||||||
|
|
||||||
|
+aside("Language data vs. custom tokenization")
|
||||||
|
| Tokenization rules that are specific to one language, but can be
|
||||||
|
| #[strong generalised across that language] should ideally live in the
|
||||||
|
| language data in #[+src(gh("spaCy", "spacy/lang")) spacy/lang] – we
|
||||||
|
| always appreciate pull requests! Anything that's specific to a domain or
|
||||||
|
| text type – like financial trading abbreviations, or Bavarian youth slang
|
||||||
|
| – should be added as a special case rule to your tokenizer instance. If
|
||||||
|
| you're dealing with a lot of customisations, it might make sense to create
|
||||||
|
| an entirely custom subclass.
|
||||||
|
|
||||||
|
p
|
||||||
|
| Here's how to add a special case rule to an existing
|
||||||
| #[+api("tokenizer") #[code Tokenizer]] instance:
|
| #[+api("tokenizer") #[code Tokenizer]] instance:
|
||||||
|
|
||||||
+code.
|
+code.
|
||||||
|
@ -30,15 +68,12 @@ p
|
||||||
from spacy.symbols import ORTH, LEMMA, POS
|
from spacy.symbols import ORTH, LEMMA, POS
|
||||||
|
|
||||||
nlp = spacy.load('en')
|
nlp = spacy.load('en')
|
||||||
assert [w.text for w in nlp(u'gimme that')] == [u'gimme', u'that']
|
doc = nlp(u'gimme that') # phrase to tokenize
|
||||||
nlp.tokenizer.add_special_case(u'gimme',
|
assert [w.text for w in doc] == [u'gimme', u'that'] # current tokenization
|
||||||
[
|
|
||||||
{
|
# add special case rule
|
||||||
ORTH: u'gim',
|
special_case = [{ORTH: u'gim', LEMMA: u'give', POS: u'VERB'}, {ORTH: u'me'}]
|
||||||
LEMMA: u'give',
|
nlp.tokenizer.add_special_case(u'gimme', special_case)
|
||||||
POS: u'VERB'},
|
|
||||||
{
|
|
||||||
ORTH: u'me'}])
|
|
||||||
assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that']
|
assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that']
|
||||||
assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'me', u'that']
|
assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'me', u'that']
|
||||||
|
|
||||||
|
@ -55,9 +90,8 @@ p
|
||||||
| The special case rules have precedence over the punctuation splitting:
|
| The special case rules have precedence over the punctuation splitting:
|
||||||
|
|
||||||
+code.
|
+code.
|
||||||
nlp.tokenizer.add_special_case(u'...gimme...?',
|
special_case = [{ORTH: u'...gimme...?', LEMMA: u'give', TAG: u'VB'}]
|
||||||
[{
|
nlp.tokenizer.add_special_case(u'...gimme...?', special_case)
|
||||||
ORTH: u'...gimme...?', LEMMA: u'give', TAG: u'VB'}])
|
|
||||||
assert len(nlp(u'...gimme...?')) == 1
|
assert len(nlp(u'...gimme...?')) == 1
|
||||||
|
|
||||||
p
|
p
|
||||||
|
@ -137,8 +171,8 @@ p
|
||||||
+h(2, "native-tokenizers") Customizing spaCy's Tokenizer class
|
+h(2, "native-tokenizers") Customizing spaCy's Tokenizer class
|
||||||
|
|
||||||
p
|
p
|
||||||
| Let's imagine you wanted to create a tokenizer for a new language. There
|
| Let's imagine you wanted to create a tokenizer for a new language or
|
||||||
| are four things you would need to define:
|
| specific domain. There are four things you would need to define:
|
||||||
|
|
||||||
+list("numbers")
|
+list("numbers")
|
||||||
+item
|
+item
|
||||||
|
@ -170,14 +204,14 @@ p
|
||||||
import re
|
import re
|
||||||
from spacy.tokenizer import Tokenizer
|
from spacy.tokenizer import Tokenizer
|
||||||
|
|
||||||
prefix_re = re.compile(r'''[\[\("']''')
|
prefix_re = re.compile(r'''[\[\("']''')
|
||||||
suffix_re = re.compile(r'''[\]\)"']''')
|
suffix_re = re.compile(r'''[\]\)"']''')
|
||||||
|
|
||||||
def create_tokenizer(nlp):
|
def create_tokenizer(nlp):
|
||||||
return Tokenizer(nlp.vocab,
|
return Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
|
||||||
prefix_search=prefix_re.search,
|
|
||||||
suffix_search=suffix_re.search)
|
suffix_search=suffix_re.search)
|
||||||
|
|
||||||
nlp = spacy.load('en', tokenizer=create_make_doc)
|
nlp = spacy.load('en', tokenizer=create_tokenizer)
|
||||||
|
|
||||||
p
|
p
|
||||||
| If you need to subclass the tokenizer instead, the relevant methods to
|
| If you need to subclass the tokenizer instead, the relevant methods to
|
||||||
|
@ -191,8 +225,6 @@ p
|
||||||
| you're creating the pipeline:
|
| you're creating the pipeline:
|
||||||
|
|
||||||
+code.
|
+code.
|
||||||
import spacy
|
|
||||||
|
|
||||||
nlp = spacy.load('en', make_doc=my_tokenizer)
|
nlp = spacy.load('en', make_doc=my_tokenizer)
|
||||||
|
|
||||||
p
|
p
|
||||||
|
|
|
@ -126,3 +126,40 @@ p
|
||||||
+row
|
+row
|
||||||
+cell #[code matcher]
|
+cell #[code matcher]
|
||||||
+cell Supply a pre-built matcher, instead of creating one.
|
+cell Supply a pre-built matcher, instead of creating one.
|
||||||
|
|
||||||
|
+h(2, "customizing") Customizing the pipeline
|
||||||
|
|
||||||
|
p
|
||||||
|
| spaCy provides several linguistic annotation functions by default. Each
|
||||||
|
| function takes a Doc object, and modifies it in-place. The default
|
||||||
|
| pipeline is #[code [nlp.tagger, nlp.entity, nlp.parser]]. spaCy 1.0
|
||||||
|
| introduced the ability to customise this pipeline with arbitrary
|
||||||
|
| functions.
|
||||||
|
|
||||||
|
+code.
|
||||||
|
def arbitrary_fixup_rules(doc):
|
||||||
|
for token in doc:
|
||||||
|
if token.text == u'bill' and token.tag_ == u'NNP':
|
||||||
|
token.tag_ = u'NN'
|
||||||
|
|
||||||
|
def custom_pipeline(nlp):
|
||||||
|
return (nlp.tagger, arbitrary_fixup_rules, nlp.parser, nlp.entity)
|
||||||
|
|
||||||
|
nlp = spacy.load('en', create_pipeline=custom_pipeline)
|
||||||
|
|
||||||
|
p
|
||||||
|
| The easiest way to customise the pipeline is to pass a
|
||||||
|
| #[code create_pipeline] callback to the #[code spacy.load()] function.
|
||||||
|
|
||||||
|
p
|
||||||
|
| The callback you pass to #[code create_pipeline] should take a single
|
||||||
|
| argument, and return a sequence of callables. Each callable in the
|
||||||
|
| sequence should accept a #[code Doc] object and modify it in place.
|
||||||
|
|
||||||
|
p
|
||||||
|
| Instead of passing a callback, you can also write to the
|
||||||
|
| #[code .pipeline] attribute directly.
|
||||||
|
|
||||||
|
+code.
|
||||||
|
nlp = spacy.load('en')
|
||||||
|
nlp.pipeline = [nlp.tagger]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user