Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-25 17:36:30 +03:00)
Tidy up and merge usage pages
This commit is contained in: parent 990a70732a, commit 10afb3c796
@@ -1,14 +0,0 @@
//- 💫 DOCS > API > PHILOSOPHY

include ../../_includes/_mixins

p Every product needs to know why it exists. Here's what we're trying to do with spaCy and why it's different from other NLP libraries.

+h(2) 1. No job too big.
p Most programs get cheaper to run over time, but NLP programs often get more expensive. The data often grows faster than the hardware improves. For web-scale tasks, Moore's law can't save us — so if we want to read the web, we have to sweat performance.

+h(2) 2. Take a stand.
p Most NLP toolkits position themselves as platforms, rather than libraries. They offer a pluggable architecture, and leave it to the user to arrange the components they offer into a useful system. This is fine for researchers, but for production users, this does too little. Components go out of date quickly, and configuring a good system takes very detailed knowledge. Compatibility problems can be extremely subtle. spaCy is therefore extremely opinionated. The API does not expose any algorithmic details. You're free to configure another pipeline, but the core library eliminates redundancy, and only offers one choice of each component.

+h(2) 3. Stay current.
p There's often significant improvement in NLP models year-on-year. This has been especially true recently, given the success of deep learning models. With spaCy, you should be able to build things you couldn't build yesterday. To deliver on that promise, we need to give you the latest stuff.
@@ -5,26 +5,23 @@
"Models": "models",
"spaCy 101": "spacy-101",
"Lightning tour": "lightning-tour",
"Visualizers": "visualizers",
"Troubleshooting": "troubleshooting",
"What's new in v2.0": "v2"
},
"Workflows": {
"Loading the pipeline": "language-processing-pipeline",
"Processing text": "processing-text",
"spaCy's data model": "data-model",
"POS tagging": "pos-tagging",
"Using the parse": "dependency-parse",
"Entity recognition": "entity-recognition",
"Custom pipelines": "customizing-pipeline",
"Rule-based matching": "rule-based-matching",
"Word vectors": "word-vectors-similarities",
"Deep learning": "deep-learning",
"Custom tokenization": "customizing-tokenizer",
"Rule-based matching": "rule-based-matching",
"Adding languages": "adding-languages",
"Processing text": "processing-text",
"NLP pipelines": "language-processing-pipeline",
"Deep learning": "deep-learning",
"Training": "training",
"Training NER": "training-ner",
"Saving & loading": "saving-loading"
"Saving & loading": "saving-loading",
"Visualizers": "visualizers"
},
"Examples": {
"Tutorials": "tutorials",
@@ -38,10 +35,6 @@
"quickstart": true
},

"v2": {
"title": "What's new in v2.0"
},

"models": {
"title": "Models",
"next": "spacy-101",
@@ -67,27 +60,13 @@
"next": "resources"
},

"resources": {
"title": "Resources"
"v2": {
"title": "What's new in v2.0"
},

"language-processing-pipeline": {
"title": "Loading a language processing pipeline",
"next": "processing-text"
},

"customizing-pipeline": {
"title": "Customizing the pipeline",
"next": "customizing-tokenizer"
},

"processing-text": {
"title": "Processing text",
"next": "data-model"
},

"data-model": {
"title": "Understanding spaCy's data model"
"pos-tagging": {
"title": "Part-of-speech tagging",
"next": "dependency-parse"
},

"dependency-parse": {
@@ -97,26 +76,44 @@
"entity-recognition": {
"title": "Named Entity Recognition",
"next": "rule-based-matching"
},

"rule-based-matching": {
"title": "Rule-based matching"
"next": "training-ner"
},

"word-vectors-similarities": {
"title": "Using word vectors and semantic similarities"
},

"deep-learning": {
"title": "Hooking a deep learning model into spaCy"
"title": "Using word vectors and semantic similarities",
"next": "customizing-tokenizer"
},

"customizing-tokenizer": {
"title": "Customizing the tokenizer",
"next": "rule-based-matching"
},

"rule-based-matching": {
"title": "Rule-based matching",
"next": "adding-languages"
},

"adding-languages": {
"title": "Adding languages",
"next": "training"
},

"processing-text": {
"title": "Processing text",
"next": "language-processing-pipeline"
},

"language-processing-pipeline": {
"title": "Natural language processing pipelines",
"next": "deep-learning"
},

"deep-learning": {
"title": "Hooking a deep learning model into spaCy",
"next": "training"
},

"training": {
"title": "Training spaCy's statistical models",
"next": "saving-loading"
@@ -131,16 +128,6 @@
"title": "Saving and loading models"
},

"pos-tagging": {
"title": "Part-of-speech tagging",
"next": "dependency-parse"
},

"adding-languages": {
"title": "Adding languages",
"next": "training"
},

"showcase": {
"title": "Showcase",
@@ -104,6 +104,9 @@ p
+image
    include ../../assets/img/docs/language_data.svg
    .u-text-right
        +button("/assets/img/docs/language_data.svg", false, "secondary").u-text-tag View large graphic

+table(["File name", "Variables", "Description"])
    +row
@@ -11,18 +11,56 @@ p
    | #[code spaces] booleans, which allow you to maintain alignment of the
    | tokens into the original string.

+aside("See Also")
    | If you haven't read up on spaCy's #[+a("data-model") data model] yet,
    | you should probably have a look. The main point to keep in mind is that
    | spaCy's #[code Doc] doesn't copy or refer to the original string. The
    | string is reconstructed from the tokens when required.
+aside("spaCy's data model")
    | The main point to keep in mind is that spaCy's #[code Doc] doesn't
    | copy or refer to the original string. The string is reconstructed from
    | the tokens when required.
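p
    | For example (a minimal sketch, assuming the English model is installed),
    | you can verify that the original string is recovered exactly from the
    | tokens and their trailing whitespace:

+code.
    import spacy

    nlp = spacy.load('en')
    text = u"Apple isn't looking at U.K. startups."
    doc = nlp(text)
    # every token remembers whether it was followed by whitespace, so
    # joining the tokens' text_with_ws reproduces the input string exactly
    assert u''.join(token.text_with_ws for token in doc) == text
    assert doc.text == text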
+h(2, "101") Tokenizer 101
|
||||
|
||||
include _spacy-101/_tokenization
|
||||
|
||||
|
||||
+h(3, "101-data") Tokenizer data
|
||||
|
||||
p
|
||||
| #[strong Global] and #[strong language-specific] tokenizer data is
|
||||
| supplied via the language data in #[+src(gh("spaCy", "spacy/lang")) spacy/lang].
|
||||
| The tokenizer exceptions define special cases like "don't" in English,
|
||||
| which needs to be split into two tokens: #[code {ORTH: "do"}] and
|
||||
| #[code {ORTH: "n't", LEMMA: "not"}]. The prefixes, suffixes and infixes
|
||||
| mosty define punctuation rules – for example, when to split off periods
|
||||
| (at the end of a sentence), and when to leave token containing periods
|
||||
| intact (abbreviations like "U.S.").
|
||||
|
||||
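p
    | As a rough sketch of what such an exception entry looks like (the exact
    | structure in #[+src(gh("spaCy", "spacy/lang")) spacy/lang] may differ),
    | the rule for "don't" maps the string to a list of token attribute dicts:

+code.
    from spacy.symbols import ORTH, LEMMA

    # hypothetical excerpt of English tokenizer exceptions: the surface
    # string "don't" is split into "do" and "n't", and "n't" is assigned
    # the lemma "not"
    TOKENIZER_EXCEPTIONS = {
        "don't": [
            {ORTH: "do"},
            {ORTH: "n't", LEMMA: "not"}
        ]
    }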
+image
    include ../../assets/img/docs/language_data.svg
    .u-text-right
        +button("/assets/img/docs/language_data.svg", false, "secondary").u-text-tag View large graphic

+infobox
    | For more details on the language-specific data, see the
    | usage workflow on #[+a("/docs/usage/adding-languages") adding languages].
+h(2, "special-cases") Adding special case tokenization rules
|
||||
|
||||
p
|
||||
| Most domains have at least some idiosyncracies that require custom
|
||||
| tokenization rules. Here's how to add a special case rule to an existing
|
||||
| tokenization rules. This could be very certain expressions, or
|
||||
| abbreviations only used in this specific field.
|
||||
|
||||
+aside("Language data vs. custom tokenization")
|
||||
| Tokenization rules that are specific to one language, but can be
|
||||
| #[strong generalised across that language] should ideally live in the
|
||||
| language data in #[+src(gh("spaCy", "spacy/lang")) spacy/lang] – we
|
||||
| always appreciate pull requests! Anything that's specific to a domain or
|
||||
| text type – like financial trading abbreviations, or Bavarian youth slang
|
||||
| – should be added as a special case rule to your tokenizer instance. If
|
||||
| you're dealing with a lot of customisations, it might make sense to create
|
||||
| an entirely custom subclass.
|
||||
|
||||
p
|
||||
| Here's how to add a special case rule to an existing
|
||||
| #[+api("tokenizer") #[code Tokenizer]] instance:
|
||||
|
||||
+code.
|
||||
|
@@ -30,15 +68,12 @@
    from spacy.symbols import ORTH, LEMMA, POS

    nlp = spacy.load('en')
    assert [w.text for w in nlp(u'gimme that')] == [u'gimme', u'that']
    nlp.tokenizer.add_special_case(u'gimme',
        [
            {
                ORTH: u'gim',
                LEMMA: u'give',
                POS: u'VERB'},
            {
                ORTH: u'me'}])
    doc = nlp(u'gimme that') # phrase to tokenize
    assert [w.text for w in doc] == [u'gimme', u'that'] # current tokenization

    # add special case rule
    special_case = [{ORTH: u'gim', LEMMA: u'give', POS: u'VERB'}, {ORTH: u'me'}]
    nlp.tokenizer.add_special_case(u'gimme', special_case)
    assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that']
    assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'me', u'that']

@@ -55,9 +90,8 @@
    | The special case rules have precedence over the punctuation splitting:

+code.
    nlp.tokenizer.add_special_case(u'...gimme...?',
        [{
            ORTH: u'...gimme...?', LEMMA: u'give', TAG: u'VB'}])
    special_case = [{ORTH: u'...gimme...?', LEMMA: u'give', TAG: u'VB'}]
    nlp.tokenizer.add_special_case(u'...gimme...?', special_case)
    assert len(nlp(u'...gimme...?')) == 1
p

@@ -137,8 +171,8 @@
+h(2, "native-tokenizers") Customizing spaCy's Tokenizer class

p
    | Let's imagine you wanted to create a tokenizer for a new language. There
    | are four things you would need to define:
    | Let's imagine you wanted to create a tokenizer for a new language or
    | specific domain. There are four things you would need to define:

+list("numbers")
    +item
@@ -170,14 +204,14 @@
    import re
    from spacy.tokenizer import Tokenizer

    prefix_re = re.compile(r'''[\[\("']''')
    suffix_re = re.compile(r'''[\]\)"']''')
    def create_tokenizer(nlp):
        return Tokenizer(nlp.vocab,
            prefix_search=prefix_re.search,
            suffix_search=suffix_re.search)
    prefix_re = re.compile(r'''[\[\("']''')
    suffix_re = re.compile(r'''[\]\)"']''')

    nlp = spacy.load('en', tokenizer=create_make_doc)
    def create_tokenizer(nlp):
        return Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
                         suffix_search=suffix_re.search)

    nlp = spacy.load('en', tokenizer=create_tokenizer)

p
    | If you need to subclass the tokenizer instead, the relevant methods to
@@ -191,8 +225,6 @@
    | you're creating the pipeline:

+code.
    import spacy

    nlp = spacy.load('en', make_doc=my_tokenizer)

p
@@ -126,3 +126,40 @@ p
+row
    +cell #[code matcher]
    +cell Supply a pre-built matcher, instead of creating one.

+h(2, "customizing") Customizing the pipeline

p
    | spaCy provides several linguistic annotation functions by default. Each
    | function takes a Doc object, and modifies it in-place. The default
    | pipeline is #[code [nlp.tagger, nlp.entity, nlp.parser]]. spaCy 1.0
    | introduced the ability to customise this pipeline with arbitrary
    | functions.

+code.
    def arbitrary_fixup_rules(doc):
        for token in doc:
            if token.text == u'bill' and token.tag_ == u'NNP':
                token.tag_ = u'NN'

    def custom_pipeline(nlp):
        return (nlp.tagger, arbitrary_fixup_rules, nlp.parser, nlp.entity)

    nlp = spacy.load('en', create_pipeline=custom_pipeline)

p
    | The easiest way to customise the pipeline is to pass a
    | #[code create_pipeline] callback to the #[code spacy.load()] function.

p
    | The callback you pass to #[code create_pipeline] should take a single
    | argument, and return a sequence of callables. Each callable in the
    | sequence should accept a #[code Doc] object and modify it in place.

p
    | Instead of passing a callback, you can also write to the
    | #[code .pipeline] attribute directly.

+code.
    nlp = spacy.load('en')
    nlp.pipeline = [nlp.tagger]