mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Rename processing text to production use and remove linear feature scheme
This commit is contained in:
parent
419d265ff0
commit
0f48fb1f97
|
@ -27,8 +27,7 @@
|
|||
"GoldCorpus": "goldcorpus"
|
||||
},
|
||||
"Other": {
|
||||
"Annotation Specs": "annotation",
|
||||
"Feature Scheme": "features"
|
||||
"Annotation Specs": "annotation"
|
||||
}
|
||||
},
|
||||
|
||||
|
@ -143,9 +142,5 @@
|
|||
|
||||
"annotation": {
|
||||
"title": "Annotation Specifications"
|
||||
},
|
||||
|
||||
"features": {
|
||||
"title": "Linear Model Feature Scheme"
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,138 +0,0 @@
|
|||
//- 💫 DOCS > API > LINEAR MODEL FEATURES
|
||||
|
||||
include ../../_includes/_mixins
|
||||
|
||||
p
|
||||
| There are two popular strategies for putting together machine learning
|
||||
| models for NLP: sparse linear models, and neural networks. To solve NLP
|
||||
| problems with linear models, feature templates need to be assembled that
|
||||
| combine multiple atomic predictors. This page documents the atomic
|
||||
| predictors used in the spaCy 1.0 #[+api("parser") #[code Parser]],
|
||||
| #[+api("tagger") #[code Tagger]] and
|
||||
| #[+api("entityrecognizer") #[code EntityRecognizer]].
|
||||
|
||||
p
|
||||
| To understand the scheme, recall that spaCy's #[code Parser] and
|
||||
| #[code EntityRecognizer] are implemented as push-down automata. They
|
||||
| maintain a "stack" that holds the current entity, and a "buffer"
|
||||
| consisting of the words to be processed.
|
||||
|
||||
p
|
||||
| Each state consists of the words on the stack (if any), which constitute
|
||||
| the current entity being constructed. We also have the current word, and
|
||||
| the two subsequent words. Finally, we also have the entities previously
|
||||
| built.
|
||||
|
||||
p
|
||||
| This gives us a number of tokens to ask questions about, to make the
|
||||
| features. About each of these tokens, we can ask about a number of
|
||||
| different properties. Each feature identifier asks about a specific
|
||||
| property of a specific token of the context.
|
||||
|
||||
+h(2, "tokens") Context tokens
|
||||
|
||||
+table([ "ID", "Description" ])
|
||||
+row
|
||||
+cell #[code S0]
|
||||
+cell
|
||||
| The first word on the stack, i.e. the token most recently added
|
||||
| to the current entity.
|
||||
|
||||
+row
|
||||
+cell #[code S1]
|
||||
+cell The second word on the stack, i.e. the second most recently added.
|
||||
|
||||
+row
|
||||
+cell #[code S2]
|
||||
+cell The third word on the stack, i.e. the third most recently added.
|
||||
|
||||
+row
|
||||
+cell #[code N0]
|
||||
+cell The first word of the buffer, i.e. the current word being tagged.
|
||||
|
||||
+row
|
||||
+cell #[code N1]
|
||||
+cell The second word of the buffer.
|
||||
|
||||
+row
|
||||
+cell #[code N2]
|
||||
+cell The third word of the buffer.
|
||||
|
||||
+row
|
||||
+cell #[code P1]
|
||||
+cell The word immediately before #[code N0].
|
||||
|
||||
+row
|
||||
+cell #[code P2]
|
||||
+cell The second word before #[code N0].
|
||||
|
||||
+row
|
||||
+cell #[code E0]
|
||||
+cell The first word of the previously constructed entity.
|
||||
|
||||
+row
|
||||
+cell #[code E1]
|
||||
+cell The first word of the second previously constructed entity.
|
||||
|
||||
p About each of these tokens, we can ask:
|
||||
|
||||
+table([ "ID", "Attribute", "Description" ])
|
||||
+row
|
||||
+cell #[code N0w]
|
||||
+cell #[code token.orth]
|
||||
+cell The word form.
|
||||
|
||||
+row
|
||||
+cell #[code N0W]
|
||||
+cell #[code token.lemma]
|
||||
+cell The word's lemma.
|
||||
|
||||
+row
|
||||
+cell #[code N0p]
|
||||
+cell #[code token.tag]
|
||||
+cell The word's (full) POS tag.
|
||||
|
||||
+row
|
||||
+cell #[code N0c]
|
||||
+cell #[code token.cluster]
|
||||
+cell The word's (full) Brown cluster.
|
||||
|
||||
+row
|
||||
+cell #[code N0c4]
|
||||
+cell -
|
||||
+cell First four digit prefix of the word's Brown cluster.
|
||||
|
||||
+row
|
||||
+cell #[code N0c6]
|
||||
+cell -
|
||||
+cell First six digit prefix of the word's Brown cluster.
|
||||
|
||||
+row
|
||||
+cell #[code N0L]
|
||||
+cell -
|
||||
+cell The word's dependency label. Not used as a feature in the NER.
|
||||
|
||||
+row
|
||||
+cell #[code N0_prefix]
|
||||
+cell #[code token.prefix]
|
||||
+cell The first three characters of the word.
|
||||
|
||||
+row
|
||||
+cell #[code N0_suffix]
|
||||
+cell #[code token.suffix]
|
||||
+cell The last three characters of the word.
|
||||
|
||||
+row
|
||||
+cell #[code N0_shape]
|
||||
+cell #[code token.shape]
|
||||
+cell The word's shape, i.e. is it alphabetic, numeric, etc.
|
||||
|
||||
+row
|
||||
+cell #[code N0_ne_iob]
|
||||
+cell #[code token.ent_iob]
|
||||
+cell The Inside/Outside/Begin code of the word's NER tag.
|
||||
|
||||
+row
|
||||
+cell #[code N0_ne_type]
|
||||
+cell #[code token.ent_type]
|
||||
+cell The word's NER type.
|
|
@ -15,9 +15,9 @@
|
|||
"Custom tokenization": "customizing-tokenizer",
|
||||
"Rule-based matching": "rule-based-matching",
|
||||
"Adding languages": "adding-languages",
|
||||
"Processing text": "processing-text",
|
||||
"NLP pipelines": "language-processing-pipeline",
|
||||
"Deep learning": "deep-learning",
|
||||
"Production use": "production-use",
|
||||
"Training": "training",
|
||||
"Training NER": "training-ner",
|
||||
"Saving & loading": "saving-loading",
|
||||
|
@ -99,11 +99,6 @@
|
|||
"next": "training"
|
||||
},
|
||||
|
||||
"processing-text": {
|
||||
"title": "Processing text",
|
||||
"next": "language-processing-pipeline"
|
||||
},
|
||||
|
||||
"language-processing-pipeline": {
|
||||
"title": "Language processing pipelines",
|
||||
"next": "deep-learning"
|
||||
|
@ -111,9 +106,15 @@
|
|||
|
||||
"deep-learning": {
|
||||
"title": "Hooking a deep learning model into spaCy",
|
||||
"next": "production-use"
|
||||
},
|
||||
|
||||
"production-use": {
|
||||
"title": "Production use",
|
||||
"next": "training"
|
||||
},
|
||||
|
||||
|
||||
"training": {
|
||||
"title": "Training spaCy's statistical models",
|
||||
"next": "saving-loading"
|
||||
|
|
|
@ -6,69 +6,6 @@ p
|
|||
| Once you have loaded the #[code nlp] object, you can call it as though
|
||||
| it were a function. This allows you to process a single unicode string.
|
||||
|
||||
+code.
|
||||
doc = nlp(u'Hello, world! A three sentence document.\nWith new lines...')
|
||||
|
||||
p
|
||||
| The library should perform equally well with #[strong short or long documents].
|
||||
| All algorithms are linear-time in the length of the string, and once the
|
||||
| data is loaded, there's no significant start-up cost to consider. This
|
||||
| means that you don't have to strategically merge or split your text —
|
||||
| you should feel free to feed in either single tweets or whole novels.
|
||||
|
||||
p
|
||||
| If you run #[+api("spacy#load") #[code spacy.load('en')]], spaCy will
|
||||
| load the #[+a("/docs/usage/models") model] associated with the name
|
||||
| #[code 'en']. Each model is a Python package containing an
|
||||
| #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py"))__init__.py]
|
||||
|
||||
the #[code nlp] object will
|
||||
| be an instance of #[code spacy.en.English]. This means that when you run
|
||||
| #[code doc = nlp(text)], you're executing
|
||||
| #[code spacy.en.English.__call__], which is implemented on its parent
|
||||
| class, #[+api("language") #[code Language]].
|
||||
|
||||
+code.
|
||||
doc = nlp.make_doc(text)
|
||||
for proc in nlp.pipeline:
|
||||
proc(doc)
|
||||
|
||||
p
|
||||
| I've tried to make sure that the #[code Language.__call__] function
|
||||
| doesn't do any "heavy lifting", so that you won't have complicated logic
|
||||
| to replicate if you need to make your own pipeline class. This is all it
|
||||
| does.
|
||||
|
||||
p
|
||||
| The #[code .make_doc()] method and #[code .pipeline] attribute make it
|
||||
| easier to customise spaCy's behaviour. If you're using the default
|
||||
| pipeline, we can desugar one more time.
|
||||
|
||||
+code.
|
||||
doc = nlp.tokenizer(text)
|
||||
nlp.tagger(doc)
|
||||
nlp.parser(doc)
|
||||
nlp.entity(doc)
|
||||
|
||||
p Finally, here's where you can find out about each of those components:
|
||||
|
||||
+table(["Name", "Source"])
|
||||
+row
|
||||
+cell #[code tokenizer]
|
||||
+cell #[+src(gh("spacy", "spacy/tokenizer.pyx")) spacy.tokenizer.Tokenizer]
|
||||
|
||||
+row
|
||||
+cell #[code tagger]
|
||||
+cell #[+src(gh("spacy", "spacy/tagger.pyx")) spacy.pipeline.Tagger]
|
||||
|
||||
+row
|
||||
+cell #[code parser]
|
||||
+cell #[+src(gh("spacy", "spacy/syntax/parser.pyx")) spacy.pipeline.DependencyParser]
|
||||
|
||||
+row
|
||||
+cell #[code entity]
|
||||
+cell #[+src(gh("spacy", "spacy/syntax/parser.pyx")) spacy.pipeline.EntityRecognizer]
|
||||
|
||||
+h(2, "multithreading") Multi-threading with #[code .pipe()]
|
||||
|
||||
p
|
Loading…
Reference in New Issue
Block a user