mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	Update architecture blurb and move other info
This commit is contained in:
		
							parent
							
								
									f2c4a9f690
								
							
						
					
					
						commit
						1d6377218a
					
				|  | @ -1,15 +1,83 @@ | |||
| //- 💫 DOCS > USAGE > SPACY 101 > ARCHITECTURE | ||||
| 
 | ||||
| p | ||||
|     |  The central data structures in spaCy are the #[code Doc] and the #[code Vocab]. | ||||
|     |  The #[code doc] object owns the sequence of tokens and all their annotations. | ||||
|     |  the #[code vocab] owns a set of look-up tables that make common information | ||||
|     |  available across documents. By centralising strings, word vectors and lexical | ||||
|     |  attributes, we avoid storing multiple copies of this data. This saves memory, and | ||||
|     |  ensures there's a single source of truth. Text annotations are also designed to | ||||
|     |  allow a single source of truth: the #[code Doc] object owns the data, and | ||||
|     |  #[code Span] and #[code Token] are views that point into it. The #[code Doc] | ||||
|     |  object is constructed by the #[code Tokenizer], and then modified in-place by | ||||
|     |  the components of the pipeline. The #[code Language] object coordinates these | ||||
|     |  components. It takes raw text and sends it through the pipeline, returning | ||||
|     |  an annotated document. It also orchestrates training and serialisation. | ||||
|     |  The central data structures in spaCy are the #[code Doc] and the | ||||
|     |  #[code Vocab]. The #[code Doc] object owns the | ||||
|     |  #[strong sequence of tokens] and all their annotations. The #[code Vocab] | ||||
|     |  object owns a set of #[strong look-up tables] that make common | ||||
|     |  information available across documents. By centralizing strings, word | ||||
|     |  vectors and lexical attributes, we avoid storing multiple copies of this | ||||
|     |  data. This saves memory, and ensures there's a | ||||
|     |  #[strong single source of truth]. | ||||
| 
 | ||||
| p | ||||
|     |  Text annotations are also designed to allow a single source of truth: the | ||||
|     |  #[code Doc] object owns the data, and #[code Span] and #[code Token] are | ||||
|     |  #[strong views that point into it]. The #[code Doc] object is constructed | ||||
|     |  by the #[code Tokenizer], and then #[strong modified in place] by the | ||||
|     |  components of the pipeline. The #[code Language] object coordinates these | ||||
|     |  components. It takes raw text and sends it through the pipeline, | ||||
|     |  returning an #[strong annotated document]. It also orchestrates training | ||||
|     |  and serialization. | ||||
| 
 | ||||
| +image | ||||
|     include ../../../assets/img/docs/architecture.svg | ||||
|     .u-text-right | ||||
|         +button("/assets/img/docs/architecture.svg", false, "secondary").u-text-tag View large graphic | ||||
| 
 | ||||
| +table(["Name", "Description"]) | ||||
|     +row | ||||
|         +cell #[+api("language") #[code Language]] | ||||
|         +cell | ||||
|             |  A text-processing pipeline. Usually you'll load this once per | ||||
|             |  process as #[code nlp] and pass the instance around your application. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[+api("doc") #[code Doc]] | ||||
|         +cell A container for accessing linguistic annotations. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[+api("span") #[code Span]] | ||||
|         +cell A slice from a #[code Doc] object. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[+api("token") #[code Token]] | ||||
|         +cell | ||||
|             |  An individual token — i.e. a word, punctuation symbol, whitespace, | ||||
|             |  etc. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[+api("lexeme") #[code Lexeme]] | ||||
|         +cell | ||||
|             |  An entry in the vocabulary. It's a word type with no context, as | ||||
|             |  opposed to a word token. It therefore has no part-of-speech tag, | ||||
|             |  dependency parse, etc. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[+api("vocab") #[code Vocab]] | ||||
|         +cell | ||||
|             |  A lookup table for the vocabulary that allows you to access | ||||
|             |  #[code Lexeme] objects. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code Morphology] | ||||
|         +cell | ||||
|             |  Assign linguistic features like lemmas, noun case, verb tense, etc. | ||||
|             |  based on the word and its part-of-speech tag. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[+api("stringstore") #[code StringStore]] | ||||
|         +cell Map strings to and from hash values. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[+api("tokenizer") #[code Tokenizer]] | ||||
|         +cell | ||||
|             |  Segment text, and create #[code Doc] objects with the discovered | ||||
|             |  segment boundaries. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[+api("matcher") #[code Matcher]] | ||||
|         +cell | ||||
|             |  Match sequences of tokens, based on pattern rules, similar to | ||||
|             |  regular expressions. | ||||
|  |  | |||
|  | @ -274,68 +274,6 @@ include _spacy-101/_language-data | |||
| 
 | ||||
| include _spacy-101/_architecture.jade | ||||
| 
 | ||||
| +image | ||||
|     include ../../assets/img/docs/architecture.svg | ||||
|     .u-text-right | ||||
|         +button("/assets/img/docs/architecture.svg", false, "secondary").u-text-tag View large graphic | ||||
| 
 | ||||
| +table(["Name", "Description"]) | ||||
|     +row | ||||
|         +cell #[+api("language") #[code Language]] | ||||
|         +cell | ||||
|             |  A text-processing pipeline. Usually you'll load this once per | ||||
|             |  process as #[code nlp] and pass the instance around your application. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[+api("doc") #[code Doc]] | ||||
|         +cell A container for accessing linguistic annotations. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[+api("span") #[code Span]] | ||||
|         +cell A slice from a #[code Doc] object. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[+api("token") #[code Token]] | ||||
|         +cell | ||||
|             |  An individual token — i.e. a word, punctuation symbol, whitespace, | ||||
|             |  etc. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[+api("lexeme") #[code Lexeme]] | ||||
|         +cell | ||||
|             |  An entry in the vocabulary. It's a word type with no context, as | ||||
|             |  opposed to a word token. It therefore has no part-of-speech tag, | ||||
|             |  dependency parse etc. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[+api("vocab") #[code Vocab]] | ||||
|         +cell | ||||
|             |  A lookup table for the vocabulary that allows you to access | ||||
|             |  #[code Lexeme] objects. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code Morphology] | ||||
|         +cell | ||||
|             |  Assign linguistic features like lemmas, noun case, verb tense etc. | ||||
|             |  based on the word and its part-of-speech tag. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[+api("stringstore") #[code StringStore]] | ||||
|         +cell Map strings to and from hash values. | ||||
| 
 | ||||
|     +row | ||||
|         +row | ||||
|         +cell #[+api("tokenizer") #[code Tokenizer]] | ||||
|         +cell | ||||
|             |  Segment text, and create #[code Doc] objects with the discovered | ||||
|             |  segment boundaries. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[+api("matcher") #[code Matcher]] | ||||
|         +cell | ||||
|             |  Match sequences of tokens, based on pattern rules, similar to | ||||
|             |  regular expressions. | ||||
| 
 | ||||
| +h(3, "architecture-pipeline") Pipeline components | ||||
| 
 | ||||
| +table(["Name", "Description"]) | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user