//- 💫 DOCS > USAGE > SPACY 101 > LANGUAGE DATA

p
    |  Every language is different – and usually full of
    |  #[strong exceptions and special cases], especially amongst the most
    |  common words. Some of these exceptions are shared across languages,
    |  while others are #[strong entirely specific] – usually so specific that
    |  they need to be hard-coded. The
    |  #[+src(gh("spaCy", "spacy/lang")) #[code lang]] module contains all
    |  language-specific data, organised in simple Python files. This makes
    |  the data easy to update and extend.
p
    |  The #[strong shared language data] in the directory root includes rules
    |  that can be generalised across languages – for example, rules for basic
    |  punctuation, emoji, emoticons, single-letter abbreviations and norms for
    |  equivalent tokens with different spellings, like #[code "] and
    |  #[code ”]. This helps the models make more accurate predictions.
    |  The #[strong individual language data] in a submodule contains rules
    |  that are only relevant to a particular language. It also takes care of
    |  putting together all components and creating the #[code Language]
    |  subclass – for example, #[code English] or #[code German].
+aside-code.
    from spacy.lang.en import English
    from spacy.lang.de import German

    nlp_en = English()  # includes English data
    nlp_de = German()   # includes German data

+graphic("/assets/img/language_data.svg")
    include ../../assets/img/language_data.svg
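p
    |  For illustration, here's a small sketch of how the shared norms surface
    |  on tokens via the #[code norm_] attribute – the exact mappings depend
    |  on the data shipped with your spaCy version:

+code.
    from spacy.lang.en import English

    nlp = English()
    doc = nlp(u'“Hello”')
    # the shared norms map equivalent spellings to one form, so both
    # curly quotes should come out as a plain " here
    print([token.norm_ for token in doc])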
+table(["Name", "Description"])
    +row
        +cell #[strong Stop words]#[br]
            |  #[+src(gh("spacy-dev-resources", "templates/new_language/stop_words.py")) #[code stop_words.py]]
        +cell
            |  List of most common words of a language that are often useful to
            |  filter out, for example "and" or "I". Matching tokens will
            |  return #[code True] for #[code is_stop], as shown in the sketch
            |  after this table.

    +row
        +cell #[strong Tokenizer exceptions]#[br]
            |  #[+src(gh("spacy-dev-resources", "templates/new_language/tokenizer_exceptions.py")) #[code tokenizer_exceptions.py]]
        +cell
            |  Special-case rules for the tokenizer, for example, contractions
            |  like "can't" and abbreviations with punctuation, like "U.K."
            |  (see the sketch after this table).

    +row
        +cell #[strong Norm exceptions]#[br]
            |  #[+src(gh("spaCy", "spacy/lang/norm_exceptions.py")) #[code norm_exceptions.py]]
        +cell
            |  Special-case rules for normalising tokens to improve the model's
            |  predictions, for example on American vs. British spelling.

    +row
        +cell #[strong Punctuation rules]#[br]
            |  #[+src(gh("spaCy", "spacy/lang/punctuation.py")) #[code punctuation.py]]
        +cell
            |  Regular expressions for splitting tokens, e.g. on punctuation or
            |  special characters like emoji. Includes rules for prefixes,
            |  suffixes and infixes.

    +row
        +cell #[strong Character classes]#[br]
            |  #[+src(gh("spaCy", "spacy/lang/char_classes.py")) #[code char_classes.py]]
        +cell
            |  Character classes to be used in regular expressions, for example,
            |  Latin characters, quotes, hyphens or icons.

    +row
        +cell #[strong Lexical attributes]#[br]
            |  #[+src(gh("spacy-dev-resources", "templates/new_language/lex_attrs.py")) #[code lex_attrs.py]]
        +cell
            |  Custom functions for setting lexical attributes on tokens, e.g.
            |  #[code like_num], which includes language-specific words like
            |  "ten" or "hundred".

    +row
        +cell #[strong Syntax iterators]#[br]
            |  #[+src(gh("spaCy", "spacy/lang/en/syntax_iterators.py")) #[code syntax_iterators.py]]
        +cell
            |  Functions that compute views of a #[code Doc] object based on its
            |  syntax. At the moment, only used for
            |  #[+a("/usage/linguistic-features#noun-chunks") noun chunks].

    +row
        +cell #[strong Lemmatizer]#[br]
            |  #[+src(gh("spacy-dev-resources", "templates/new_language/lemmatizer.py")) #[code lemmatizer.py]]
        +cell
            |  Lemmatization rules or a lookup-based lemmatization table to
            |  assign base forms, for example "be" for "was".

    +row
        +cell #[strong Tag map]#[br]
            |  #[+src(gh("spacy-dev-resources", "templates/new_language/tag_map.py")) #[code tag_map.py]]
        +cell
            |  Dictionary mapping strings in your tag set to
            |  #[+a("http://universaldependencies.org/u/pos/all.html") Universal Dependencies]
            |  tags.

    +row
        +cell #[strong Morph rules]#[br]
            |  #[+src(gh("spaCy", "spacy/lang/en/morph_rules.py")) #[code morph_rules.py]]
        +cell
            |  Exception rules for morphological analysis of irregular words
            |  like personal pronouns.
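p
    |  To give a rough idea of how this data surfaces in practice, here's a
    |  small sketch using the English defaults. The #[code ca]/#[code n't]
    |  split comes from the tokenizer exceptions, while #[code is_stop] and
    |  #[code like_num] are driven by the stop words and lexical attributes.
    |  Exact values depend on the data shipped with your version.

+code.
    from spacy.lang.en import English

    nlp = English()
    doc = nlp(u"I can't count to ten.")
    for token in doc:
        # is_stop comes from stop_words.py, like_num from lex_attrs.py
        print(token.text, token.is_stop, token.like_num)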
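p
    |  Special cases like these can also be added to the tokenizer at runtime.
    |  As a minimal sketch – the #[code gimme] split here is made up purely
    |  for illustration – a rule maps one string to a sequence of token
    |  attributes:

+code.
    from spacy.lang.en import English
    from spacy.symbols import ORTH

    nlp = English()
    # make the tokenizer split "gimme" into two tokens, "gim" and "me"
    nlp.tokenizer.add_special_case(u"gimme", [{ORTH: u"gim"}, {ORTH: u"me"}])
    print([token.text for token in nlp(u"gimme that")])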