//- mirror of https://github.com/explosion/spaCy.git (synced 2025-10-31 07:57:35 +03:00)
//- file metadata: 59 lines, 2.5 KiB, served as Plaintext
//- 💫 DOCS > USAGE > ADDING LANGUAGES

include ../_includes/_mixins

+aside("Working on spaCy's source")
    |  To add a new language to spaCy, you'll need to
    |  #[strong modify the library's code]. The easiest way to do this is to
    |  clone the #[+src(gh("spaCy")) repository] and #[strong build spaCy from source].
    |  For more information on this, see the #[+a("/usage") installation guide].
    |  Unlike spaCy's core, which is mostly written in Cython, all language
    |  data is stored in regular Python files. This means that you won't have to
    |  rebuild anything in between – you can simply make edits and reload spaCy
    |  to test them.

+grid.o-no-block
    +grid-col("half")
        p
            |  Obviously, there are lots of ways you can organise your code when
            |  you implement your own language data. This guide will focus on
            |  how it's done within spaCy. For full language support, you'll
            |  need to create a #[code Language] subclass, define custom
            |  #[strong language data], like a stop list and tokenizer
            |  exceptions and test the new tokenizer. Once the language is set
            |  up, you can #[strong build the vocabulary], including word
            |  frequencies, Brown clusters and word vectors. Finally, you can
            |  #[strong train the tagger and parser], and save the model to a
            |  directory.

        p
            |  For some languages, you may also want to develop a solution for
            |  lemmatization and morphological analysis.

    +table-of-contents
        +item #[+a("#101") Language data 101]
        +item #[+a("#language-subclass") The Language subclass]
        +item #[+a("#stop-words") Stop words]
        +item #[+a("#tokenizer-exceptions") Tokenizer exceptions]
        +item #[+a("#norm-exceptions") Norm exceptions]
        +item #[+a("#lex-attrs") Lexical attributes]
        +item #[+a("#syntax-iterators") Syntax iterators]
        +item #[+a("#lemmatizer") Lemmatizer]
        +item #[+a("#tag-map") Tag map]
        +item #[+a("#morph-rules") Morph rules]
        +item #[+a("#testing") Testing the language]
        +item #[+a("#training") Training]

+section("language-data")
    +h(2, "language-data") Language data
    include _spacy-101/_language-data
    include _adding-languages/_language-data

+section("testing")
    +h(2, "testing") Testing the new language
    include _adding-languages/_testing

+section("training")
    +h(2, "training") Training a language model
    include _adding-languages/_training