Merge branch 'develop' of https://github.com/explosion/spaCy into develop

commit b000fca8f8
@@ -42,6 +42,7 @@ p
        +item #[+a("#tokenizer-exceptions") Tokenizer exceptions]
        +item #[+a("#norm-exceptions") Norm exceptions]
        +item #[+a("#lex-attrs") Lexical attributes]
        +item #[+a("#syntax-iterators") Syntax iterators]
        +item #[+a("#lemmatizer") Lemmatizer]
        +item #[+a("#tag-map") Tag map]
        +item #[+a("#morph-rules") Morph rules]
@@ -104,6 +105,13 @@ p
        +cell dict
        +cell Attribute ID mapped to function.

    +row
        +cell #[code SYNTAX_ITERATORS]
        +cell dict
        +cell
            |  Iterator ID mapped to function. Currently only supports
            |  #[code 'noun_chunks'].

    +row
        +cell #[code LOOKUP]
        +cell dict
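To make the SYNTAX_ITERATORS entry above concrete, here is a minimal sketch of what the variable could look like in a language's data module. The module layout and the stub function are illustrative assumptions, not part of this diff.

    # syntax_iterators.py (illustrative): iterator ID mapped to the function
    # that computes it. Only 'noun_chunks' is currently looked up by spaCy.
    def noun_chunks(doclike):
        """Stub: yield (start, end, label) tuples for base noun phrases."""
        return iter(())

    SYNTAX_ITERATORS = {
        'noun_chunks': noun_chunks
    }
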
@@ -341,9 +349,12 @@ p
    |  a token's norm equals its lowercase text. If the lowercase spelling of a
    |  word exists, norms should always be in lowercase.

+aside-code("Accessing norms").
    doc = nlp(u"I can't")
    assert [t.norm_ for t in doc] == ['i', 'can', 'not']
+aside-code("Norms vs. lemmas").
    doc = nlp(u"I'm gonna realise")
    norms = [token.norm_ for token in doc]
    lemmas = [token.lemma_ for token in doc]
    assert norms == ['i', 'am', 'going', 'to', 'realize']
    assert lemmas == ['i', 'be', 'go', 'to', 'realise']

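For context, here is a minimal sketch of how a language's norm exceptions are typically laid out. The entries and the title-casing loop are illustrative assumptions, not taken from this commit.

    # norm_exceptions.py (illustrative): map raw token text to its norm.
    # If a lowercase spelling of a word exists, the norm should be lowercase.
    _exc = {
        "cos": "because",
        "gonna": "going to",
        "realise": "realize",
    }

    # Also register capitalised variants, so "Gonna" receives the same norm.
    NORM_EXCEPTIONS = {}
    for string, norm in _exc.items():
        NORM_EXCEPTIONS[string] = norm
        NORM_EXCEPTIONS[string.title()] = norm
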
p
    |  spaCy usually tries to normalise words with different spellings to a single,
@@ -449,6 +460,33 @@ p
    |  #[code lex_attr_getters.update(LEX_ATTRS)], only the new custom functions
    |  are overwritten.

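To make the update() pattern above concrete, here is a hedged sketch assuming the spaCy 2.x Language.Defaults layout; the like_num helper and its word list are toy examples.

    # lex_attrs.py (illustrative): custom lexical attribute getters.
    from spacy.attrs import LIKE_NUM
    from spacy.language import Language

    _num_words = ['zero', 'one', 'two', 'three', 'ten', 'hundred']

    def like_num(text):
        text = text.replace(',', '').replace('.', '')
        return text.isdigit() or text.lower() in _num_words

    LEX_ATTRS = {LIKE_NUM: like_num}

    # In the language's Defaults: copy the shared getters, then overwrite only
    # the attributes that have a custom implementation for this language.
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
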
+h(3, "syntax-iterators") Syntax iterators

p
    |  Syntax iterators are functions that compute views of a #[code Doc]
    |  object based on its syntax. At the moment, this data is only used for
    |  extracting
    |  #[+a("/docs/usage/dependency-parse#noun-chunks") noun chunks], which
    |  are available as the #[+api("doc#noun_chunks") #[code Doc.noun_chunks]]
    |  property. Because base noun phrases work differently across languages,
    |  the rules to compute them are part of the individual language's data. If
    |  a language does not include a noun chunks iterator, the property won't
    |  be available. For examples, see the existing syntax iterators:

+aside-code("Noun chunks example").
    doc = nlp(u'A phrase with another phrase occurs.')
    chunks = list(doc.noun_chunks)
    assert chunks[0].text == "A phrase"
    assert chunks[1].text == "another phrase"

+table(["Language", "Source"])
    for lang, lang_id in {en: "English", de: "German", es: "Spanish"}
        +row
            +cell=lang
            +cell
                +src(gh("spaCy", "spacy/lang/" + lang_id + "/syntax_iterators.py"))
                    |  lang/#{lang_id}/syntax_iterators.py

+h(3, "lemmatizer") Lemmatizer

p
@@ -604,6 +642,8 @@ p

+h(2, "vocabulary") Building the vocabulary

+under-construction

p
    |  spaCy expects that common words will be cached in a
    |  #[+api("vocab") #[code Vocab]] instance. The vocabulary caches lexical
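A small illustration of the caching behaviour described here, assuming the spaCy 2.x Vocab API and a blank pipeline; the looked-up string is made up.

    import spacy

    nlp = spacy.blank('en')                  # blank pipeline, vocab starts small
    n_before = len(nlp.vocab)
    lexeme = nlp.vocab[u'xyzblorp']          # first lookup creates and caches a Lexeme
    assert len(nlp.vocab) == n_before + 1
    assert nlp.vocab[u'xyzblorp'].text == lexeme.text   # second lookup hits the cache
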
@@ -697,6 +737,8 @@ p

+h(3, "word-vectors") Training the word vectors

+under-construction

p
    |  #[+a("https://en.wikipedia.org/wiki/Word2vec") Word2vec] and related
    |  algorithms let you train useful word similarity models from unlabelled
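Once vectors have been trained with word2vec, Gensim, fastText or similar (not shown here), they can be attached to the vocabulary. This sketch assumes the spaCy 2.x Vocab.set_vector API and uses a random stand-in vector.

    import numpy
    import spacy

    nlp = spacy.blank('en')
    # stand-in for a trained 300-dimensional word vector
    vector = numpy.random.uniform(-1, 1, (300,)).astype('float32')
    nlp.vocab.set_vector(u'apple', vector)
    assert nlp.vocab[u'apple'].has_vector
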
@@ -731,6 +773,8 @@ p

+h(2, "train-tagger-parser") Training the tagger and parser

+under-construction

p
    |  You can now train the model using a corpus for your language annotated
    |  with #[+a("http://universaldependencies.org/") Universal Dependencies].
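A rough sketch of what a training loop could look like, following the pattern of spaCy 2.x example scripts; the sentence, heads and labels are toy values, and a real setup would first convert a Universal Dependencies corpus into this format.

    import random
    import spacy

    # one toy example: token head indices and dependency labels
    TRAIN_DATA = [
        (u'A phrase occurs .', {'heads': [1, 2, 2, 2],
                                'deps': ['det', 'nsubj', 'ROOT', 'punct']}),
    ]

    nlp = spacy.blank('en')                  # assumes spaCy 2.x APIs
    parser = nlp.create_pipe('parser')
    nlp.add_pipe(parser)
    for _, annotations in TRAIN_DATA:
        for dep in annotations['deps']:
            parser.add_label(dep)            # register labels before training

    optimizer = nlp.begin_training()
    for itn in range(20):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            nlp.update([text], [annotations], sgd=optimizer, losses=losses)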