mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 13:11:03 +03:00 
			
		
		
		
	Merge branch 'master' of https://github.com/explosion/spaCy
This commit is contained in:
		
						commit
						438050643c
					
				|  | @ -341,8 +341,12 @@ p | |||
|     |  for your custom #[code train] command while still being able to easily | ||||
|     |  tweak the hyperparameters. For example: | ||||
| 
 | ||||
| +code(false, "bash"). | ||||
|     parser_hidden_depth=2 parser_maxout_pieces=1 train-parser | ||||
| +code(false, "bash", "$"). | ||||
|     parser_hidden_depth=2 parser_maxout_pieces=1 spacy train [...] | ||||
| 
 | ||||
| +code("Usage with alias", "bash", "$"). | ||||
|     alias train-parser="spacy train en /output /data /train /dev -n 1000" | ||||
|     parser_maxout_pieces=1 train-parser | ||||
| 
 | ||||
| +table(["Name", "Description", "Default"]) | ||||
|     +row | ||||
|  |  | |||
|  | @ -305,6 +305,54 @@ p | |||
|             |  A list of #[code (match_id, start, end)] tuples, describing the | ||||
|             |  matches. A match tuple describes a span #[code doc[start:end]]. | ||||
| 
 | ||||
| +h(3, "regex") Using regular expressions | ||||
| 
 | ||||
| p | ||||
|     |  In some cases, only matching tokens and token attributes isn't enough – | ||||
|     |  for example, you might want to match different spellings of a word, | ||||
|     |  without having to add a new pattern for each spelling. A simple solution | ||||
|     |  is to match a regular expression on the #[code Doc]'s #[code text] and | ||||
|     |  use the #[+api("doc#char_span") #[code Doc.char_span]] method to | ||||
|     |  create a #[code Span] from the character indices of the match: | ||||
| 
 | ||||
| +code. | ||||
|     import spacy | ||||
|     import re | ||||
| 
 | ||||
|     nlp = spacy.load('en') | ||||
|     doc = nlp(u'The spelling is "definitely", not "definately" or "deffinitely".') | ||||
| 
 | ||||
|     DEFINITELY_PATTERN = re.compile(r'deff?in[ia]tely') | ||||
| 
 | ||||
|     for match in re.finditer(DEFINITELY_PATTERN, doc.text): | ||||
|         start, end = match.span()         # get matched indices | ||||
|         span = doc.char_span(start, end)  # create Span from indices | ||||
| 
 | ||||
| p | ||||
|     |  You can also use the regular expression with spaCy's #[code Matcher] by | ||||
|     |  converting it to a token flag. To ensure efficiency, the | ||||
|     |  #[code Matcher] can only access the C-level data. This means that it can | ||||
|     |  either use built-in token attributes or #[strong binary flags]. | ||||
|     |  #[+api("vocab#add_flag") #[code Vocab.add_flag]] returns a flag ID which | ||||
|     |  you can use as a key of a token match pattern. Tokens that match the | ||||
|     |  regular expression will return #[code True] for the #[code IS_DEFINITELY] | ||||
|     |  flag. | ||||
| 
 | ||||
| +code. | ||||
|     IS_DEFINITELY = nlp.vocab.add_flag(re.compile(r'deff?in[ia]tely').match) | ||||
| 
 | ||||
|     matcher = Matcher(nlp.vocab) | ||||
|     matcher.add('DEFINITELY', None, [{IS_DEFINITELY: True}]) | ||||
| 
 | ||||
| p | ||||
|     |  Providing the regular expressions as binary flags also lets you use them | ||||
|     |  in combination with other token patterns – for example, to match the | ||||
|     |  word "definitely" in various spellings, followed by a case-insensitive | ||||
|     |  "not" and an adjective: | ||||
| 
 | ||||
| +code. | ||||
|     [{IS_DEFINITELY: True}, {'LOWER': 'not'}, {'POS': 'ADJ'}] | ||||
| 
 | ||||
| +h(3, "example1") Example: Using linguistic annotations | ||||
| 
 | ||||
| p | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user