mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 05:01:02 +03:00 
			
		
		
		
	Add spacy.attrs.intify_attrs function, to normalize strings in token attribute dictionaries.
This commit is contained in:
		
							parent
							
								
									09f68bc641
								
							
						
					
					
						commit
						53d8ca8f51
					
				|  | @ -86,5 +86,44 @@ IDS = { | ||||||
|     "LANG": LANG, |     "LANG": LANG, | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| # ATTR IDs, in order of the symbol | # ATTR IDs, in order of the symbol | ||||||
| NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])] | NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
try:
    # Python 2: ``basestring`` covers both ``str`` and ``unicode``.
    _STRING_TYPES = basestring
except NameError:
    # Python 3 has no ``basestring``; fall back to ``str``.
    _STRING_TYPES = str


def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
    '''Normalize a dictionary of attributes, converting them to ints.

    Keys that are already ints are kept as they are. Any other key is
    upper-cased and looked up in IDS, so "lemma" and "LEMMA" are equivalent.
    The input dictionary is never modified.

    Arguments:
        stringy_attrs (dict):
            Dictionary keyed by attribute string names (or attribute IDs).
            Values can be ints or strings.

        strings_map (StringStore):
            Defaults to None. If provided, encodes string values into ints
            via strings_map[value]. Non-string values are left untouched.

        _do_deprecated (bool):
            Defaults to False. If True, the legacy keys are translated before
            normalization: "F" -> "ORTH", "L" -> "LEMMA", "pos" -> "TAG", and
            the entries of a nested "morph" dictionary are merged into the
            top level.

    Returns:
        inty_attrs (dict):
            Attributes dictionary with keys and optionally values converted to
            ints.

    Raises:
        KeyError: If a non-int key, once upper-cased, is not present in IDS.
    '''
    if _do_deprecated:
        # Work on a shallow copy so that renaming the deprecated keys does
        # not alter the dictionary owned by the caller.
        stringy_attrs = dict(stringy_attrs)
        for old_name, new_name in (("F", "ORTH"), ("L", "LEMMA"), ("pos", "TAG")):
            if old_name in stringy_attrs:
                stringy_attrs[new_name] = stringy_attrs.pop(old_name)
        if 'morph' in stringy_attrs:
            # Flatten the nested morphology features into the top level.
            for name, value in stringy_attrs.pop('morph').items():
                stringy_attrs[name] = value
    inty_attrs = {}
    for name, value in stringy_attrs.items():
        int_key = name if isinstance(name, int) else IDS[name.upper()]
        if strings_map is not None and isinstance(value, _STRING_TYPES):
            value = strings_map[value]
        inty_attrs[int_key] = value
    return inty_attrs
|  |  | ||||||
							
								
								
									
										32
									
								
								spacy/tests/unit/test_attrs.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										32
									
								
								spacy/tests/unit/test_attrs.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,32 @@ | ||||||
|  | from ...attrs import * | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
def test_key_no_value():
    '''Without a strings_map, only the key is converted; the value stays a string.'''
    expected = {ORTH: "dog"}
    assert intify_attrs({"ORTH": "dog"}) == expected
|  | 
 | ||||||
|  | 
 | ||||||
def test_lower_key():
    '''A lower-case attribute name maps to the same ID as the NORM constant.'''
    expected = {NORM: "dog"}
    assert intify_attrs({"norm": "dog"}) == expected
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
def test_lower_key_value():
    '''With a strings_map, both the lower-case key and the string value become ints.'''
    result = intify_attrs({"lemma": "dog"}, strings_map={'dog': 10})
    expected = {LEMMA: 10}
    assert result == expected
|  | 
 | ||||||
|  | 
 | ||||||
def test_idempotence():
    '''Normalizing an already-normalized dictionary leaves it unchanged.'''
    string_ids = {'dog': 10}
    first_pass = intify_attrs({"lemma": "dog", 'is_alpha': True},
                              strings_map=string_ids)
    second_pass = intify_attrs(first_pass)
    assert second_pass == {LEMMA: 10, IS_ALPHA: True}
|  | 
 | ||||||
|  | 
 | ||||||
def test_do_deprecated():
    '''With _do_deprecated set, the legacy "F" key is treated as ORTH.'''
    legacy_attrs = {"F": "dog", 'is_alpha': True}
    result = intify_attrs(legacy_attrs, strings_map={'dog': 10},
                          _do_deprecated=True)
    expected = {ORTH: 10, IS_ALPHA: True}
    assert result == expected
		Loading…
	
		Reference in New Issue
	
	Block a user