mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	Implement like_num getter for Dutch (via #1177)
This commit is contained in:
		
							parent
							
								
									5ee10379db
								
							
						
					
					
						commit
						adda08fe14
					
				| 
						 | 
				
			
			@ -2,6 +2,7 @@
 | 
			
		|||
from __future__ import unicode_literals
 | 
			
		||||
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
from .lex_attrs import LEX_ATTRS
 | 
			
		||||
 | 
			
		||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
			
		||||
from ..norm_exceptions import BASE_NORMS
 | 
			
		||||
| 
						 | 
				
			
			@ -12,6 +13,7 @@ from ...util import update_exc, add_lookups
 | 
			
		|||
 | 
			
		||||
class DutchDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters.update(LEX_ATTRS)
 | 
			
		||||
    lex_attr_getters[LANG] = lambda text: 'nl'
 | 
			
		||||
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										36
									
								
								spacy/lang/nl/lex_attrs.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										36
									
								
								spacy/lang/nl/lex_attrs.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,36 @@
 | 
			
		|||
# coding: utf8
 | 
			
		||||
from __future__ import unicode_literals
 | 
			
		||||
 | 
			
		||||
from ...attrs import LIKE_NUM
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Dutch cardinal number words recognised by like_num. Only single-token
# base words are listed; compounds such as "drieëntwintig" are not covered.
_num_words = set("""
nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien
veertien vijftien zestien zeventien achttien negentien twintig dertig veertig
vijftig zestig zeventig tachtig negentig honderd duizend miljoen miljard
biljoen biljard triljoen triljard
""".split())

# Dutch ordinal number words recognised by like_num.
_ordinal_words = set("""
eerste tweede derde vierde vijfde zesde zevende achtste negende tiende elfde
twaalfde dertiende veertiende vijftiende zestiende zeventiende achttiende
negentiende twintigste dertigste veertigste vijftigste zestigste zeventigste
tachtigste negentigste honderdste duizendste miljoenste miljardste biljoenste
biljardste triljoenste triljardste
""".split())


def like_num(text):
    """Return True if `text` looks like a number.

    A token is number-like when, after removing ',' and '.' separators, it is:
      * made up only of digits (e.g. "10", "1.000", "3,5"),
      * a simple fraction of two digit strings (e.g. "1/2"), or
      * a Dutch cardinal or ordinal number word, compared case-insensitively
        (e.g. "twee", "Twee", "derde").

    text (unicode): The token text to check.
    RETURNS (bool): Whether the text resembles a number.
    """
    # Drop thousands/decimal separators so "1.000" and "3,5" count as digits.
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
    if text.count('/') == 1:
        num, denom = text.split('/')
        if num.isdigit() and denom.isdigit():
            return True
    # Lowercase so capitalised words (e.g. sentence-initial "Twee") match.
    word = text.lower()
    return word in _num_words or word in _ordinal_words
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Maps the LIKE_NUM attribute to the Dutch like_num getter defined above.
LEX_ATTRS = {LIKE_NUM: like_num}
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user