mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 18:07:26 +03:00 
			
		
		
		
	* Add orth features
This commit is contained in:
		
							parent
							
								
									6209d94f83
								
							
						
					
					
						commit
						3e3ff99ca0
					
				
							
								
								
									
										71
									
								
								spacy/orth.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										71
									
								
								spacy/orth.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,71 @@
 | 
				
			||||||
 | 
					# Binary string features
 | 
				
			||||||
 | 
					def is_alpha(string, prob, case_stats, tag_stats):
					    """Binary feature placeholder: always returns False.

					    All four arguments are accepted but not inspected yet.
					    NOTE(review): presumably meant to test whether `string` is alphabetic;
					    confirm the intended semantics before implementing.
					    """
					    return False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def is_digit(string, prob, case_stats, tag_stats):
					    """Binary feature placeholder: always returns False.

					    All four arguments are accepted but not inspected yet.
					    NOTE(review): presumably meant to test whether `string` is numeric;
					    confirm the intended semantics before implementing.
					    """
					    return False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def is_punct(string, prob, case_stats, tag_stats):
					    """Binary feature placeholder: always returns False.

					    All four arguments are accepted but not inspected yet.
					    NOTE(review): presumably meant to test whether `string` is punctuation;
					    confirm the intended semantics before implementing.
					    """
					    return False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def is_space(string, prob, case_stats, tag_stats):
					    """Binary feature placeholder: always returns False.

					    All four arguments are accepted but not inspected yet.
					    NOTE(review): presumably meant to test whether `string` is whitespace;
					    confirm the intended semantics before implementing.
					    """
					    return False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def is_ascii(string, prob, case_stats, tag_stats):
					    """Binary feature placeholder: always returns False.

					    All four arguments are accepted but not inspected yet.
					    NOTE(review): presumably meant to test whether `string` is pure ASCII;
					    confirm the intended semantics before implementing.
					    """
					    return False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def is_title(string, prob, case_stats, tag_stats):
					    """Binary feature placeholder: always returns False.

					    All four arguments are accepted but not inspected yet.
					    NOTE(review): presumably meant to test whether `string` is title-cased;
					    confirm the intended semantics before implementing.
					    """
					    return False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def is_lower(string, prob, case_stats, tag_stats):
					    """Binary feature placeholder: always returns False.

					    All four arguments are accepted but not inspected yet.
					    NOTE(review): presumably meant to test whether `string` is lower-cased;
					    confirm the intended semantics before implementing.
					    """
					    return False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def is_upper(string, prob, case_stats, tag_stats):
					    """Binary feature placeholder: always returns False.

					    All four arguments are accepted but not inspected yet.
					    NOTE(review): presumably meant to test whether `string` is upper-cased;
					    confirm the intended semantics before implementing.
					    """
					    return False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Statistics features
 | 
				
			||||||
 | 
					def oft_case(name, thresh):
					    """Build a statistics feature function (placeholder implementation).

					    `name` and `thresh` are accepted but not used yet. The returned callable
					    takes `(string, prob, case_stats, tag_stats)` and currently hands back
					    `string` unchanged.
					    NOTE(review): presumably the finished feature compares a case statistic
					    selected by `name` against `thresh`; confirm before implementing.
					    """
					    def wrapped(string, prob, case_stats, tag_stats):
					        # Placeholder: no statistics are consulted yet.
					        return string
					    return wrapped
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def can_tag(name, thresh):
					    """Build a statistics feature function (placeholder implementation).

					    `name` and `thresh` are accepted but not used yet. The returned callable
					    takes `(string, prob, case_stats, tag_stats)` and currently hands back
					    `string` unchanged.
					    NOTE(review): presumably the finished feature compares a tag statistic
					    selected by `name` against `thresh`; confirm before implementing.
					    """
					    def wrapped(string, prob, case_stats, tag_stats):
					        # Placeholder: no statistics are consulted yet.
					        return string
					    return wrapped
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# String features
 | 
				
			||||||
 | 
					def canon_case(string, prob, cluster, case_stats, tag_stats):
					    """String feature placeholder: returns `string` unchanged.

					    `prob`, `cluster`, `case_stats` and `tag_stats` are accepted but not
					    used yet.
					    NOTE(review): presumably meant to return the most common casing of the
					    word; confirm the intended semantics before implementing.
					    """
					    return string
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def word_shape(string, *args):
					    """Return a coarse orthographic shape for `string`.

					    Each character is mapped to a shape character: alphabetic characters
					    become "X" when upper case and "x" otherwise, digits become "d", and
					    every other character is kept as-is. A run of identical shape characters
					    contributes at most three copies to the result, so "Hello" gives "Xxxx"
					    and "12345" gives "ddd".

					    Any extra positional arguments are accepted and ignored.
					    """
					    pieces = []
					    last = ""
					    run = 0  # how many times `last` has repeated since it first appeared
					    for c in string:
					        if c.isalpha():
					            shape_char = "X" if c.isupper() else "x"
					        elif c.isdigit():
					            shape_char = "d"
					        else:
					            shape_char = c
					        if shape_char == last:
					            run += 1
					        else:
					            run = 0
					            last = shape_char
					        # run counts 0, 1, 2 for the first three of a kind; drop the rest.
					        if run < 3:
					            pieces.append(shape_char)
					    return "".join(pieces)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def non_sparse(string, prob, cluster, case_stats, tag_stats):
					    """String feature placeholder: returns `string` unchanged.

					    `prob`, `cluster`, `case_stats` and `tag_stats` are accepted but not
					    used yet.
					    NOTE(review): presumably meant to back off to a less sparse
					    representation for rare words; confirm before implementing.
					    """
					    return string
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user