mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
		
						commit
						b5247c49eb
					
				| 
						 | 
					@ -10,3 +10,4 @@ six
 | 
				
			||||||
ujson>=1.35
 | 
					ujson>=1.35
 | 
				
			||||||
cloudpickle
 | 
					cloudpickle
 | 
				
			||||||
sputnik>=0.9.2,<0.10.0
 | 
					sputnik>=0.9.2,<0.10.0
 | 
				
			||||||
 | 
					dill>=0.2,<0.3
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										3
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										3
									
								
								setup.py
									
									
									
									
									
								
							| 
						 | 
					@ -241,7 +241,8 @@ def setup_package():
 | 
				
			||||||
                'cloudpickle',
 | 
					                'cloudpickle',
 | 
				
			||||||
                'pathlib',
 | 
					                'pathlib',
 | 
				
			||||||
                'sputnik>=0.9.2,<0.10.0',
 | 
					                'sputnik>=0.9.2,<0.10.0',
 | 
				
			||||||
                'ujson>=1.35'],
 | 
					                'ujson>=1.35',
 | 
				
			||||||
 | 
					                'dill>=0.2,<0.3'],
 | 
				
			||||||
            classifiers=[
 | 
					            classifiers=[
 | 
				
			||||||
                'Development Status :: 5 - Production/Stable',
 | 
					                'Development Status :: 5 - Production/Stable',
 | 
				
			||||||
                'Environment :: Console',
 | 
					                'Environment :: Console',
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -4,9 +4,15 @@ from __future__ import unicode_literals
 | 
				
			||||||
import pytest
 | 
					import pytest
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@pytest.mark.xfail
 | 
					 | 
				
			||||||
@pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"])
 | 
					@pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"])
 | 
				
			||||||
def test_issue792(en_tokenizer, text):
 | 
					def test_issue792(en_tokenizer, text):
 | 
				
			||||||
    """Test for Issue #792: Trailing whitespace is removed after parsing."""
 | 
					    """Test for Issue #792: Trailing whitespace is removed after tokenization."""
 | 
				
			||||||
    doc = en_tokenizer(text)
 | 
					    doc = en_tokenizer(text)
 | 
				
			||||||
    assert doc.text_with_ws == text
 | 
					    assert ''.join([token.text_with_ws for token in doc]) == text
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@pytest.mark.parametrize('text', ["This is a string", "This is a string\n"])
 | 
				
			||||||
 | 
					def test_control_issue792(en_tokenizer, text):
 | 
				
			||||||
 | 
					    """Test base case for Issue #792: Non-trailing whitespace"""
 | 
				
			||||||
 | 
					    doc = en_tokenizer(text)
 | 
				
			||||||
 | 
					    assert ''.join([token.text_with_ws for token in doc]) == text
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										12
									
								
								spacy/tests/regression/test_issue859.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										12
									
								
								spacy/tests/regression/test_issue859.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,12 @@
 | 
				
			||||||
 | 
					# encoding: utf8
 | 
				
			||||||
 | 
					from __future__ import unicode_literals
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import pytest
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@pytest.mark.parametrize('text', ["aaabbb@ccc.com\nThank you!",
 | 
				
			||||||
 | 
					                                  "aaabbb@ccc.com \nThank you!"])
 | 
				
			||||||
 | 
					def test_issue859(en_tokenizer, text):
 | 
				
			||||||
 | 
					    """Test that no extra space is added in doc.text method."""
 | 
				
			||||||
 | 
					    doc = en_tokenizer(text)
 | 
				
			||||||
 | 
					    assert doc.text == text
 | 
				
			||||||
| 
						 | 
					@ -163,7 +163,6 @@ cdef class Tokenizer:
 | 
				
			||||||
                    start = i
 | 
					                    start = i
 | 
				
			||||||
                in_ws = not in_ws
 | 
					                in_ws = not in_ws
 | 
				
			||||||
            i += 1
 | 
					            i += 1
 | 
				
			||||||
        i += 1
 | 
					 | 
				
			||||||
        if start < i:
 | 
					        if start < i:
 | 
				
			||||||
            span = string[start:]
 | 
					            span = string[start:]
 | 
				
			||||||
            key = hash_string(span)
 | 
					            key = hash_string(span)
 | 
				
			||||||
| 
						 | 
					@ -275,7 +274,10 @@ cdef class Tokenizer:
 | 
				
			||||||
            if cache_hit:
 | 
					            if cache_hit:
 | 
				
			||||||
                pass
 | 
					                pass
 | 
				
			||||||
            elif self.token_match and self.token_match(string): 
 | 
					            elif self.token_match and self.token_match(string): 
 | 
				
			||||||
                tokens.push_back(self.vocab.get(tokens.mem, string), not suffixes.size())
 | 
					                # We're always saying 'no' to spaces here -- the caller will
 | 
				
			||||||
 | 
					                # fix up the outermost one, with reference to the original.
 | 
				
			||||||
 | 
					                # See Issue #859
 | 
				
			||||||
 | 
					                tokens.push_back(self.vocab.get(tokens.mem, string), False)
 | 
				
			||||||
            else:
 | 
					            else:
 | 
				
			||||||
                matches = self.find_infix(string)
 | 
					                matches = self.find_infix(string)
 | 
				
			||||||
                if not matches:
 | 
					                if not matches:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user