mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	facilitate larger training files (#4827)
* add a warning for large training files and change the `start` variable to `long`
* use a `long` type for `file_length`
This commit is contained in:
		
							parent
							
								
									cb4145adc7
								
							
						
					
					
						commit
						732142bf28
					
				| 
						 | 
					@ -105,6 +105,10 @@ class Warnings(object):
 | 
				
			||||||
    W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
 | 
					    W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
 | 
				
			||||||
            "previous components in the pipeline declare that they assign it.")
 | 
					            "previous components in the pipeline declare that they assign it.")
 | 
				
			||||||
    W026 = ("Unable to set all sentence boundaries from dependency parses.")
 | 
					    W026 = ("Unable to set all sentence boundaries from dependency parses.")
 | 
				
			||||||
 | 
					    W027 = ("Found a large training file of {size} bytes. Note that it may "
 | 
				
			||||||
 | 
					            "be more efficient to split your training data into multiple "
 | 
				
			||||||
 | 
					            "smaller JSON files instead.")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@add_codes
 | 
					@add_codes
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -13,7 +13,7 @@ import srsly
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .syntax import nonproj
 | 
					from .syntax import nonproj
 | 
				
			||||||
from .tokens import Doc, Span
 | 
					from .tokens import Doc, Span
 | 
				
			||||||
from .errors import Errors, AlignmentError
 | 
					from .errors import Errors, AlignmentError, user_warning, Warnings
 | 
				
			||||||
from .compat import path2str
 | 
					from .compat import path2str
 | 
				
			||||||
from . import util
 | 
					from . import util
 | 
				
			||||||
from .util import minibatch, itershuffle
 | 
					from .util import minibatch, itershuffle
 | 
				
			||||||
| 
						 | 
					@ -557,12 +557,16 @@ def _json_iterate(loc):
 | 
				
			||||||
    loc = util.ensure_path(loc)
 | 
					    loc = util.ensure_path(loc)
 | 
				
			||||||
    with loc.open("rb") as file_:
 | 
					    with loc.open("rb") as file_:
 | 
				
			||||||
        py_raw = file_.read()
 | 
					        py_raw = file_.read()
 | 
				
			||||||
 | 
					    cdef long file_length = len(py_raw)
 | 
				
			||||||
 | 
					    if file_length > 2 ** 30:
 | 
				
			||||||
 | 
					        user_warning(Warnings.W027.format(size=file_length))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    raw = <char*>py_raw
 | 
					    raw = <char*>py_raw
 | 
				
			||||||
    cdef int square_depth = 0
 | 
					    cdef int square_depth = 0
 | 
				
			||||||
    cdef int curly_depth = 0
 | 
					    cdef int curly_depth = 0
 | 
				
			||||||
    cdef int inside_string = 0
 | 
					    cdef int inside_string = 0
 | 
				
			||||||
    cdef int escape = 0
 | 
					    cdef int escape = 0
 | 
				
			||||||
    cdef int start = -1
 | 
					    cdef long start = -1
 | 
				
			||||||
    cdef char c
 | 
					    cdef char c
 | 
				
			||||||
    cdef char quote = ord('"')
 | 
					    cdef char quote = ord('"')
 | 
				
			||||||
    cdef char backslash = ord("\\")
 | 
					    cdef char backslash = ord("\\")
 | 
				
			||||||
| 
						 | 
					@ -570,7 +574,7 @@ def _json_iterate(loc):
 | 
				
			||||||
    cdef char close_square = ord("]")
 | 
					    cdef char close_square = ord("]")
 | 
				
			||||||
    cdef char open_curly = ord("{")
 | 
					    cdef char open_curly = ord("{")
 | 
				
			||||||
    cdef char close_curly = ord("}")
 | 
					    cdef char close_curly = ord("}")
 | 
				
			||||||
    for i in range(len(py_raw)):
 | 
					    for i in range(file_length):
 | 
				
			||||||
        c = raw[i]
 | 
					        c = raw[i]
 | 
				
			||||||
        if escape:
 | 
					        if escape:
 | 
				
			||||||
            escape = False
 | 
					            escape = False
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user