reduce memory load when reading all vectors from file (#6945)

* reduce memory load when reading all vectors from file

* one more small typo fix
This commit is contained in:
Sofie Van Landeghem 2021-02-07 01:05:43 +01:00 committed by GitHub
parent a323ef90df
commit 6ed423c16c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 12 additions and 8 deletions

View File

@ -451,7 +451,7 @@ cdef class Lexeme:
Lexeme.c_set_flag(self.c, IS_QUOTE, x) Lexeme.c_set_flag(self.c, IS_QUOTE, x)
property is_left_punct: property is_left_punct:
"""RETURNS (bool): Whether the lexeme is left punctuation, e.g. ).""" """RETURNS (bool): Whether the lexeme is left punctuation, e.g. (."""
def __get__(self): def __get__(self):
return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT) return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)

View File

@ -215,8 +215,7 @@ def convert_vectors(
def read_vectors(vectors_loc: Path, truncate_vectors: int): def read_vectors(vectors_loc: Path, truncate_vectors: int):
f = open_file(vectors_loc) f = ensure_shape(vectors_loc)
f = ensure_shape(f)
shape = tuple(int(size) for size in next(f).split()) shape = tuple(int(size) for size in next(f).split())
if truncate_vectors >= 1: if truncate_vectors >= 1:
shape = (truncate_vectors, shape[1]) shape = (truncate_vectors, shape[1])
@ -251,11 +250,12 @@ def open_file(loc: Union[str, Path]) -> IO:
return loc.open("r", encoding="utf8") return loc.open("r", encoding="utf8")
def ensure_shape(lines): def ensure_shape(vectors_loc):
"""Ensure that the first line of the data is the vectors shape. """Ensure that the first line of the data is the vectors shape.
If it's not, we read in the data and output the shape as the first result, If it's not, we read in the data and output the shape as the first result,
so that the reader doesn't have to deal with the problem. so that the reader doesn't have to deal with the problem.
""" """
lines = open_file(vectors_loc)
first_line = next(lines) first_line = next(lines)
try: try:
shape = tuple(int(size) for size in first_line.split()) shape = tuple(int(size) for size in first_line.split())
@ -269,7 +269,11 @@ def ensure_shape(lines):
# Figure out the shape, make it the first value, and then give the # Figure out the shape, make it the first value, and then give the
# rest of the data. # rest of the data.
width = len(first_line.split()) - 1 width = len(first_line.split()) - 1
captured = [first_line] + list(lines) length = 1
length = len(captured) for _ in lines:
length += 1
yield f"{length} {width}" yield f"{length} {width}"
yield from captured # Reading the lines in again from file. This is to avoid having to
# store all the results in a list in memory
lines2 = open_file(vectors_loc)
yield from lines2

View File

@ -727,7 +727,7 @@ capitalization by including a mix of capitalized and lowercase examples. See the
Create a data augmentation callback that uses orth-variant replacement. The Create a data augmentation callback that uses orth-variant replacement. The
callback can be added to a corpus or other data iterator during training. It's callback can be added to a corpus or other data iterator during training. It's
is especially useful for punctuation and case replacement, to help generalize especially useful for punctuation and case replacement, to help generalize
beyond corpora that don't have smart quotes, or only have smart quotes etc. beyond corpora that don't have smart quotes, or only have smart quotes etc.
| Name | Description | | Name | Description |