Disable whitespace corruption

This commit is contained in:
Matthew Honnibal 2019-08-29 15:01:58 +02:00
parent 3c1c0ec18e
commit 32842a3cd4

View File

@@ -356,19 +356,19 @@ def add_noise(orig, noise_level):
if random.random() >= noise_level:
return orig
elif type(orig) == list:
corrupted = [_corrupt(word, noise_level) for word in orig]
corrupted = [_corrupt(word, noise_level, replace_space=False) for word in orig]
corrupted = [w for w in corrupted if w]
return corrupted
else:
return "".join(_corrupt(c, noise_level) for c in orig)
return "".join(_corrupt(c, noise_level, replace_space=False) for c in orig)
def _corrupt(c, noise_level):
def _corrupt(c, noise_level, replace_space=False):
if random.random() >= noise_level:
return c
elif c == " ":
elif replace_space and c == " ":
return "\n"
elif c == "\n":
elif replace_space and c == "\n":
return " "
elif c in [".", "'", "!", "?", ","]:
return ""