If you want a pure iterator solution for large strings with constant memory usage:
import itertools
import re
from typing import Iterable


def ngrams_iter(input: str, ngram_size: int, token_regex=r"[^\s]+") -> Iterable[str]:
    """Lazily yield every run of ``ngram_size`` consecutive tokens of ``input``.

    Tokens are the successive non-overlapping matches of ``token_regex``
    (by default, maximal runs of non-whitespace characters).  Each n-gram
    is returned as its tokens joined by a single space.

    Args:
        input: The text to tokenize.
        ngram_size: Number of tokens per n-gram.  Values below 1, or larger
            than the number of tokens in ``input``, produce no n-grams.
        token_regex: Pattern whose matches are the tokens.

    Returns:
        A lazy iterator of n-gram strings, in order of appearance.
    """
    pattern = re.compile(token_regex)

    def tokens() -> Iterable[str]:
        # Each call starts an independent lazy scan over the same string.
        return (match.group(0) for match in pattern.finditer(input))

    # One token stream per position inside the n-gram; the stream for
    # position k skips its first k tokens so that zipping the streams
    # lines up k consecutive tokens.  islice performs the skip lazily.
    staggered = [
        itertools.islice(tokens(), offset, None) for offset in range(ngram_size)
    ]

    # zip stops as soon as the most-advanced stream runs out, which is
    # exactly after the last complete n-gram.
    return map(" ".join, zip(*staggered))
Test:
# Example: materialize all 5-grams of a sentence.  The variable is named
# `text` rather than `input` so the builtin `input` is not shadowed.
text = "If you want a pure iterator solution for large strings with constant memory usage"
list(ngrams_iter(text, 5))
Output:
['If you want a pure', 'you want a pure iterator', 'want a pure iterator solution', 'a pure iterator solution for', 'pure iterator solution for large', 'iterator solution for large strings', 'solution for large strings with', 'for large strings with constant', 'large strings with constant memory', 'strings with constant memory usage']