-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcut_up.py
49 lines (36 loc) · 1.17 KB
/
cut_up.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from itertools import chain
from normalizer import Normalizer
from random import Random
def cut_up(corpus: str, seed: int = 1):
    """Rearrange ``corpus`` using the [cut-up literary technique]
    (http://en.wikipedia.org/wiki/Cut-up_technique).

    The corpus is tokenized, the tokens are grouped into randomly sized
    blocks, the blocks are shuffled, and the shuffled tokens are joined back
    into a single string. Both random steps draw from one generator seeded
    with ``seed``, which is what makes the rearrangement repeatable.
    """
    rng = Random(seed)
    symbol_normalizer = Normalizer()

    # Tokenize on whitespace once the normalizer has processed the corpus
    words = symbol_normalizer.normalize_symbol_boundaries(corpus).split()

    # Partition into random blocks first, then put those blocks in random
    # order (the same generator drives both steps, in this order)
    segments = _get_blocks(words, rng)
    rng.shuffle(segments)

    # Flatten the blocks, join with single spaces and undo the normalization
    shuffled_text = " ".join(chain.from_iterable(segments))
    return symbol_normalizer.denormalize_symbol_boundaries(shuffled_text)
def _get_blocks(tokens: list, random: Random) -> list:
    """Chunk the tokens into consecutive blocks of random length.

    Each block length is drawn with ``random.randint(0, len(tokens))``, so a
    block may be empty, and the final block is whatever tokens remain.
    Concatenating the returned blocks in order reproduces ``tokens``.

    Args:
        tokens: The list of tokens to partition.
        random: The random number generator that supplies block lengths. One
            ``randint`` call is made per block, so a seeded generator gives a
            repeatable partition.

    Returns:
        A list of token lists covering ``tokens`` from start to end. An empty
        ``tokens`` yields an empty list.
    """
    tokens_len = len(tokens)
    blocks = []
    pos = 0
    while pos < tokens_len:
        # A draw of 0 produces an empty block and leaves ``pos`` unchanged;
        # the loop only advances on a positive draw. Empty blocks are kept
        # so that the output for a given seed stays the same.
        size = random.randint(0, tokens_len)
        # Slicing clamps an out-of-range end index to the end of the list, so
        # the last (shorter) block needs no exception handling.
        block = tokens[pos:pos + size]
        blocks.append(block)
        pos += len(block)
    return blocks