Skip to content

Commit

Permalink
docs: creating tokenizers with TokenizerConfig
Browse files Browse the repository at this point in the history
  • Loading branch information
Natooz committed Jul 4, 2023
1 parent b12d270 commit de4a6ba
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 26 deletions.
50 changes: 34 additions & 16 deletions colab-notebooks/Full_Example_HuggingFace_GPT2_Transformer.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"id": "gOd93yV0sGd2"
Expand Down Expand Up @@ -60,7 +61,7 @@
"from transformers import GPT2LMHeadModel, GPT2Config, Trainer, TrainingArguments, GenerationConfig\n",
"from transformers.data.data_collator import DataCollatorMixin\n",
"from evaluate import load as load_metric\n",
"from miditok import REMI, MIDITokenizer\n",
"from miditok import REMI, MIDITokenizer, TokenizerConfig\n",
"from miditok.constants import CHORD_MAPS\n",
"from miditoolkit import MidiFile\n",
"from tqdm import tqdm\n",
Expand Down Expand Up @@ -136,6 +137,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand All @@ -148,31 +150,44 @@
"metadata": {},
"outputs": [],
"source": [
"# Our parameters\n",
"pitch_range = range(21, 109)\n",
"beat_res = {(0, 4): 8, (4, 12): 4}\n",
"nb_velocities = 32\n",
"additional_tokens = {'Chord': True, 'Rest': True, 'Tempo': True,\n",
" 'rest_range': (2, 8), # (half, 8 beats)\n",
" 'nb_tempos': 32, # nb of tempo bins\n",
" 'tempo_range': (40, 250), # (min, max)\n",
" 'Program': False,\n",
" \"chord_maps\": CHORD_MAPS,\n",
" \"chord_tokens_with_root_note\": True,\n",
" \"chord_unknown\": False}\n",
"special_tokens = [\"PAD\", \"BOS\", \"EOS\"]\n",
"# Our tokenizer's configuration\n",
"PITCH_RANGE = range(21, 109)\n",
"BEAT_RES = {(0, 1): 8, (1, 2): 4, (2, 4): 2, (4, 8): 1}\n",
"NB_VELOCITIES = 24\n",
"SPECIAL_TOKENS = [\"PAD\", \"MASK\", \"BOS\", \"EOS\"]\n",
"USE_CHORDS = False\n",
"USE_RESTS = False\n",
"USE_TEMPOS = True\n",
"USE_TIME_SIGNATURE = False\n",
"USE_PROGRAMS = True\n",
"NB_TEMPOS = 32\n",
"TEMPO_RANGE = (50, 200) # (min_tempo, max_tempo)\n",
"TOKENIZER_PARAMS = {\n",
" \"pitch_range\": PITCH_RANGE,\n",
" \"beat_res\": BEAT_RES,\n",
" \"nb_velocities\": NB_VELOCITIES,\n",
" \"special_tokens\": SPECIAL_TOKENS,\n",
" \"use_chords\": USE_CHORDS,\n",
" \"use_rests\": USE_RESTS,\n",
" \"use_tempos\": USE_TEMPOS,\n",
" \"use_time_signatures\": USE_TIME_SIGNATURE,\n",
" \"use_programs\": USE_PROGRAMS,\n",
" \"nb_tempos\": NB_TEMPOS,\n",
" \"tempo_range\": TEMPO_RANGE,\n",
"}\n",
"config = TokenizerConfig(**TOKENIZER_PARAMS)\n",
"\n",
"# Creates the tokenizer convert MIDIs to tokens\n",
"tokens_path = Path('Maestro_tokens_no_bpe')\n",
"tokenizer = REMI(pitch_range, beat_res, nb_velocities, additional_tokens, special_tokens=special_tokens) # REMI\n",
"tokenizer = REMI(config) # REMI\n",
"midi_paths = list(Path('Maestro').glob('**/*.mid')) + list(Path('Maestro').glob('**/*.midi'))\n",
"tokenizer.tokenize_midi_dataset(midi_paths, tokens_path)\n",
"\n",
"# Learn and apply BPE to data we just tokenized\n",
"tokens_bpe_path = Path('Maestro_tokens_bpe')\n",
"tokens_bpe_path.mkdir(exist_ok=True, parents=True)\n",
"tokenizer.learn_bpe(\n",
" vocab_size=1000,\n",
" vocab_size=10000,\n",
" tokens_paths=list(tokens_path.glob(\"**/*.json\")),\n",
" start_from_empty_voc=False,\n",
")\n",
Expand All @@ -190,6 +205,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -224,6 +240,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -306,6 +323,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down
26 changes: 16 additions & 10 deletions docs/examples.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,27 @@ A basic example showing how to create a tokenizer, with a selection of custom pa

.. code-block:: python
from miditok import REMI # here we choose to use REMI
from miditok import REMI, TokenizerConfig # here we choose to use REMI
from miditok.utils import get_midi_programs
# Our parameters
pitch_range = range(21, 109)
beat_res = {(0, 4): 8, (4, 12): 4}
nb_velocities = 32
additional_tokens = {'Chord': True, 'Rest': False, 'Tempo': True, 'Program': False, 'TimeSignature': False,
'rest_range': (2, 8), # (half, 8 beats)
'nb_tempos': 32, # nb of tempo bins
'tempo_range': (40, 250)} # (min, max)
special_tokens = ["PAD", "BOS", "EOS", "MASK"]
TOKENIZER_PARAMS = {
"pitch_range": range(21, 109),
"beat_res": {(0, 4): 8, (4, 12): 4},
"nb_velocities": 32,
"special_tokens": ["PAD", "BOS", "EOS", "MASK"],
"use_chords": True,
"use_rests": False,
"use_tempos": True,
"use_time_signatures": False,
"use_programs": False,
"nb_tempos": 32, # nb of tempo bins
"tempo_range": (40, 250), # (min, max)
}
config = TokenizerConfig(**TOKENIZER_PARAMS)
# Creates the tokenizer and loads a MIDI
tokenizer = REMI(pitch_range, beat_res, nb_velocities, additional_tokens, special_tokens)
tokenizer = REMI(config)
MIDI - Tokens conversion
-------------------------------
Expand Down

0 comments on commit de4a6ba

Please sign in to comment.