From 0552383228d462d069a36f2c3699376a75339977 Mon Sep 17 00:00:00 2001
From: James McDermott <jamesmichaelmcdermott@gmail.com>
Date: Tue, 26 Oct 2021 15:37:58 +0100
Subject: [PATCH] Remove the sequence match problem.

---
 grammars/sequence_match.pybnf |  32 -----
 parameters/sequence_match.txt |  24 ----
 src/fitness/sequence_match.py | 230 ----------------------------------
 3 files changed, 286 deletions(-)
 delete mode 100644 grammars/sequence_match.pybnf
 delete mode 100644 parameters/sequence_match.txt
 delete mode 100644 src/fitness/sequence_match.py
diff --git a/grammars/sequence_match.pybnf b/grammars/sequence_match.pybnf
deleted file mode 100644
index f5fe8b88..00000000
--- a/grammars/sequence_match.pybnf
+++ /dev/null
@@ -1,32 +0,0 @@
-<p> ::= global state{::}state = <i>{::}<defp>{::}<callp>
-<defp> ::= def p():{:global state{::}<defadf1>{::}<defadf2>{::}<defhoadf1>{::}<code>{::}yield from hoadf1(<adf>, <item>):}
-<defadf1> ::= def adf1(n):{:global state{::}<code_adf2_n>{::}yield from <seq>:}
-<defadf2> ::= def adf2(n):{:global state{::}<code_n>{::}yield from <seq>:}
-<defhoadf1> ::= def hoadf1(f, n):{:global state{::}<code_f_n>:}
-<callp> ::= XXX_output_XXX=p()
-
-<adf> ::= adf1 | adf2
-
-<seq> ::= [<csitems>] | map(<f_i_i>, <seq>) | range(<i>+1)
-<csitems> ::= <item> | <item>, <csitems>
-<item> ::= state | <i> | <f_i_i>(<item>)
-<f_i_i> ::= succ | pred
-
-#| double(<item>) | sq(<item>)
-
-<code> ::= <stmt> | <stmt>{::}<code>
-<code_n> ::= <stmt_n> | <stmt_n>{::}<code_n>
-<code_x> ::= <stmt_x> | <stmt_x>{::}<code_x>
-<stmt> ::= yield <item> | yield from <seq> | <if> | <for> | state = <item>
-<stmt_x> ::= yield x | state = x | <stmt>
-<stmt_n> ::= yield n | state = n | <stmt>
-<code_adf2_n> ::= <stmt_adf2> | <stmt_n> | <stmt_adf2>{::}<code_adf2_n> | <stmt_n>{::}<code_adf2_n>
-<stmt_adf2> ::= yield from adf2(<item>) | <stmt>
-<code_f_n> ::= yield from f(n) | yield from f(<item>) | yield n
-
-<if> ::= if <cond>:{:<code>:}
-<for> ::= for x in <seq>:{:<code_x>:}
-<i> ::= 0 | 1 | 2 | 3 | 4 | 5 | 6
-
-<cond> ::= (state <op> <item>)
-<op> ::= > | < | ==
diff --git a/parameters/sequence_match.txt b/parameters/sequence_match.txt
deleted file mode 100644
index 90d52b17..00000000
--- a/parameters/sequence_match.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-CACHE:                  True
-CODON_SIZE:             100000
-CROSSOVER:              variable_onepoint
-CROSSOVER_PROBABILITY:  0.75
-DATASET_TRAIN:          Vladislavleva4/Train.txt
-DATASET_TEST:           Vladislavleva4/Test.txt
-DEBUG:                  False
-ERROR_METRIC:           mse
-GENERATIONS:            50
-MAX_GENOME_LENGTH:      500
-GRAMMAR_FILE:           sequence_match.pybnf
-INITIALISATION:         PI_grow
-INVALID_SELECTION:      False
-MAX_INIT_TREE_DEPTH:    10
-MAX_TREE_DEPTH:         17
-MUTATION:               int_flip_per_codon
-POPULATION_SIZE:        500
-FITNESS_FUNCTION:       sequence_match
-REPLACEMENT:            generational
-SELECTION:              tournament
-TARGET:                 "[0, 5, 0, 5, 0, 5]"
-EXTRA_PARAMETERS:       "alpha=0.5, beta=0.5, gamma=0.5"
-TOURNAMENT_SIZE:        2
-VERBOSE:                False
diff --git a/src/fitness/sequence_match.py b/src/fitness/sequence_match.py
deleted file mode 100644
index ef1ac6ae..00000000
--- a/src/fitness/sequence_match.py
+++ /dev/null
@@ -1,230 +0,0 @@
-import dtw  # https://pypi.python.org/pypi/dtw
-import editdistance  # https://pypi.python.org/pypi/editdistance
-import lzstring  # https://pypi.python.org/pypi/lzstring/
-from algorithm.parameters import params
-from fitness.base_ff_classes.base_ff import base_ff
-
-"""
-
-This fitness function is for a sequence-match problem: we're given
-an integer sequence target, say [0, 5, 0, 5, 0, 5], and we try to synthesize a
-program (loops, if-statements, etc) which will *yield* that sequence,
-one item at a time.
-
-There are several components of the fitness:
-
-1. concerning the program:
-    i. length of the program (shorter is better)
-    ii. compressibility of the program (non-compressible, ie DRY, is better)
-
-2. concerning distance from the target:
-    i. dynamic time warping distance from the program's output to the target
-    (lower is better).
-    ii. Levenshtein distance from the program's output to the target
-    (lower is better).
-
-"""
-
-
-# available for use in synthesized programs
-def succ(n, maxv=6):
-    """
-    Available for use in synthesized programs.
-    
-    :param n:
-    :param maxv:
-    :return:
-    """
-
-    return min(n + 1, maxv)
-
-
-def pred(n, minv=0):
-    """
-    Available for use in synthesized programs.
-    
-    :param n:
-    :param minv:
-    :return:
-    """
-
-    return max(n - 1, minv)
-
-
-def truncate(n, g):
-    """
-    the program will yield one item at a time, potentially forever. We only
-    up to n items.
-    
-    :param n:
-    :param g:
-    :return:
-    """
-
-    for i in range(n):
-        yield next(g)
-
-
-def dist(t0, x0):
-    """
-    numerical difference, used as a component in DTW.
-    
-    :param t0:
-    :param x0:
-    :return:
-    """
-
-    return abs(t0 - x0)
-
-
-def dtw_dist(s, t):
-    """
-    Dynamic time warping distance between two sequences.
-    
-    :param s:
-    :param t:
-    :return:
-    """
-
-    s = list(map(int, s))
-    t = list(map(int, t))
-    d, M, C, path = dtw.dtw(s, t, dist)
-
-    return d
-
-
-def lev_dist(s, t):
-    """
-    Levenshtein distance between two sequences, normalised by length of the
-    target -- hence this is *asymmetric*, not really a distance. Don't
-    normalise by length of the longer, because it would encourage evolution
-    to create longer and longer sequences.
-    
-    :param s:
-    :param t:
-    :return:
-    """
-
-    return editdistance.eval(s, t) / len(s)
-
-
-def compress(s):
-    """
-    Convert to a string and compress. lzstring is a special-purpose compressor,
-    more suitable for short strings than typical compressors.
-    
-    :param s:
-    :return:
-    """
-
-    s = ''.join(map(str, s))
-    return lzstring.LZString().compress(s)
-
-
-def compressibility(s):
-    """
-    Compressibility is in [0, 1]. It's high when the compressed string
-    is much shorter than the original.
-    
-    :param s:
-    :return:
-    """
-
-    return 1 - len(compress(s)) / len(s)
-
-
-def proglen(s):
-    """
-    Program length is measured in characters, but in order to keep the values
-    in a similar range to that of compressibility, DTW and Levenshtein, we
-    divide by 100. This is a bit arbitrary.
-    
-    :param s: A string of a program phenotype.
-    :return: The length of the program divided by 100.
-    """
-
-    return len(s) / 100.0
-
-
-class sequence_match(base_ff):
-
-    def __init__(self):
-        """
-        Initialise class instance
-        """
-        # Initialise base fitness function class.
-        super().__init__()
-
-        # --target will be a sequence such as (0, 5, 0, 5)
-        self.target = eval(params['TARGET'])
-
-        # we assume --extra_parameters is a comma-separated kv sequence, eg:
-        # "alpha=0.5, beta=0.5, gamma=0.5"
-        # which we can pass to the dict() constructor
-        extra_fit_params = eval("dict(" + params['EXTRA_PARAMETERS'] + ")")
-        self.alpha = extra_fit_params['alpha']
-        self.beta = extra_fit_params['beta']
-        self.gamma = extra_fit_params['gamma']
-
-    def evaluate(self, ind, **kwargs):
-        """
-        ind.phenotype will be a string incl fn defns etc. when we exec it
-        will create a value XXX_output_XXX, but we exec inside an empty dict
-        for safety. But we put a couple of useful primitives in the dict too.
-        
-        :param ind:
-        :return:
-        """
-
-        p, d = ind.phenotype, {'pred': pred, 'succ': succ}
-        exec(p, d)
-
-        # this is the program's output: a generator
-        s = d['XXX_output_XXX']
-
-        # Truncate the generator and convert to list
-        s = list(truncate(len(self.target), s))
-
-        # Set target
-        t = self.target
-
-        # various weightings of four aspects of our fitness. the formula is:
-        # fitness = gamma * dist + (1 - gamma) * length
-        # where dist = alpha * lev_dist(t, s) + (1 - alpha) * dtw_dist(t, s)
-        # and length = beta * proglen(t) + (1 - beta) * compressibility(t)
-        # but when any of alpha, beta and gamma is 0 or 1, we can save some
-        # calculation:
-
-        if self.gamma > 0.0:
-            if self.alpha > 0.0:
-                lev_dist_v = lev_dist(t, s)
-            else:
-                lev_dist_v = 0.0
-            if self.alpha < 1.0:
-                dtw_dist_v = dtw_dist(t, s)
-            else:
-                dtw_dist_v = 0.0
-            dist_v = self.alpha * lev_dist_v + (1 - self.alpha) * dtw_dist_v
-        else:
-            dist_v = 0.0
-
-        if self.gamma < 1.0:
-            if self.beta > 0.0:
-                proglen_v = proglen(p)
-            else:
-                proglen_v = 0.0
-            if self.beta < 1.0:
-                compressibility_v = compressibility(p)
-            else:
-                compressibility_v = 0.0
-            length_v = self.beta * proglen_v + (1 - self.beta) * \
-                       compressibility_v
-        else:
-            length_v = 0.0
-
-        return self.gamma * dist_v + (1 - self.gamma) * length_v
-
-
-if __name__ == "__main__":
-    # TODO write some tests here
-    pass