{ "cells": [ { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# GENETIC ALGORITHM for solving linear regression:\n", "# we want to solve: y = a(x1) + b(x2) + c(x3) + d --> find best value for paramter a,b,c,d\n", "#\n", "# Source:https://medium.com/the-andela-way/on-genetic-algorithms-and-their-application-in-solving-regression-problems-4e37ac1115d5\n", "# https://medium.com/the-andela-way/on-genetic-algorithms-and-their-application-in-solving-regression-problems-4e37ac1115d5\n", "# # rewritten by D.Adytia (didit.adytia@gmail.com) ver. 2021-04-20\n", "\n", "from random import random, sample, choice\n", "from math import floor\n", "from tqdm import tqdm # instantly make your loops show a smart progress meter\n", "from numpy import array, dot, mean\n", "from numpy.linalg import pinv\n", "from sys import exit" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def generate_data():\n", " \"\"\"\n", " We will generate data with a clear pattern.\n", " This ensures we have an idea of the desired result.\n", " This is only for demonstration purposes, real data is needed in practice.\n", " \"\"\"\n", " coeff = [0.4, -0.3, 0.2, -0.1]\n", " x = [[random() for j in range(len(coeff))] for i in range(1000)] # generate random data set with range of 1000\n", " y = [dot(i, coeff) for i in x]\n", " return array(x), array(y)\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "def multiple_linear_regression(inputs, outputs):\n", " \"\"\"\n", " Get the best expected outcome.\n", " This is expected to equal the coefficients in generate_data().\n", " \"\"\"\n", " X, Y = array(inputs), array(outputs)\n", " X_t, Y_t = X.transpose(), Y.transpose()\n", " coeff = dot((pinv((dot(X_t, X)))), (dot(X_t, Y)))\n", " Y_p = dot(X, coeff)\n", " Y_mean = mean(Y)\n", " SST = array([(i - Y_mean) ** 2 for i in Y]).sum() # SST: the total error in a model, it is the sum of all deviations squared\n", " SSR = array([(i - j) ** 2 for i, j in zip(Y, Y_p)]).sum() # SSR: a measure of the explained variation in SST.\n", " COD = (1 - (SSR / SST)) * 100.0 # COD: stands for ‘coefficient of determination’ which is basically a measure of how good a model is.\n", " av_error = (SSR / len(Y))\n", " return {'COD': COD, 'coeff': coeff, 'error': av_error} # error: the average error, is an average of all deviations from expected values" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "def check_termination_condition(best_individual):\n", " \"\"\"\n", " Check if the current_best_individual is better of equal to the expected.\n", " \"\"\"\n", " if ((best_individual['COD'] >= 99.5)\n", " or (generation_count == max_generations)):\n", " return True\n", " else:\n", " return False" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "def create_individual(individual_size):\n", " \"\"\"\n", " Create an individual.\n", " \"\"\"\n", " return [random() for i in range(individual_size)] # To create an initial individual, we will use random assigning of variables.\n", "\n", "\n", "def create_population(individual_size, population_size):\n", " \"\"\"\n", " Create an initial population.\n", " \"\"\"\n", " return [create_individual(individual_size) for i in range(population_size)]\n", "\n", "\n", "def get_fitness(individual, inputs):\n", " \"\"\"\n", " Calculate the fitness of an individual.\n", " Return the Coefficient of Determination, average error and weight.\n", " We use the error to get the best individual.\n", " \"\"\"\n", " predicted_outputs = dot(array(inputs), array(individual))\n", " output_mean = mean(outputs)\n", " SST = array(\n", " [(i - output_mean) ** 2 for i in outputs]).sum()\n", " SSR = array(\n", " [(i - j) ** 2 for i, j in zip(outputs, predicted_outputs)]).sum()\n", " COD = (1 - (SSR / SST)) * 100.0\n", " av_error = (SSR / len(outputs))\n", " return {'COD': COD, 'error': av_error, 'coeff': individual}\n", "\n", "\n", "def evaluate_population(population):\n", " \"\"\"\n", " Evaluate a population of individuals and return the best among them.\n", " \"\"\"\n", " fitness_list = [get_fitness(individual, inputs)\n", " for individual in tqdm(population)]\n", " error_list = sorted(fitness_list, key=lambda i: i['error'])\n", " best_individuals = error_list[: selection_size]\n", " best_individuals_stash.append(best_individuals[0]['coeff'])\n", " print('Error: ', best_individuals[0]['error'],\n", " 'COD: ', best_individuals[0]['COD'])\n", " return best_individuals\n", "\n", "\n", "def crossover(parent_1, parent_2):\n", " \"\"\"\n", " Return offspring given two parents.\n", " Unlike real scenarios, genes in the chromosomes aren't necessarily linked.\n", " \"\"\"\n", " child = {}\n", " loci = [i for i in range(0, individual_size)]\n", " loci_1 = sample(loci, floor(0.5*(individual_size)))\n", " loci_2 = [i for i in loci if i not in loci_1]\n", " chromosome_1 = [[i, parent_1['coeff'][i]] for i in loci_1]\n", " chromosome_2 = [[i, parent_2['coeff'][i]] for i in loci_2]\n", " child.update({key: value for (key, value) in chromosome_1})\n", " child.update({key: value for (key, value) in chromosome_2})\n", " return [child[i] for i in loci]\n", "\n", "\n", "def mutate(individual):\n", " \"\"\"\n", " Mutate an individual.\n", " The gene transform decides whether we'll add or deduct a random value.\n", " \"\"\"\n", " loci = [i for i in range(0, individual_size)]\n", " no_of_genes_mutated = floor(probability_of_gene_mutating*individual_size)\n", " loci_to_mutate = sample(loci, no_of_genes_mutated)\n", " for locus in loci_to_mutate:\n", " gene_transform = choice([-1, 1])\n", " change = gene_transform*random()\n", " individual[locus] = individual[locus] + change\n", " return individual\n", "\n", "\n", "def get_new_generation(selected_individuals):\n", " \"\"\"\n", " Given selected individuals, create a new population by mating them.\n", " Here we also apply variation operations like mutation and crossover.\n", " \"\"\"\n", " parent_pairs = [sample(selected_individuals, 2)\n", " for i in range(population_size)]\n", " offspring = [crossover(pair[0], pair[1]) for pair in parent_pairs]\n", " offspring_indices = [i for i in range(population_size)]\n", " offspring_to_mutate = sample(\n", " offspring_indices,\n", " floor(probability_of_individual_mutating*population_size)\n", " )\n", " mutated_offspring = [[i, mutate(offspring[i])]\n", " for i in offspring_to_mutate]\n", " for child in mutated_offspring:\n", " offspring[child[0]] = child[1]\n", " return offspring" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Generation: 0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 1000/1000 [00:01<00:00, 923.36it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error: 0.03774250830847626 COD: -49.23229538617549\n", "Generation: 1\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 1000/1000 [00:01<00:00, 931.97it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error: 0.020264935863856425 COD: 19.87329326128151\n", "Generation: 2\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 1000/1000 [00:01<00:00, 920.81it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error: 0.002643499137637879 COD: 89.54771524624717\n", "Generation: 3\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 1000/1000 [00:01<00:00, 922.51it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error: 0.003220560343716587 COD: 87.26603943239829\n", "Generation: 4\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 1000/1000 [00:01<00:00, 926.78it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error: 0.0016668891254532205 COD: 93.40919028718149\n", "Generation: 5\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 1000/1000 [00:01<00:00, 919.12it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error: 0.00017272713669576155 COD: 99.3170441435976\n", "Generation: 6\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 1000/1000 [00:01<00:00, 920.81it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error: 0.00017272713669576155 COD: 99.3170441435976\n", "Generation: 7\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 1000/1000 [00:01<00:00, 919.97it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error: 0.00013114088204025252 COD: 99.48147445087957\n", "Generation: 8\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 1000/1000 [00:01<00:00, 914.91it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error: 5.190357223292297e-05 COD: 99.79477545160077\n", "Generation: 9\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 1000/1000 [00:01<00:00, 909.92it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error: 5.190357223292297e-05 COD: 99.79477545160077\n", "{'COD': 99.79477545160077, 'error': 5.190357223292297e-05, 'coeff': [0.3893265359854301, -0.3029543071275922, 0.2140308237184303, -0.10862250890847935]}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "# MAIN CODE\n", "# Initialization\n", "inputs, outputs = generate_data()\n", "individual_size = len(inputs[0])\n", "population_size = 1000\n", "selection_size = floor(0.1*population_size)\n", "max_generations = 50\n", "probability_of_individual_mutating = 0.1\n", "probability_of_gene_mutating = 0.25\n", "best_possible = multiple_linear_regression(inputs, outputs)\n", "best_individuals_stash = [create_individual(individual_size)]\n", "initial_population = create_population(individual_size, 1000)\n", "current_population = initial_population\n", "termination = False\n", "generation_count = 0\n", "while termination is False:\n", " current_best_individual = get_fitness(best_individuals_stash[-1], inputs)\n", " print('Generation: ', generation_count)\n", " best_individuals = evaluate_population(current_population)\n", " current_population = get_new_generation(best_individuals)\n", " termination = check_termination_condition(current_best_individual)\n", " generation_count += 1\n", "else:\n", " print(get_fitness(best_individuals_stash[-1], inputs))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.13" } }, "nbformat": 4, "nbformat_minor": 4 }