diff --git a/lsh-fast b/lsh-fast
deleted file mode 160000
index 994e1549030ad71d7164ed559b5cd1ab8733ec1a..0000000000000000000000000000000000000000
--- a/lsh-fast
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 994e1549030ad71d7164ed559b5cd1ab8733ec1a
diff --git a/lsh-fast/.idea/.gitignore b/lsh-fast/.idea/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..73f69e0958611ac6e00bde95641f6699030ad235
--- /dev/null
+++ b/lsh-fast/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
+# Editor-based HTTP Client requests
+/httpRequests/
diff --git a/lsh-fast/.idea/lsh-fast.iml b/lsh-fast/.idea/lsh-fast.iml
new file mode 100644
index 0000000000000000000000000000000000000000..f08604bb65b25149b195f9e9f282f9683a428592
--- /dev/null
+++ b/lsh-fast/.idea/lsh-fast.iml
@@ -0,0 +1,2 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module classpath="CMake" type="CPP_MODULE" version="4" />
\ No newline at end of file
diff --git a/lsh-fast/.idea/misc.xml b/lsh-fast/.idea/misc.xml
new file mode 100644
index 0000000000000000000000000000000000000000..79b3c94830bab93d40d0770f2765540fe24ed423
--- /dev/null
+++ b/lsh-fast/.idea/misc.xml
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="CMakeWorkspace" PROJECT_DIR="$PROJECT_DIR$" />
+</project>
\ No newline at end of file
diff --git a/lsh-fast/.idea/modules.xml b/lsh-fast/.idea/modules.xml
new file mode 100644
index 0000000000000000000000000000000000000000..37d907d1f20c3d4d23effb39574d4d72503a5d93
--- /dev/null
+++ b/lsh-fast/.idea/modules.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/lsh-fast.iml" filepath="$PROJECT_DIR$/.idea/lsh-fast.iml" />
+    </modules>
+  </component>
+</project>
\ No newline at end of file
diff --git a/lsh-fast/.idea/vcs.xml b/lsh-fast/.idea/vcs.xml
new file mode 100644
index 0000000000000000000000000000000000000000..288b36b1efb71c411d5c27a1ea6c08e41a7fed46
--- /dev/null
+++ b/lsh-fast/.idea/vcs.xml
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$/.." vcs="Git" />
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
\ No newline at end of file
diff --git a/lsh-fast/Makefile b/lsh-fast/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..443291b447758d6865f8d8d8645e581d5886e32d
--- /dev/null
+++ b/lsh-fast/Makefile
@@ -0,0 +1,17 @@
+OBJ = clean lsh
+
+CC = g++
+
+all: clean matmult execute
+
+matmult:
+	$(CC) lsh.cpp -o lsh -O3
+
+clean:
+	/bin/rm -rf $(OBJ) core*
+
+veryclean: clean
+	/bin/rn -f *~
+
+execute:
+	./lsh
\ No newline at end of file
diff --git a/lsh-fast/_lsh.cpp b/lsh-fast/_lsh.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b830c25edbbcf80adc18e5822181a30e1b854b8d
--- /dev/null
+++ b/lsh-fast/_lsh.cpp
@@ -0,0 +1,175 @@
+#include <Python.h>
+#include <numpy/arrayobject.h>
+#include "lsh.h"
+#include <math.h>
+
+/* Docstrings */
+static char module_docstring[] = "This module implements fast nearest-neighbor retrieval using LSH.";
+static char lsh_docstring[] = "Calculate the closest neightbours with distances given a query window.";
+
+/* Available functions */
+static PyObject* lsh_lsh(PyObject *self, PyObject *args);
+
+/* Module specification */
+static PyMethodDef module_methods[] = { { "lsh", lsh_lsh, METH_VARARGS, lsh_docstring }, { NULL, NULL, 0, NULL } };
+
+/* Initialize the module */
+#if PY_MAJOR_VERSION >= 3
+#define MOD_ERROR_VAL NULL
+  #define MOD_SUCCESS_VAL(val) val
+  #define MOD_INIT(name) PyMODINIT_FUNC PyInit_##name(void)
+  #define MOD_DEF(ob, name, doc, methods) \
+          static struct PyModuleDef moduledef = { \
+            PyModuleDef_HEAD_INIT, name, doc, -1, methods, }; \
+          ob = PyModule_Create(&moduledef);
+#else
+#define MOD_ERROR_VAL
+#define MOD_SUCCESS_VAL(val)
+#define MOD_INIT(name) void init##name(void)
+#define MOD_DEF(ob, name, doc, methods) \
+          ob = Py_InitModule3(name, methods, doc);
+#endif
+
+MOD_INIT(_lsh)
+{
+    PyObject *m;
+
+    MOD_DEF(m, "_lsh", module_docstring,
+            module_methods)
+
+    if (m == NULL)
+        return MOD_ERROR_VAL;
+
+    import_array();
+
+    return MOD_SUCCESS_VAL(m);
+}
+
+static PyObject* lsh_lsh(PyObject* self, PyObject* args) {
+    PyObject* data_obj = NULL;
+    PyObject* query_obj = NULL;
+    PyObject* hash_obj = NULL;
+    double **** hashFunctions;
+    double ** weights = NULL;
+    double*** data;
+    double ** query;
+    int * candidates;
+    double * distances;
+    int nrOfCandidates;
+    double r;
+    double a;
+    double sd;
+    PyArray_Descr *descr;
+    descr = PyArray_DescrFromType(NPY_DOUBLE);
+    PyArray_Descr *descr_float;
+    descr_float = PyArray_DescrFromType(NPY_FLOAT64);
+
+    /// Parse the input tuple
+    if (!PyArg_ParseTuple(args, "OOddd|O", &data_obj, &query_obj, &r, &a, &sd, &hash_obj)) {
+        return NULL;
+    }
+
+    /// Get the dimensions of the data and query
+    int data_size = (long) PyArray_DIM(data_obj, 0);
+    int channel_size = (long) PyArray_DIM(data_obj, 2);
+    int query_size = (int) PyArray_DIM(query_obj, 0);
+
+    /// Convert data, query and weights to C array
+    npy_intp dims0[3];
+    if (PyArray_AsCArray(&query_obj, (void **)&query, dims0, 2, descr) < 0 || PyArray_AsCArray(&data_obj, (void ***)&data, dims0, 3, descr) < 0) {
+        PyErr_SetString(PyExc_TypeError, "error converting to c array");
+        return NULL;
+    }
+    if (hash_obj != NULL)
+    {
+        printf("Using weights");
+        if (PyArray_AsCArray(&hash_obj, (void **)&weights, dims0, 2, descr) < 0) {
+            PyErr_SetString(PyExc_TypeError, "error converting weights to c array");
+            return NULL;
+        }
+    }
+
+    int K = ceil(log((log(0.5))/log(1-exp(-2*(0.1)*(0.1)*query_size)))/log((1-exp(-2*(0.1)*(0.1)*query_size))/(0.5)));
+    int L = ceil(log(0.05)/(log(1-pow(1-exp(-2*(0.1)*(0.1)*query_size), K))));
+    printf("K: %d\n", K);
+    printf("L: %d\n", L);
+    printf("Dim: %d\n", channel_size);
+
+    /// Initialize output parameters
+    hashFunctions = (double ****)malloc(L*sizeof(double***));
+    for (int l=0;l<L;l++)
+    {
+        hashFunctions[l] = (double ***)malloc(K*sizeof(double**));
+        for (int k=0;k<K;k++)
+        {
+            hashFunctions[l][k] = (double **)malloc(query_size*sizeof(double*));
+            for (int t=0;t<query_size;t++)
+            {
+                hashFunctions[l][k][t] = (double *)malloc(channel_size*sizeof(double));
+            }
+        }
+    }
+
+    int status = lsh(data, data_size, query_size, channel_size, query, L, K, r, a, sd, candidates, distances, hashFunctions, weights, nrOfCandidates);
+    if (status) {
+        PyErr_SetString(PyExc_RuntimeError, "lsh could not allocate memory");
+        return NULL;
+    }
+
+    npy_intp dimscandidates[1] = {nrOfCandidates};
+    printf("Number of candidates: %d\n", nrOfCandidates);
+    npy_intp dims4[4] = {L, K, query_size, channel_size};
+
+//    PyArrayObject* numpy_candidates = (PyArrayObject*)PyArray_SimpleNewFromData(1, dimscandidates, NPY_INT, (void*)&candidates);
+//    PyArrayObject* numpy_distances = (PyArrayObject*)PyArray_SimpleNewFromData(1, dimsdistance, NPY_DOUBLE, (void*)&distances);
+    // https://github.com/suiyun0234/scipy-master/commit/da7dfc7aad8daa7a516e43f4c7001eea7c1a707e
+    // https://github.com/fjean/pymeanshift/commit/1ba90da647342184ea7df378d7f21eba257a51d9
+    PyArrayObject* numpy_candidates = (PyArrayObject*)PyArray_SimpleNew(1, dimscandidates, NPY_INT);
+    memcpy(PyArray_DATA(numpy_candidates), candidates, dimscandidates[0]*sizeof(int));
+    PyArrayObject* numpy_distances = (PyArrayObject*)PyArray_SimpleNew(1, dimscandidates, NPY_DOUBLE);
+    memcpy(PyArray_DATA(numpy_distances), distances, dimscandidates[0]*sizeof(double));
+    PyArrayObject* TEST = (PyArrayObject*)PyArray_SimpleNew(1, dimscandidates, NPY_DOUBLE);
+    memcpy(PyArray_DATA(TEST), distances, dimscandidates[0]*sizeof(double));
+//    PyArrayObject* numpy_hash_functions = (PyArrayObject*)PyArray_SimpleNew(4, dims4, NPY_DOUBLE);
+//    double* numpy_hash_functions_data = (double*)PyArray_DATA(numpy_hash_functions);
+//    for (int l=0;l<L;l++)
+//    {
+//        for (int k=0;k<K;k++)
+//        {
+//            for (int t=0;t<query_size;t++)
+//            {
+//                memcpy(numpy_hash_functions_data, hashFunctions[l][k][t], channel_size*sizeof(double));
+//                numpy_hash_functions_data += channel_size;
+//            }
+//        }
+//    }
+    PyObject* ret = Py_BuildValue("NNN", PyArray_Return(numpy_candidates), PyArray_Return(numpy_distances), PyArray_Return(TEST));// PyArray_Return(numpy_hash_functions));
+//    Py_XDECREF(data_obj);
+//    Py_XDECREF(query_obj);
+//    Py_XDECREF(hash_obj);
+//    Py_XDECREF(weights);
+//    Py_XDECREF(data);
+//    Py_XDECREF(query);
+//    Py_XDECREF(descr);
+//    Py_XDECREF(descr_float);
+//    Py_XDECREF(query);
+//    Py_XDECREF(numpy_candidates);
+//    Py_XDECREF(numpy_distances);
+//    Py_XDECREF(TEST);
+//    free(candidates);
+//    free(distances);
+//    for (int l=0;l<L;l++)
+//    {
+//        for (int k=0;k<K;k++)
+//        {
+//            for (int t=0;t<query_size;t++)
+//            {
+//                free(hashFunctions[l][k][t]);
+//            }
+//            free(hashFunctions[l][k]);
+//        }
+//        free(hashFunctions[l]);
+//    }
+//    free(hashFunctions);
+    return ret;
+}
diff --git a/lsh-fast/_ucrdtw.c b/lsh-fast/_ucrdtw.c
new file mode 100644
index 0000000000000000000000000000000000000000..c8023fd40b8bd8f17a7222e223ef2434794e0a30
--- /dev/null
+++ b/lsh-fast/_ucrdtw.c
@@ -0,0 +1,130 @@
+#include <Python.h>
+#include <numpy/arrayobject.h>
+#include "ucrdtw.h"
+
+/* Docstrings */
+static char module_docstring[] = "This module implements fast nearest-neighbor retrieval under the dynamic time warping (DTW).";
+static char ucrdtw_docstring[] = "Calculate the nearest neighbor of a times series in a larger time series expressed as location and distance, "
+                                 "using the UCR suite optimizations.";
+
+/* Available functions */
+static PyObject* ucrdtw_ucrdtw(PyObject *self, PyObject *args);
+
+/* Module specification */
+static PyMethodDef module_methods[] = { { "ucrdtw", ucrdtw_ucrdtw, METH_VARARGS, ucrdtw_docstring }, { NULL, NULL, 0, NULL } };
+
+/* Initialize the module */
+#if PY_MAJOR_VERSION >= 3
+  #define MOD_ERROR_VAL NULL
+  #define MOD_SUCCESS_VAL(val) val
+  #define MOD_INIT(name) PyMODINIT_FUNC PyInit_##name(void)
+  #define MOD_DEF(ob, name, doc, methods) \
+          static struct PyModuleDef moduledef = { \
+            PyModuleDef_HEAD_INIT, name, doc, -1, methods, }; \
+          ob = PyModule_Create(&moduledef);
+#else
+  #define MOD_ERROR_VAL
+  #define MOD_SUCCESS_VAL(val)
+  #define MOD_INIT(name) void init##name(void)
+  #define MOD_DEF(ob, name, doc, methods) \
+          ob = Py_InitModule3(name, methods, doc);
+#endif
+
+MOD_INIT(_ucrdtw)
+{
+    PyObject *m;
+
+    MOD_DEF(m, "_ucrdtw", module_docstring,
+            module_methods)
+
+    if (m == NULL)
+        return MOD_ERROR_VAL;
+
+    import_array();
+
+    return MOD_SUCCESS_VAL(m);
+}
+/*
+Calculate the nearest neighbor of a times series in a larger time series expressed as location and distance,
+using the UCR suite optimizations.
+
+Arguments:
+data - a list of floats or an ndarray for the time series to process
+query - a list of floats or an ndarray for the time series to search for
+warp_width - Allowed warp width as a fraction of query size
+verbose - Optional boolean to print stats
+*/
+static PyObject* ucrdtw_ucrdtw(PyObject* self, PyObject* args) {
+    PyObject* data_obj = NULL;
+    PyObject* query_obj = NULL;
+    double warp_width = -1;
+    PyObject* verbose_obj = NULL;
+
+    /* Parse the input tuple */
+    if (!PyArg_ParseTuple(args, "OOd|O", &data_obj, &query_obj, &warp_width, &verbose_obj)) {
+        return NULL;
+    }
+
+    /* Interpret the input objects as numpy arrays. */
+    if (!PyList_Check(data_obj) && !PyArray_Check(data_obj)) {
+        PyErr_SetString(PyExc_TypeError, "Data argument must be a list or ndarray");
+        return NULL;
+    }
+//    PyObject* data_array = PyArray_FROM_OTF(data_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
+//    if (data_array == NULL) {
+//        Py_XDECREF(data_array);
+//        PyErr_SetString(PyExc_TypeError, "Data argument must be a list or ndarray");
+//        return NULL;
+//    }
+    double** data;
+    PyArray_Descr *descr;
+    descr = PyArray_DescrFromType(NPY_DOUBLE);
+    npy_intp dims0[3];
+    if (PyArray_AsCArray(&data_obj, (void **)&data, dims0, 2, descr) < 0) {
+        PyErr_SetString(PyExc_TypeError, "error converting to c array");
+        return NULL;
+    }
+
+    if (!PyList_Check(query_obj) && !PyArray_Check(query_obj)) {
+        Py_XDECREF(data);
+        PyErr_SetString(PyExc_TypeError, "Query argument must be a list or ndarray");
+        return NULL;
+    }
+    PyObject* query_array = PyArray_FROM_OTF(query_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
+    if (query_array == NULL) {
+        Py_XDECREF(data);
+        PyErr_SetString(PyExc_TypeError, "Query argument must be a list or ndarray");
+        return NULL;
+    }
+
+
+    /* Get pointers to the data as C-types. */
+    int data_size = (long) PyArray_DIM(data_obj, 0);
+
+    double* query = (double*) PyArray_DATA(query_array);
+    int query_size = (int) PyArray_DIM(query_array, 0);
+
+    int verbose = verbose_obj != NULL ? PyObject_IsTrue(verbose_obj) : 0;
+
+    /* Call the external C function to compute the best DTW location and distance. */
+    long long location = -1;
+    double distance = -1;
+    int status = 0;
+    for (int i=0; i<data_size; i++)
+    {
+        status = ucrdtw(data[i], query_size, query, query_size, warp_width, verbose, &location, &distance);
+    }
+
+    /* Clean up. */
+    Py_XDECREF(data);
+    Py_XDECREF(query_array);
+
+    if (status) {
+        PyErr_SetString(PyExc_RuntimeError, "ucrdtw could not allocate memory");
+        return NULL;
+    }
+
+    /* Build the output tuple */
+    PyObject* ret = Py_BuildValue("Ld", location, distance);
+    return ret;
+}
diff --git a/lsh-fast/cmake-build-debug/CMakeFiles/clion-log.txt b/lsh-fast/cmake-build-debug/CMakeFiles/clion-log.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a8a48d90caeed91ebf364d1bd3d815ec0e149eba
--- /dev/null
+++ b/lsh-fast/cmake-build-debug/CMakeFiles/clion-log.txt
@@ -0,0 +1 @@
+CMakeLists.txt not found in /home/dev-laptop/Dylan/locality-sensitive-hashing-visual-analytics/lsh-fast Select CMakeLists.txt file...
diff --git a/lsh-fast/lsh b/lsh-fast/lsh
new file mode 100755
index 0000000000000000000000000000000000000000..480377051923b7d77a3d083c766b4a8135c1f754
Binary files /dev/null and b/lsh-fast/lsh differ
diff --git a/lsh-fast/lsh.cpp b/lsh-fast/lsh.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d4a515a5fcb8f3e68f6cc20ff3c7846e55ce9171
--- /dev/null
+++ b/lsh-fast/lsh.cpp
@@ -0,0 +1,531 @@
+#include <iostream>     // std::cout
+#include <algorithm>    // std::max
+#include <fstream>
+#include <string.h>
+#include <tuple>
+#include <stdlib.h>
+#include <vector>
+#include <math.h>
+#include <random>
+
+#define INF 1e20 // Pseudo-infinite number for this code
+#define dist(x,y) ((x-y)*(x-y))
+
+using namespace std;
+
+/// This function is from the UCRDTW code
+///
+/// Calculate Dynamic Time Wrapping distance
+/// A,B: data and query, respectively
+/// cb : cummulative bound used for early abandoning
+/// r  : size of Sakoe-Chiba warpping band
+double dtw(double* A, double* B, double *cb, int m, int r, double best_so_far) {
+    double *cost;
+    double *cost_prev;
+    double *cost_tmp;
+    int i, j, k;
+    double x, y, z, min_cost;
+
+    /// Instead of using matrix of size O(m^2) or O(mr), we will reuse two arrays of size O(r).
+    cost = (double*) calloc(2 * r + 1, sizeof(double));
+    cost_prev = (double*) calloc(2 * r + 1, sizeof(double));
+    for (k = 0; k < 2 * r + 1; k++) {
+        cost[k] = INF;
+        cost_prev[k] = INF;
+    }
+
+    for (i = 0; i < m; i++) {
+        k = max(0, r - i);
+        min_cost = INF;
+
+        for (j = max(0, i - r); j <= min(m - 1, i + r); j++, k++) {
+            /// Initialize all row and column
+            if ((i == 0) && (j == 0)) {
+                cost[k] = dist(A[0], B[0]);
+                min_cost = cost[k];
+                continue;
+            }
+
+            if ((j - 1 < 0) || (k - 1 < 0)) {
+                y = INF;
+            } else {
+                y = cost[k - 1];
+            }
+            if ((i - 1 < 0) || (k + 1 > 2 * r)) {
+                x = INF;
+            } else {
+                x = cost_prev[k + 1];
+            }
+            if ((i - 1 < 0) || (j - 1 < 0)) {
+                z = INF;
+            } else {
+                z = cost_prev[k];
+            }
+
+            /// Classic DTW calculation
+            cost[k] = min( min( x, y) , z) + dist(A[i], B[j]);
+
+            /// Find minimum cost in row for early abandoning (possibly to use column instead of row).
+            if (cost[k] < min_cost) {
+                min_cost = cost[k];
+            }
+        }
+
+        /// Move current array to previous array.
+        cost_tmp = cost;
+        cost = cost_prev;
+        cost_prev = cost_tmp;
+    }
+    k--;
+
+    /// the DTW distance is in the last cell in the matrix of size O(m^2) or at the middle of our array.
+    double final_dtw = cost_prev[k];
+    free(cost);
+    free(cost_prev);
+    return final_dtw;
+}
+
+/// Sorting function for the 2nd element of two vectors
+bool sortbysecasc(const pair<int,double> &a,
+                   const pair<int,double> &b)
+{
+    return a.second<b.second;
+}
+
+/// Sorting function for the 2nd element of two vectors
+bool sortbysecdesc(const pair<int,double> &a,
+                   const pair<int,double> &b)
+{
+    return a.second>b.second;
+}
+
+/// Get random value from uniform distribution
+double uniform(double a, double b)
+{
+    return (double) std::rand() / (RAND_MAX) * (b - a) + a;
+}
+
+/// Algorithm for calculating similarity of (multivariate) time series data
+int lsh(double *** D, int M, int T, int Dim, double ** Q, int L, int K, double r, double a, double sd, int * &candidates, double * &distances, double **** &hashFunctions, double ** weights, int &nrOfCandidates)
+{
+    clock_t begin_time = clock();
+    std::random_device rd{};
+    std::mt19937 gen{rd()};
+    std::normal_distribution<> distribution0{0,1};
+    srand(time(NULL));
+
+    int S = 2; //3
+    double delta = r * 0.75; //0.07 => 20.000
+    int threshold = T * 0.22; //0.8 => 18/67
+
+    /// Create hash functions
+    int value;
+    for (int l=0;l<L;l++)
+    {
+        for (int k=0;k<K;k++)
+        {
+            for (int d=0;d<Dim;d++)
+            {
+                value = distribution0(gen);
+                for (int t=0;t<T;t++)
+                    {
+                    if (weights == NULL)
+                    {
+                        hashFunctions[l][k][t][d] = value;
+                    } else {
+                        hashFunctions[l][k][t][d] = value;// * weights[t][d];
+                    }
+                }
+            }
+        }
+    }
+
+    /// Calculate ratios for DTW approximation
+    double ** ratios = (double **)malloc(L*sizeof(double*));
+    for (int l=0;l<L;l++)
+    {
+        ratios[l] = (double *)malloc(K*sizeof(double));
+        for (int k=0;k<K;k++)
+        {
+            ratios[l][k] = uniform(a-sd, a+sd);
+        }
+    }
+
+    /// Create hashSET
+    double temp0 = 0;
+    double **** hashSET = (double****)malloc(M*sizeof(double***)) ; // Initialize DataD
+    for (int m=0;m<M;m++)
+    {
+        hashSET[m] = (double***)malloc(L*sizeof(double**)) ; // Initialize Data
+        for (int l=0;l<L;l++)
+        {
+            hashSET[m][l] = (double**)malloc(K*sizeof(double*)) ; // Initialize Data
+            for (int k=0;k<K;k++)
+            {
+                hashSET[m][l][k] = (double*)malloc(T*sizeof(double)) ; // Initialize Data
+                for (int t=0;t<T;t++)
+                {
+                    temp0 = 0;
+                    for (int d=0;d<Dim;d++) {
+                        temp0 += hashFunctions[l][k][t][d] * D[m][t][d];
+                    }
+                    hashSET[m][l][k][t] = temp0;
+                }
+            }
+        }
+    }
+
+    /// Create upper and lower bound on query
+    double ** Q_U = (double**)malloc((T)*sizeof(double*));
+    double ** Q_L = (double**)malloc((T)*sizeof(double*));
+    for (int t=0;t<T;t++)
+    {
+        Q_U[t] = (double*)malloc((Dim)*sizeof(double));
+        Q_L[t] = (double*)malloc((Dim)*sizeof(double));
+    }
+    double currentMax = -10000;
+    double currentMin = 10000;
+    for (int t = 0; t < T; t++) {
+        for (int d = 0; d < Dim; d++) {
+            for (int s = -S; s <= S; s++) {
+                if (t + s < 0 || t + s > T - 1)
+                    continue;
+                currentMax = std::max(Q_U[t + s][d], currentMax);
+                currentMin = std::min(Q_L[t + s][d], currentMin);
+            }
+            Q_U[t][d] = currentMax;
+            Q_L[t][d] = currentMin;
+            currentMax = -10000;
+            currentMin = 10000;
+        }
+    }
+    double temp1 = 0;
+    double temp2 = 0;
+    double temp3 = 0;
+
+    /// Create query hashes
+    double *** hashQ = (double***)malloc(L*sizeof(double**)) ; // Initialize Data
+    double *** hashQ_U = (double***)malloc(L*sizeof(double**)) ; // Initialize Data
+    double *** hashQ_L = (double***)malloc(L*sizeof(double**)) ; // Initialize Data
+    for (int l=0;l<L;l++)
+    {
+        hashQ[l] = (double**)malloc(K*sizeof(double*)) ; // Initialize Data
+        hashQ_U[l] = (double**)malloc(K*sizeof(double*)) ; // Initialize Data
+        hashQ_L[l] = (double**)malloc(K*sizeof(double*)) ; // Initialize Data
+
+        for (int k=0;k<K;k++)
+        {
+            hashQ[l][k] = (double*)malloc(T*sizeof(double)) ; // Initialize Data
+            hashQ_U[l][k] = (double*)malloc(T*sizeof(double)) ; // Initialize Data
+            hashQ_L[l][k] = (double*)malloc(T*sizeof(double)) ; // Initialize Data
+            for (int t=0;t<T;t++)
+            {
+                temp1 = 0;
+                temp2 = 0;
+                temp3 = 0;
+                for (int d=0;d<Dim;d++)
+                {
+                    temp1 += hashFunctions[l][k][t][d] * Q[t][d];
+                    temp2 += hashFunctions[l][k][t][d] * Q_U[t][d];
+                    temp3 += hashFunctions[l][k][t][d] * Q_L[t][d];
+                }
+                hashQ[l][k][t] = temp1;
+                hashQ_U[l][k][t] = temp2;
+                hashQ_L[l][k][t] = temp3;
+            }
+        }
+    }
+
+    /// Generate candidates
+    vector<int> v;
+    v.reserve(M);
+    bool group_collision;
+    int hash_collision;
+    int projection_collision;
+    for (int m=0;m<M;m++)
+    {
+        group_collision = false;
+        for (int l=0;l<L;l++)
+        {
+            hash_collision = 0;
+            for (int k=0;k<K;k++)
+            {
+                projection_collision = 0;
+                for (int t=0;t<T;t++)
+                {
+                    if (std::abs(hashQ[l][k][t]-hashSET[m][l][k][t]) <= delta/2 ||
+                    std::abs(hashQ_U[l][k][t]-hashSET[m][l][k][t]) <= delta/2 ||
+                    std::abs(hashQ_L[l][k][t]-hashSET[m][l][k][t]) <= delta/2
+                            )
+                    {
+                        projection_collision++;
+                    }
+                }
+                if (projection_collision>=threshold)
+                {
+                    hash_collision++;
+                }
+                else
+                    break;
+            }
+            if (hash_collision==K)
+            {
+                group_collision = true;
+            }
+        }
+        if (group_collision)
+        {
+            v.emplace_back(m);
+        }
+    }
+    clock_t end_time = clock();
+    double time_spent = (double)(end_time - begin_time) / CLOCKS_PER_SEC;
+    std::cout << "Hashing done in: " << time_spent << " seconds" << std::endl;
+    std::cout << "Number of candidates pruned: " << 100 - (100 * v.size() / M) << "%" << std::endl;
+
+    /// Estimate ranking using L2/DTW ratios
+    vector<pair<int, double>> sim;
+    sim.reserve(v.size());
+    double lb;
+    double ub;
+    double lb_it;
+    double ub_it;
+    double q_lb = 0;
+//    double q_ub = sqrt(T * pow(r, 2.0)) * K * L;
+    double q_ub = 0;
+    for (int l=0;l<L;l++)
+    {
+        for (int k=0;k<K;k++)
+        {
+            q_ub += ratios[l][k] * sqrt(T * pow(r, 2.0));
+            q_ub += ratios[l][k] * sqrt(T * pow(r, 2.0));
+            q_ub += ratios[l][k] * sqrt(T * pow(r, 2.0));
+        }
+    }
+    for (int m=0;m<v.size();m++)
+    {
+        lb = 0;
+        ub = 0;
+        for (int l=0;l<L;l++)
+        {
+            for (int k=0;k<K;k++)
+            {
+                lb_it = 0;
+                ub_it = 0;
+                for (int t=0;t<T;t++)
+                {
+                    lb_it += pow(((2 * std::abs(hashQ[l][k][t]-hashSET[v[m]][l][k][t])) / delta), 2) * pow(r,2);
+                    ub_it += pow(((2 * std::abs(hashQ[l][k][t]-hashSET[v[m]][l][k][t])) / delta) + 1, 2) * pow(r,2);
+                }
+                lb += ratios[l][k] * sqrt(lb_it);
+                ub += ratios[l][k] * sqrt(ub_it);
+//                lb += sqrt(lb_it);
+//                ub += sqrt(ub_it);
+            }
+        }
+        sim.emplace_back(v[m], (q_ub - lb)/(ub - q_lb));
+    }
+    sort(sim.begin(), sim.end(), sortbysecdesc);
+    end_time = clock();
+    time_spent = (double)(end_time - begin_time) / CLOCKS_PER_SEC;
+    std::cout << "Approximated ranking done in: " << time_spent << " seconds" << std::endl;
+
+
+    /// Calculate DTW distance on compressed MST data
+    vector<pair<int, double>> dtwsim;
+    dtwsim.reserve(sim.size());
+
+    double distance = 0;
+    for (int m=0;m<sim.size();m++)
+    {
+        distance = 0;
+        for (int l=0;l<L;l++)
+        {
+            for (int k = 0; k < K; k++)
+            {
+                distance += dtw(hashSET[sim[m].first][l][k], hashQ[l][k], NULL, T, 0.05 * 120, INF);
+            }
+        }
+        dtwsim.emplace_back(sim[m].first, distance);
+    }
+    end_time = clock();
+    time_spent = (double)(end_time - begin_time) / CLOCKS_PER_SEC;
+    std::cout << "DTW sim done in: " << time_spent << std::endl;
+
+    /// Sort and return distances
+    sort(dtwsim.begin(), dtwsim.end(), sortbysecasc);
+    int count = 0;
+    for (int m=0;m<sim.size();m++)
+    {
+        for (int n=0;n<50;n++)
+        {
+            if (sim[m].first == dtwsim[n].first) {
+                count++;
+            }
+        }
+    }
+    candidates = (int*)malloc(dtwsim.size()*sizeof(int));
+    distances = (double*)malloc(dtwsim.size()*sizeof(double));
+    for (int i=0; i<dtwsim.size(); i++)
+    {
+        candidates[i] = dtwsim[i].first;
+        distances[i] = dtwsim[i].second;
+    }
+    nrOfCandidates = dtwsim.size();
+
+    /// Clean up
+    v.clear();
+    sim.clear();
+    dtwsim.clear();
+    for (int t=0;t<T;t++)
+    {
+        free(Q_U[t]);
+        free(Q_L[t]);
+    }
+    free(Q_U);
+    free(Q_L);
+    for (int m=0;m<M;m++)
+    {
+        for (int l=0;l<L;l++)
+        {
+            for (int k=0;k<K;k++)
+            {
+                free(hashSET[m][l][k]);
+            }
+            free(hashSET[m][l]);
+        }
+        free(hashSET[m]);
+    }
+    free(hashSET);
+    for (int l=0;l<L;l++)
+    {
+        for (int k=0;k<K;k++)
+        {
+            free(hashQ[l][k]);
+            free(hashQ_L[l][k]);
+            free(hashQ_U[l][k]);
+        }
+        free(ratios[l]);
+        free(hashQ[l]);
+        free(hashQ_U[l]);
+        free(hashQ_L[l]);
+    }
+    free(ratios);
+    free(hashQ);
+    free(hashQ_U);
+    free(hashQ_L);
+    return 0;
+}
+
+int main() {
+    /// Run this function for debugging
+    srand((unsigned int)time(NULL));
+    clock_t begin_time = clock();
+    FILE *data;
+    int M = 125621;
+    int K = 2; //ln((ln(0.5))/ln(1-exp(-2*(0.1)^2*120)))/ln((1-exp(-2*(0.1)^2*120))/(0.5))
+    int L = 6; //ln(0.05)/(ln(1-(1-exp(-2(0.1)^2*120))^4))0.0653330009864134
+    int T = 120;
+    int Dim = 1;
+
+    /// Read data
+    double *** D = (double***)malloc(M*sizeof(double**)) ; // Initialize Data
+    for (int i=0;i<M;i++)
+    {
+        D[i] = (double**)malloc((T)*sizeof(double*));
+        for (int t=0;t<T;t++)
+        {
+            D[i][t] = (double*)malloc((Dim)*sizeof(double));
+        }
+    }
+    data = fopen("processed-data","r");
+    if( data == NULL )
+        exit(2);
+    char d[M]; int i; int j = 0; float f;
+    while(fscanf(data,"%[^\n]\n",d) != EOF && j< M)
+    {
+        char *p =  strtok(d," ");
+        i = 0;
+        while (p != NULL && i<T)
+        {
+            f = atof(p);
+            for (int d=0;d<Dim;d++)
+            {
+                D[j][i][d] = f;
+            }
+            i++;
+            p = strtok (NULL, " ");
+        }
+        j++;
+    }
+
+    /// Print time
+    clock_t end_time = clock();
+    double time_spent = (double)(end_time - begin_time) / CLOCKS_PER_SEC;
+    std::cout << "Data read in: " << time_spent << std::endl;
+    begin_time = clock();
+
+    /// Create query
+    double ** Q = (double**)malloc((T)*sizeof(double*));
+    for (int t=0;t<T;t++)
+    {
+        Q[t] = (double*)malloc((Dim)*sizeof(double));
+        for (int d=0;d<Dim;d++)
+        {
+            Q[t][d] = D[80503][t][d];
+        }
+    }
+
+    int testt = 0;
+    while (testt < 5) {
+        /// Initialize output variables
+        int *candidates;
+        int nrOfCandidates;
+        double *distances;
+        double ****hashFunctions = (double ****) malloc(L * sizeof(double ***));
+        for (int l = 0; l < L; l++) {
+            hashFunctions[l] = (double ***) malloc(K * sizeof(double **));
+            for (int k = 0; k < K; k++) {
+                hashFunctions[l][k] = (double **) malloc(T * sizeof(double *));
+                for (int t = 0; t < T; t++) {
+                    hashFunctions[l][k][t] = (double *) malloc(Dim * sizeof(double));
+                }
+            }
+        }
+        double a = 3.1549093441462044;
+        double sd = 0.514583453532817;
+        double r = 0.21020966674727637;
+
+        lsh(D, M, T, Dim, Q, L, K, r, a, sd, candidates, distances, hashFunctions, NULL, nrOfCandidates);
+
+        cout << "Top-10 candidates: ";
+        for (int i=0; i<10; i++)
+        {
+            cout << candidates[i] << ",";
+        }
+        cout << endl;
+
+//        for (int l = 0; l < L; l++) {
+//            for (int k = 0; k < K; k++) {
+//                for (int t = 0; t < T; t++) {
+//                    free(hashFunctions[l][k][t]);
+//                }
+//                free(hashFunctions[l][k]);
+//            }
+//            free(hashFunctions[l]);
+//        }
+//        free(hashFunctions);
+//        cout << endl;
+//        free(candidates);
+//        free(distances);
+
+        testt++;
+    }
+
+
+    end_time = clock();
+    time_spent = (double)(end_time - begin_time) / CLOCKS_PER_SEC;
+    std::cout << "Query done in: " << time_spent << std::endl;
+
+    return 0;
+}
\ No newline at end of file
diff --git a/lsh-fast/lsh.h b/lsh-fast/lsh.h
new file mode 100644
index 0000000000000000000000000000000000000000..4150199c32fa65938b3cd1d88e4b9a71856a1e3c
--- /dev/null
+++ b/lsh-fast/lsh.h
@@ -0,0 +1,2 @@
+
+int lsh(double *** D, int M, int T, int Dim, double ** Q, int L, int K, double r, double a, double sd, int * &candidates, double * &distances, double **** &hashFunctions, double ** weights, int &nrOfCandidates);
diff --git a/lsh-fast/processed-data b/lsh-fast/processed-data
new file mode 100644
index 0000000000000000000000000000000000000000..2441eba86a5487c53e53b515d5f7de09703922e6
Binary files /dev/null and b/lsh-fast/processed-data differ
diff --git a/lsh-fast/query b/lsh-fast/query
new file mode 100644
index 0000000000000000000000000000000000000000..946acaba2be2067e7ce8a037504b2e468ac8c6c9
--- /dev/null
+++ b/lsh-fast/query
@@ -0,0 +1,120 @@
+0.065966
+0.071220
+0.017980
+0.000000
+0.001635
+0.004670
+0.000234
+0.001051
+0.022067
+0.068418
+0.068652
+0.053357
+0.062347
+0.088150
+0.077175
+0.060245
+0.048920
+0.046935
+0.073088
+0.107414
+0.140222
+0.092820
+0.138354
+0.250555
+0.223118
+0.352365
+0.706830
+1.000000
+0.752482
+0.250321
+0.091302
+0.107531
+0.107064
+0.069119
+0.066316
+0.048920
+0.065849
+0.034676
+0.047402
+0.052072
+0.058727
+0.037011
+0.014828
+0.036778
+0.041331
+0.043199
+0.040864
+0.040280
+0.002452
+0.000000
+0.009340
+0.032808
+0.050788
+0.233509
+0.367192
+0.142324
+0.039463
+0.043549
+0.076007
+0.046935
+0.036778
+0.049504
+0.042148
+0.021249
+0.023468
+0.029656
+0.040864
+0.042499
+0.027904
+0.010858
+0.026153
+0.024985
+0.011092
+0.024168
+0.009924
+0.000000
+0.000000
+0.001284
+0.002335
+0.004320
+0.033275
+0.062230
+0.070870
+0.157735
+0.202569
+0.200817
+0.134851
+0.425804
+0.857912
+0.747345
+0.467717
+0.175599
+0.256626
+0.303211
+0.232808
+0.196848
+0.653825
+1.000000
+0.979685
+0.296089
+0.605721
+0.438062
+0.199066
+0.178634
+0.263048
+0.904845
+0.787158
+0.143841
+0.115820
+0.129364
+0.147227
+0.108582
+0.108698
+0.113252
+0.156801
+0.103328
+0.098541
+0.094337
+0.090134
+0.046118
diff --git a/lsh-fast/ucr b/lsh-fast/ucr
new file mode 100755
index 0000000000000000000000000000000000000000..d274d1f4d550994b2ec3d9d1ab20cf95a64a01ca
Binary files /dev/null and b/lsh-fast/ucr differ
diff --git a/lsh-fast/ucrdtw.c b/lsh-fast/ucrdtw.c
new file mode 100644
index 0000000000000000000000000000000000000000..c13631e9e3b92eb812a06923c9dbbd16b9320fc0
--- /dev/null
+++ b/lsh-fast/ucrdtw.c
@@ -0,0 +1,1056 @@
+/***********************************************************************/
+/************************* DISCLAIMER **********************************/
+/***********************************************************************/
+/** This UCR Suite software is copyright protected (c) 2012 by          **/
+/** Thanawin Rakthanmanon, Bilson Campana, Abdullah Mueen,            **/
+/** Gustavo Batista and Eamonn Keogh.                                 **/
+/**                                                                   **/
+/** Unless stated otherwise, all software is provided free of charge. **/
+/** As well, all software is provided on an "as is" basis without     **/
+/** warranty of any kind, express or implied. Under no circumstances  **/
+/** and under no legal theory, whether in tort, contract,or otherwise,**/
+/** shall Thanawin Rakthanmanon, Bilson Campana, Abdullah Mueen,      **/
+/** Gustavo Batista, or Eamonn Keogh be liable to you or to any other **/
+/** person for any indirect, special, incidental, or consequential    **/
+/** damages of any character including, without limitation, damages   **/
+/** for loss of goodwill, work stoppage, computer failure or          **/
+/** malfunction, or for any and all other damages or losses.          **/
+/**                                                                   **/
+/** If you do not agree with these terms, then you you are advised to **/
+/** not use this software.                                            **/
+/***********************************************************************/
+/***********************************************************************/
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <time.h>
+
+#include "ucrdtw.h"
+
+#define min(x,y) ((x)<(y)?(x):(y))
+#define max(x,y) ((x)>(y)?(x):(y))
+#define dist(x,y) ((x-y)*(x-y))
+
+#define INF 1e20 // Pseudo-infinite number for this code
+
+/// Data structure for sorting the query
+typedef struct index {
+    double value;
+    int index;
+} index_t;
+
+/// Sorting function for the query, sort by abs(z_norm(q[i])) from high to low
+int index_comp(const void* a, const void* b) {
+    index_t* x = (index_t*) a;
+    index_t* y = (index_t*) b;
+    double v = fabs(y->value) - fabs(x->value);   // high to low
+	if (v < 0) return -1;
+	if (v > 0) return 1;
+	return 0;
+}
+
+/// Data structure (circular array) for finding minimum and maximum for LB_Keogh envolop
+typedef struct deque {
+    int* dq;
+    int size, capacity;
+    int f, r;
+} deque_t;
+
+/// Initial the queue at the beginning step of envelope calculation
+void deque_init(deque_t* d, int capacity) {
+    d->capacity = capacity;
+    d->size = 0;
+    d->dq = (int*) calloc(d->capacity, sizeof(int));
+    d->f = 0;
+    d->r = d->capacity - 1;
+}
+
+/// Destroy the queue
+void deque_free(deque_t* d) {
+    free(d->dq);
+}
+
+/// Insert to the queue at the back
+void deque_push_back(deque_t* d, int v) {
+    d->dq[d->r] = v;
+    d->r--;
+    if (d->r < 0)
+        d->r = d->capacity - 1;
+    d->size++;
+}
+
+/// Delete the current (front) element from queue
+void deque_pop_front(deque_t* d) {
+    d->f--;
+    if (d->f < 0)
+        d->f = d->capacity - 1;
+    d->size--;
+}
+
+/// Delete the last element from queue
+void deque_pop_back(deque_t* d) {
+    d->r = (d->r + 1) % d->capacity;
+    d->size--;
+}
+
+/// Get the value at the current position of the circular queue
+int deque_front(deque_t* d) {
+    int aux = d->f - 1;
+
+    if (aux < 0)
+        aux = d->capacity - 1;
+    return d->dq[aux];
+}
+
+/// Get the value at the last position of the circular queue
+int deque_back(deque_t* d) {
+    int aux = (d->r + 1) % d->capacity;
+    return d->dq[aux];
+}
+
+/// Check whether or not the queue is empty
+int deque_empty(deque_t* d) {
+    return d->size == 0;
+}
+
+/// Finding the envelope of min and max value for LB_Keogh
+/// Implementation idea is introduced by Danial Lemire in his paper
+/// "Faster Retrieval with a Two-Pass Dynamic-Time-Warping Lower Bound", Pattern Recognition 42(9), 2009.
+void lower_upper_lemire(double *t, int len, int r, double *l, double *u) {
+    struct deque du, dl;
+    int i;
+
+    deque_init(&du, 2 * r + 2);
+    deque_init(&dl, 2 * r + 2);
+
+    deque_push_back(&du, 0);
+    deque_push_back(&dl, 0);
+
+    for (i = 1; i < len; i++) {
+        if (i > r) {
+            u[i - r - 1] = t[deque_front(&du)];
+            l[i - r - 1] = t[deque_front(&dl)];
+        }
+        if (t[i] > t[i - 1]) {
+            deque_pop_back(&du);
+            while (!deque_empty(&du) && t[i] > t[deque_back(&du)]) {
+                deque_pop_back(&du);
+            }
+        } else {
+            deque_pop_back(&dl);
+            while (!deque_empty(&dl) && t[i] < t[deque_back(&dl)]) {
+                deque_pop_back(&dl);
+            }
+        }
+        deque_push_back(&du, i);
+        deque_push_back(&dl, i);
+        if (i == 2 * r + 1 + deque_front(&du)) {
+            deque_pop_front(&du);
+        } else if (i == 2 * r + 1 + deque_front(&dl)) {
+            deque_pop_front(&dl);
+        }
+    }
+    for (i = len; i < len + r + 1; i++) {
+        u[i - r - 1] = t[deque_front(&du)];
+        l[i - r - 1] = t[deque_front(&dl)];
+        if (i - deque_front(&du) >= 2 * r + 1) {
+            deque_pop_front(&du);
+        }
+        if (i - deque_front(&dl) >= 2 * r + 1) {
+            deque_pop_front(&dl);
+        }
+    }
+    deque_free(&du);
+    deque_free(&dl);
+}
+
+/// Calculate quick lower bound
+/// Usually, LB_Kim take time O(m) for finding top,bottom,fist and last.
+/// However, because of z-normalization the top and bottom cannot give significant benefits.
+/// And using the first and last points can be computed in constant time.
+/// The pruning power of LB_Kim is non-trivial, especially when the query is not long, say in length 128.
+double lb_kim_hierarchy(double *t, double *q, int j, int len, double mean, double std, double best_so_far) {
+    /// 1 point at front and back
+    double d, lb;
+    double x0 = (t[j] - mean) / std;
+    double y0 = (t[(len - 1 + j)] - mean) / std;
+    lb = dist(x0,q[0]) + dist(y0, q[len - 1]);
+    if (lb >= best_so_far)
+        return lb;
+
+    /// 2 points at front
+    double x1 = (t[(j + 1)] - mean) / std;
+    d = min(dist(x1,q[0]), dist(x0,q[1]));
+    d = min(d, dist(x1,q[1]));
+    lb += d;
+    if (lb >= best_so_far)
+        return lb;
+
+    /// 2 points at back
+    double y1 = (t[(len - 2 + j)] - mean) / std;
+    d = min(dist(y1,q[len-1]), dist(y0, q[len-2]));
+    d = min(d, dist(y1,q[len-2]));
+    lb += d;
+    if (lb >= best_so_far)
+        return lb;
+
+    /// 3 points at front
+    double x2 = (t[(j + 2)] - mean) / std;
+    d = min(dist(x0,q[2]), dist(x1, q[2]));
+    d = min(d, dist(x2,q[2]));
+    d = min(d, dist(x2,q[1]));
+    d = min(d, dist(x2,q[0]));
+    lb += d;
+    if (lb >= best_so_far)
+        return lb;
+
+    /// 3 points at back
+    double y2 = (t[(len - 3 + j)] - mean) / std;
+    d = min(dist(y0,q[len-3]), dist(y1, q[len-3]));
+    d = min(d, dist(y2,q[len-3]));
+    d = min(d, dist(y2,q[len-2]));
+    d = min(d, dist(y2,q[len-1]));
+    lb += d;
+
+    return lb;
+}
+
+/// LB_Keogh 1: Create Envelope for the query
+/// Note that because the query is known, envelope can be created once at the beginning.
+///
+/// Variable Explanation,
+/// order : sorted indices for the query.
+/// uo, lo: upper and lower envelops for the query, which already sorted.
+/// t     : a circular array keeping the current data.
+/// j     : index of the starting location in t
+/// cb    : (output) current bound at each position. It will be used later for early abandoning in DTW.
+double lb_keogh_cumulative(int* order, double *t, double *uo, double *lo, double *cb, int j, int len, double mean, double std, double best_so_far) {
+    double lb = 0;
+    double x, d;
+    int i;
+
+    for (i = 0; i < len && lb < best_so_far; i++) {
+        x = (t[(order[i] + j)] - mean) / std;
+        d = 0;
+        if (x > uo[i]) {
+            d = dist(x, uo[i]);
+        } else if (x < lo[i]) {
+            d = dist(x, lo[i]);
+        }
+        lb += d;
+        cb[order[i]] = d;
+    }
+    return lb;
+}
+
+/// LB_Keogh 2: Create Envelop for the data
+/// Note that the envelops have been created (in main function) when each data point has been read.
+///
+/// Variable Explanation,
+/// tz: Z-normalized data
+/// qo: sorted query
+/// cb: (output) current bound at each position. Used later for early abandoning in DTW.
+/// l,u: lower and upper envelope of the current data
+double lb_keogh_data_cumulative(int* order, double *tz, double *qo, double *cb, double *l, double *u, int len, double mean, double std, double best_so_far) {
+    double lb = 0;
+    double uu, ll, d;
+    int i;
+
+    for (i = 0; i < len && lb < best_so_far; i++) {
+        uu = (u[order[i]] - mean) / std;
+        ll = (l[order[i]] - mean) / std;
+        d = 0;
+        if (qo[i] > uu) {
+            d = dist(qo[i], uu);
+        } else {
+            if (qo[i] < ll) {
+                d = dist(qo[i], ll);
+            }
+        }
+        lb += d;
+        cb[order[i]] = d;
+    }
+    return lb;
+}
+
+/// Calculate Dynamic Time Wrapping distance
+/// A,B: data and query, respectively
+/// cb : cummulative bound used for early abandoning
+/// r  : size of Sakoe-Chiba warpping band
+double dtw(double* A, double* B, double *cb, int m, int r, double best_so_far) {
+    double *cost;
+    double *cost_prev;
+    double *cost_tmp;
+    int i, j, k;
+    double x, y, z, min_cost;
+
+    /// Instead of using matrix of size O(m^2) or O(mr), we will reuse two arrays of size O(r).
+    cost = (double*) calloc(2 * r + 1, sizeof(double));
+    cost_prev = (double*) calloc(2 * r + 1, sizeof(double));
+    for (k = 0; k < 2 * r + 1; k++) {
+        cost[k] = INF;
+        cost_prev[k] = INF;
+    }
+
+    for (i = 0; i < m; i++) {
+        k = max(0, r - i);
+        min_cost = INF;
+
+        for (j = max(0, i - r); j <= min(m - 1, i + r); j++, k++) {
+            /// Initialize all row and column
+            if ((i == 0) && (j == 0)) {
+                cost[k] = dist(A[0], B[0]);
+                min_cost = cost[k];
+                continue;
+            }
+
+            if ((j - 1 < 0) || (k - 1 < 0)) {
+                y = INF;
+            } else {
+                y = cost[k - 1];
+            }
+            if ((i - 1 < 0) || (k + 1 > 2 * r)) {
+                x = INF;
+            } else {
+                x = cost_prev[k + 1];
+            }
+            if ((i - 1 < 0) || (j - 1 < 0)) {
+                z = INF;
+            } else {
+                z = cost_prev[k];
+            }
+
+            /// Classic DTW calculation
+            cost[k] = min( min( x, y) , z) + dist(A[i], B[j]);
+
+            /// Find minimum cost in row for early abandoning (possibly to use column instead of row).
+            if (cost[k] < min_cost) {
+                min_cost = cost[k];
+            }
+        }
+
+//        /// We can abandon early if the current cummulative distance with lower bound together are larger than best_so_far
+//        if (i + r < m - 1 && min_cost + cb[i + r + 1] >= best_so_far) {
+//            free(cost);
+//            free(cost_prev);
+//            return min_cost + cb[i + r + 1];
+//        }
+
+        /// Move current array to previous array.
+        cost_tmp = cost;
+        cost = cost_prev;
+        cost_prev = cost_tmp;
+    }
+    k--;
+
+    /// the DTW distance is in the last cell in the matrix of size O(m^2) or at the middle of our array.
+    double final_dtw = cost_prev[k];
+    free(cost);
+    free(cost_prev);
+    return final_dtw;
+}
+
+
+/// Calculate the nearest neighbor of a times series in a larger time series expressed as location and distance,
+/// using the UCR suite optimizations.
+int ucrdtw(double* data, long long data_size, double* query, long query_size, double warp_width, int verbose, long long* location, double* distance) {
+    long m = query_size;
+    int r = warp_width <= 1 ? floor(warp_width * m) : floor(warp_width);
+
+    double best_so_far; /// best-so-far
+    double *q, *t; /// data array
+    int *order; ///new order of the query
+    double *u, *l, *qo, *uo, *lo, *tz, *cb, *cb1, *cb2, *u_d, *l_d;
+
+    double d = 0.0;
+    long long i, j;
+    double ex, ex2, mean, std;
+
+    long long loc = 0;
+    double t1, t2;
+    int kim = 0, keogh = 0, keogh2 = 0;
+    double dist = 0, lb_kim = 0, lb_k = 0, lb_k2 = 0;
+    double *buffer, *u_buff, *l_buff;
+    index_t *q_tmp;
+
+    /// For every EPOCH points, all cumulative values, such as ex (sum), ex2 (sum square), will be restarted for reducing the floating point error.
+    int EPOCH = 100000;
+
+    if (verbose) {
+        t1 = clock();
+    }
+
+    /// calloc everything here
+    q = (double*) calloc(m, sizeof(double));
+    if (q == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+    memcpy((void*)q, (void*)query, m * sizeof(double));
+
+    qo = (double*) calloc(m, sizeof(double));
+    if (qo == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    uo = (double*) calloc(m, sizeof(double));
+    if (uo == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    lo = (double*) calloc(m, sizeof(double));
+    if (lo == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    order = (int *) calloc(m, sizeof(int));
+    if (order == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    q_tmp = (index_t *) calloc(m, sizeof(index_t));
+    if (q_tmp == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    u = (double*) calloc(m, sizeof(double));
+    if (u == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    l = (double*) calloc(m, sizeof(double));
+    if (l == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    cb = (double*) calloc(m, sizeof(double));
+    if (cb == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+    }
+
+    cb1 = (double*) calloc(m, sizeof(double));
+    if (cb1 == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    cb2 = (double*) calloc(m, sizeof(double));
+    if (cb2 == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    u_d = (double*) calloc(m, sizeof(double));
+    if (u == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    l_d = (double*) calloc(m, sizeof(double));
+    if (l == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    t = (double*) calloc(m, sizeof(double) * 2);
+    if (t == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    tz = (double*) calloc(m, sizeof(double));
+    if (tz == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    buffer = (double*) calloc(EPOCH, sizeof(double));
+    if (buffer == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    u_buff = (double*) calloc(EPOCH, sizeof(double));
+    if (u_buff == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    l_buff = (double*) calloc(EPOCH, sizeof(double));
+    if (l_buff == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    /// Read query
+    best_so_far = INF;
+    ex = ex2 = 0;
+    for (i = 0; i < m; i++) {
+        d = q[i];
+        ex += d;
+        ex2 += d * d;
+    }
+    /// Do z-normalize the query, keep in same array, q
+    mean = ex / m;
+    std = ex2 / m;
+    std = sqrt(std - mean * mean);
+    for (i = 0; i < m; i++)
+        q[i] = (q[i] - mean) / std;
+
+    /// Create envelope of the query: lower envelope, l, and upper envelope, u
+    lower_upper_lemire(q, m, r, l, u);
+
+    /// Sort the query one time by abs(z-norm(q[i]))
+    for (i = 0; i < m; i++) {
+        q_tmp[i].value = q[i];
+        q_tmp[i].index = i;
+    }
+    qsort(q_tmp, m, sizeof(index_t), index_comp);
+
+    /// also create another arrays for keeping sorted envelope
+    for (i = 0; i < m; i++) {
+        int o = q_tmp[i].index;
+        order[i] = o;
+        qo[i] = q[o];
+        uo[i] = u[o];
+        lo[i] = l[o];
+    }
+
+    /// Initial the cummulative lower bound
+    for (i = 0; i < m; i++) {
+        cb[i] = 0;
+        cb1[i] = 0;
+        cb2[i] = 0;
+    }
+
+    i = 0;          /// current index of the data in current chunk of size EPOCH
+    j = 0;          /// the starting index of the data in the circular array, t
+    ex = ex2 = 0;
+    int done = 0;
+    int it = 0, ep = 0, k = 0;
+    long long I; /// the starting index of the data in current chunk of size EPOCH
+    long long data_index = 0;
+    while (!done) {
+        /// Read first m-1 points
+        ep = 0;
+        if (it == 0) {
+            for (k = 0; k < m - 1 && data_index < data_size; k++) {
+                buffer[k] = data[data_index++];
+            }
+        } else {
+            for (k = 0; k < m - 1; k++)
+                buffer[k] = buffer[EPOCH - m + 1 + k];
+        }
+
+        /// Read buffer of size EPOCH or when all data has been read.
+        ep = m - 1;
+        while (ep < EPOCH && data_index < data_size) {
+            buffer[ep] = data[data_index++];
+            ep++;
+        }
+
+        /// Data are read in chunk of size EPOCH.
+        /// When there is nothing to read, the loop is end.
+        if (ep <= m - 1) {
+            done = 1;
+        } else {
+            lower_upper_lemire(buffer, ep, r, l_buff, u_buff);
+            /// Do main task here..
+            ex = 0;
+            ex2 = 0;
+            for (i = 0; i < ep; i++) {
+                /// A bunch of data has been read and pick one of them at a time to use
+                d = buffer[i];
+
+                /// Calcualte sum and sum square
+                ex += d;
+                ex2 += d * d;
+
+                /// t is a circular array for keeping current data
+                t[i % m] = d;
+
+                /// Double the size for avoiding using modulo "%" operator
+                t[(i % m) + m] = d;
+
+                /// Start the task when there are more than m-1 points in the current chunk
+                if (i >= m - 1) {
+                    mean = ex / m;
+                    std = ex2 / m;
+                    std = sqrt(std - mean * mean);
+
+                    /// compute the start location of the data in the current circular array, t
+                    j = (i + 1) % m;
+                    /// the start location of the data in the current chunk
+                    I = i - (m - 1);
+
+                    /// Use a constant lower bound to prune the obvious subsequence
+                    lb_kim = lb_kim_hierarchy(t, q, j, m, mean, std, best_so_far);
+
+                    if (lb_kim < best_so_far) {
+                        /// Use a linear time lower bound to prune; z_normalization of t will be computed on the fly.
+                        /// uo, lo are envelope of the query.
+                        lb_k = lb_keogh_cumulative(order, t, uo, lo, cb1, j, m, mean, std, best_so_far);
+                        if (lb_k < best_so_far) {
+                            /// Take another linear time to compute z_normalization of t.
+                            /// Note that for better optimization, this can merge to the previous function.
+                            for (k = 0; k < m; k++) {
+                                tz[k] = (t[(k + j)] - mean) / std;
+                            }
+
+                            /// Use another lb_keogh to prune
+                            /// qo is the sorted query. tz is unsorted z_normalized data.
+                            /// l_buff, u_buff are big envelope for all data in this chunk
+                            lb_k2 = lb_keogh_data_cumulative(order, tz, qo, cb2, l_buff + I, u_buff + I, m, mean, std, best_so_far);
+                            if (lb_k2 < best_so_far) {
+                                /// Choose better lower bound between lb_keogh and lb_keogh2 to be used in early abandoning DTW
+                                /// Note that cb and cb2 will be cumulative summed here.
+                                if (lb_k > lb_k2) {
+                                    cb[m - 1] = cb1[m - 1];
+                                    for (k = m - 2; k >= 0; k--)
+                                        cb[k] = cb[k + 1] + cb1[k];
+                                } else {
+                                    cb[m - 1] = cb2[m - 1];
+                                    for (k = m - 2; k >= 0; k--)
+                                        cb[k] = cb[k + 1] + cb2[k];
+                                }
+
+                                /// Compute DTW and early abandoning if possible
+                                dist = dtw(tz, q, cb, m, r, best_so_far);
+
+                                if (dist < best_so_far) {   /// Update best_so_far
+                                                    /// loc is the real starting location of the nearest neighbor in the file
+                                    best_so_far = dist;
+                                    loc = (it) * (EPOCH - m + 1) + i - m + 1;
+                                }
+                            } else
+                                keogh2++;
+                        } else
+                            keogh++;
+                    } else
+                        kim++;
+
+                    /// Reduce absolute points from sum and sum square
+                    ex -= t[j];
+                    ex2 -= t[j] * t[j];
+                }
+            }
+
+            /// If the size of last chunk is less then EPOCH, then no more data and terminate.
+            if (ep < EPOCH)
+                done = 1;
+            else
+                it++;
+        }
+    }
+
+    i = (it) * (EPOCH - m + 1) + ep;
+    free(q);
+    free(qo);
+    free(uo);
+    free(lo);
+    free(order);
+    free(q_tmp);
+    free(u);
+    free(l);
+    free(cb);
+    free(cb1);
+    free(cb2);
+    free(u_d);
+    free(l_d);
+    free(t);
+    free(tz);
+    free(buffer);
+    free(u_buff);
+    free(l_buff);
+
+    if (verbose) {
+        t2 = clock();
+        printf("Location : %lld\n", loc);
+        printf("Distance : %.6f\n", sqrt(best_so_far));
+        printf("Data Scanned : %lld\n", i);
+        printf("Total Execution Time : %.4f secs\n", (t2 - t1) / CLOCKS_PER_SEC);
+        printf("\n");
+        printf("Pruned by LB_Kim    : %6.2f%%\n", ((double) kim / i) * 100);
+        printf("Pruned by LB_Keogh  : %6.2f%%\n", ((double) keogh / i) * 100);
+        printf("Pruned by LB_Keogh2 : %6.2f%%\n", ((double) keogh2 / i) * 100);
+        printf("DTW Calculation     : %6.2f%%\n", 100 - (((double) kim + keogh + keogh2) / i * 100));
+    }
+    *location = loc;
+    *distance = sqrt(best_so_far);
+    return 0;
+}
+
+/// Calculate the nearest neighbor of a times series in a larger time series expressed as location and distance,
+/// using the UCR suite optimizations. This function supports streaming the data and the query to search.
+int ucrdtwf(FILE* data_file, FILE* query_file, long query_length, double warp_width, int verbose, long long* location, double* distance) {
+    FILE* fp = data_file;
+    FILE* qp = query_file;
+    long m = query_length;
+    int r = warp_width <= 1 ? floor(warp_width * m) : floor(warp_width);
+
+    double best_so_far;          /// best-so-far
+    double *t, *q;       /// data array and query array
+    int *order;          ///new order of the query
+    double *u, *l, *qo, *uo, *lo, *tz, *cb, *cb1, *cb2, *u_d, *l_d;
+
+    double d;
+    long long i, j;
+    double ex, ex2, mean, std;
+
+    long long loc = 0;
+    double t1, t2;
+    int kim = 0, keogh = 0, keogh2 = 0;
+    double dist = 0, lb_kim = 0, lb_k = 0, lb_k2 = 0;
+    double *buffer, *u_buff, *l_buff;
+    index_t *q_tmp;
+
+    /// For every EPOCH points, all cumulative values, such as ex (sum), ex2 (sum square), will be restarted for reducing the floating point error.
+    int EPOCH = 100000;
+
+    if (verbose) {
+        t1 = clock();
+    }
+
+    /// calloc everything here
+    q = (double*) calloc(m, sizeof(double));
+    if (q == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    qo = (double*) calloc(m, sizeof(double));
+    if (qo == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    uo = (double*) calloc(m, sizeof(double));
+    if (uo == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    lo = (double*) calloc(m, sizeof(double));
+    if (lo == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    order = (int*) calloc(m, sizeof(int));
+    if (order == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    q_tmp = (index_t*) calloc(m, sizeof(index_t));
+    if (q_tmp == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    u = (double*) calloc(m, sizeof(double));
+    if (u == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    l = (double*) calloc(m, sizeof(double));
+    if (l == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    cb = (double*) calloc(m, sizeof(double));
+    if (cb == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+    }
+
+    cb1 = (double*) calloc(m, sizeof(double));
+    if (cb1 == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    cb2 = (double*) calloc(m, sizeof(double));
+    if (cb2 == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    u_d = (double*) calloc(m, sizeof(double));
+    if (u == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    l_d = (double*) calloc(m, sizeof(double));
+    if (l == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    t = (double*) calloc(m, sizeof(double) * 2);
+    if (t == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    tz = (double*) calloc(m, sizeof(double));
+    if (tz == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    buffer = (double*) calloc(EPOCH, sizeof(double));
+    if (buffer == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    u_buff = (double*) calloc(EPOCH, sizeof(double));
+    if (u_buff == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    l_buff = (double*) calloc(EPOCH, sizeof(double));
+    if (l_buff == NULL) {
+        printf("ERROR: Memory can't be allocated!\n");
+        return -1;
+    }
+
+    /// Read query file
+    best_so_far = INF;
+    i = 0;
+    j = 0;
+    ex = ex2 = 0;
+
+    while (fscanf(qp, "%lf", &d) != EOF && i < m) {
+        ex += d;
+        ex2 += d * d;
+        q[i] = d;
+        i++;
+    }
+
+    /// Do z-normalize the query, keep in same array, q
+    mean = ex / m;
+    std = ex2 / m;
+    std = sqrt(std - mean * mean);
+    for (i = 0; i < m; i++)
+        q[i] = (q[i] - mean) / std;
+
+    /// Create envelope of the query: lower envelope, l, and upper envelope, u
+    lower_upper_lemire(q, m, r, l, u);
+
+    /// Sort the query one time by abs(z-norm(q[i]))
+    for (i = 0; i < m; i++) {
+        q_tmp[i].value = q[i];
+        q_tmp[i].index = i;
+    }
+    qsort(q_tmp, m, sizeof(index_t), index_comp);
+
+    /// also create another arrays for keeping sorted envelope
+    for (i = 0; i < m; i++) {
+        int o = q_tmp[i].index;
+        order[i] = o;
+        qo[i] = q[o];
+        uo[i] = u[o];
+        lo[i] = l[o];
+    }
+
+    /// Initial the cummulative lower bound
+    for (i = 0; i < m; i++) {
+        cb[i] = 0;
+        cb1[i] = 0;
+        cb2[i] = 0;
+    }
+
+    i = 0;          /// current index of the data in current chunk of size EPOCH
+    j = 0;          /// the starting index of the data in the circular array, t
+    ex = ex2 = 0;
+    int done = 0;
+    int it = 0, ep = 0, k = 0;
+    long long I; /// the starting index of the data in current chunk of size EPOCH
+
+    while (!done) {
+        /// Read first m-1 points
+        ep = 0;
+        if (it == 0) {
+            for (k = 0; k < m - 1; k++)
+                if (fscanf(fp, "%lf", &d) != EOF)
+                    buffer[k] = d;
+        } else {
+            for (k = 0; k < m - 1; k++)
+                buffer[k] = buffer[EPOCH - m + 1 + k];
+        }
+
+        /// Read buffer of size EPOCH or when all data has been read.
+        ep = m - 1;
+        while (ep < EPOCH) {
+            if (fscanf(fp, "%lf", &d) == EOF)
+                break;
+            buffer[ep] = d;
+            ep++;
+        }
+
+        /// Data are read in chunk of size EPOCH.
+        /// When there is nothing to read, the loop is end.
+        if (ep <= m - 1) {
+            done = 1;
+        } else {
+            lower_upper_lemire(buffer, ep, r, l_buff, u_buff);
+            /// Do main task here..
+            ex = 0;
+            ex2 = 0;
+            for (i = 0; i < ep; i++) {
+                /// A bunch of data has been read and pick one of them at a time to use
+                d = buffer[i];
+
+                /// Calcualte sum and sum square
+                ex += d;
+                ex2 += d * d;
+
+                /// t is a circular array for keeping current data
+                t[i % m] = d;
+
+                /// Double the size for avoiding using modulo "%" operator
+                t[(i % m) + m] = d;
+
+                /// Start the task when there are more than m-1 points in the current chunk
+                if (i >= m - 1) {
+                    mean = ex / m;
+                    std = ex2 / m;
+                    std = sqrt(std - mean * mean);
+
+                    /// compute the start location of the data in the current circular array, t
+                    j = (i + 1) % m;
+                    /// the start location of the data in the current chunk
+                    I = i - (m - 1);
+
+                    /// Use a constant lower bound to prune the obvious subsequence
+                    lb_kim = lb_kim_hierarchy(t, q, j, m, mean, std, best_so_far);
+
+                    if (lb_kim < best_so_far) {
+                        /// Use a linear time lower bound to prune; z_normalization of t will be computed on the fly.
+                        /// uo, lo are envelope of the query.
+                        lb_k = lb_keogh_cumulative(order, t, uo, lo, cb1, j, m, mean, std, best_so_far);
+                        if (lb_k < best_so_far) {
+                            /// Take another linear time to compute z_normalization of t.
+                            /// Note that for better optimization, this can merge to the previous function.
+                            for (k = 0; k < m; k++) {
+                                tz[k] = (t[(k + j)] - mean) / std;
+                            }
+
+                            /// Use another lb_keogh to prune
+                            /// qo is the sorted query. tz is unsorted z_normalized data.
+                            /// l_buff, u_buff are big envelope for all data in this chunk
+                            lb_k2 = lb_keogh_data_cumulative(order, tz, qo, cb2, l_buff + I, u_buff + I, m, mean, std, best_so_far);
+                            if (lb_k2 < best_so_far) {
+                                /// Choose better lower bound between lb_keogh and lb_keogh2 to be used in early abandoning DTW
+                                /// Note that cb and cb2 will be cumulative summed here.
+                                if (lb_k > lb_k2) {
+                                    cb[m - 1] = cb1[m - 1];
+                                    for (k = m - 2; k >= 0; k--)
+                                        cb[k] = cb[k + 1] + cb1[k];
+                                } else {
+                                    cb[m - 1] = cb2[m - 1];
+                                    for (k = m - 2; k >= 0; k--)
+                                        cb[k] = cb[k + 1] + cb2[k];
+                                }
+
+                                /// Compute DTW and early abandoning if possible
+                                dist = dtw(tz, q, cb, m, r, best_so_far);
+
+                                if (dist < best_so_far) {   /// Update best_so_far
+                                                    /// loc is the real starting location of the nearest neighbor in the file
+                                    best_so_far = dist;
+                                    loc = (it) * (EPOCH - m + 1) + i - m + 1;
+                                }
+                            } else
+                                keogh2++;
+                        } else
+                            keogh++;
+                    } else
+                        kim++;
+
+                    /// Reduce absolute points from sum and sum square
+                    ex -= t[j];
+                    ex2 -= t[j] * t[j];
+                }
+            }
+
+            /// If the size of last chunk is less then EPOCH, then no more data and terminate.
+            if (ep < EPOCH)
+                done = 1;
+            else
+                it++;
+        }
+    }
+
+    i = (it) * (EPOCH - m + 1) + ep;
+
+    free(q);
+    free(qo);
+    free(uo);
+    free(lo);
+    free(order);
+    free(q_tmp);
+    free(u);
+    free(l);
+    free(cb);
+    free(cb1);
+    free(cb2);
+    free(u_d);
+    free(l_d);
+    free(t);
+    free(buffer);
+    free(u_buff);
+    free(l_buff);
+
+    if (verbose) {
+        t2 = clock();
+        printf("Data Scanned : %lld\n", i);
+        printf("Total Execution Time : %.4f secs\n", (t2 - t1) / CLOCKS_PER_SEC);
+        printf("\n");
+        printf("Pruned by LB_Kim    : %6.2f%%\n", ((double) kim / i) * 100);
+        printf("Pruned by LB_Keogh  : %6.2f%%\n", ((double) keogh / i) * 100);
+        printf("Pruned by LB_Keogh2 : %6.2f%%\n", ((double) keogh2 / i) * 100);
+        printf("DTW Calculation     : %6.2f%%\n", 100 - (((double) kim + keogh + keogh2) / i * 100));
+    }
+    *location = loc;
+    *distance = sqrt(best_so_far);
+    return 0;
+}
+
+//int main(int argc, char** argv) {
+//    if (argc < 5) {
+//        printf("Error: Invalid arguments\n");
+//        printf("Command usage: dtwfind <data-file>  <query-file> <query-length> <warp-width>\n\n");
+//        printf("For example: dtwfind data.txt query.txt 128 0.05\n");
+//        return -2;
+//    }
+//
+//    FILE* data_file = fopen(argv[1], "r");
+//    if (data_file == NULL) {
+//        printf("ERROR: File not found: %s\n", argv[1]);
+//    }
+//
+//    FILE* query_file = fopen(argv[2], "r");
+//    if (query_file == NULL) {
+//        printf("ERROR: File not found: %s\n", argv[2]);
+//    }
+//
+//    long long location = -1;
+//    double distance = -1;
+//    int ret = ucrdtwf(data_file, query_file, atoll(argv[3]), atof(argv[4]), 1, &location, &distance);
+//    printf("Location: %lld\nDistance: %.6f\n", location, distance);
+//    return ret;
+//}
diff --git a/lsh-fast/ucrdtw.h b/lsh-fast/ucrdtw.h
new file mode 100644
index 0000000000000000000000000000000000000000..58acfe82cd51180f37d9f70b4b8fa904907c35af
--- /dev/null
+++ b/lsh-fast/ucrdtw.h
@@ -0,0 +1,6 @@
+
+int ucrdtw(double* data, long long data_size, double* query, long query_size, double warp_width, int verbose, long long* location, double* distance);
+
+int ucrdtwf(FILE* data_stream, FILE* query_stream, long query_length, double warp_width, int verbose, long long* location, double* distance);
+
+double dtw(double* A, double* B, double *cb, int m, int r, double best_so_far);