diff --git a/lsh-fast b/lsh-fast deleted file mode 160000 index 994e1549030ad71d7164ed559b5cd1ab8733ec1a..0000000000000000000000000000000000000000 --- a/lsh-fast +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 994e1549030ad71d7164ed559b5cd1ab8733ec1a diff --git a/lsh-fast/.idea/.gitignore b/lsh-fast/.idea/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..73f69e0958611ac6e00bde95641f6699030ad235 --- /dev/null +++ b/lsh-fast/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml +# Editor-based HTTP Client requests +/httpRequests/ diff --git a/lsh-fast/.idea/lsh-fast.iml b/lsh-fast/.idea/lsh-fast.iml new file mode 100644 index 0000000000000000000000000000000000000000..f08604bb65b25149b195f9e9f282f9683a428592 --- /dev/null +++ b/lsh-fast/.idea/lsh-fast.iml @@ -0,0 +1,2 @@ +<?xml version="1.0" encoding="UTF-8"?> +<module classpath="CMake" type="CPP_MODULE" version="4" /> \ No newline at end of file diff --git a/lsh-fast/.idea/misc.xml b/lsh-fast/.idea/misc.xml new file mode 100644 index 0000000000000000000000000000000000000000..79b3c94830bab93d40d0770f2765540fe24ed423 --- /dev/null +++ b/lsh-fast/.idea/misc.xml @@ -0,0 +1,4 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project version="4"> + <component name="CMakeWorkspace" PROJECT_DIR="$PROJECT_DIR$" /> +</project> \ No newline at end of file diff --git a/lsh-fast/.idea/modules.xml b/lsh-fast/.idea/modules.xml new file mode 100644 index 0000000000000000000000000000000000000000..37d907d1f20c3d4d23effb39574d4d72503a5d93 --- /dev/null +++ b/lsh-fast/.idea/modules.xml @@ -0,0 +1,8 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project version="4"> + <component name="ProjectModuleManager"> + <modules> + <module fileurl="file://$PROJECT_DIR$/.idea/lsh-fast.iml" filepath="$PROJECT_DIR$/.idea/lsh-fast.iml" /> + </modules> + </component> +</project> \ No newline at end of file diff --git a/lsh-fast/.idea/vcs.xml b/lsh-fast/.idea/vcs.xml new file mode 100644 index 0000000000000000000000000000000000000000..288b36b1efb71c411d5c27a1ea6c08e41a7fed46 --- /dev/null +++ b/lsh-fast/.idea/vcs.xml @@ -0,0 +1,7 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project version="4"> + <component name="VcsDirectoryMappings"> + <mapping directory="$PROJECT_DIR$/.." 
vcs="Git" /> + <mapping directory="$PROJECT_DIR$" vcs="Git" /> + </component> +</project> \ No newline at end of file diff --git a/lsh-fast/Makefile b/lsh-fast/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..443291b447758d6865f8d8d8645e581d5886e32d --- /dev/null +++ b/lsh-fast/Makefile @@ -0,0 +1,17 @@ +OBJ = clean lsh + +CC = g++ + +all: clean matmult execute + +matmult: + $(CC) lsh.cpp -o lsh -O3 + +clean: + /bin/rm -rf $(OBJ) core* + +veryclean: clean + /bin/rn -f *~ + +execute: + ./lsh \ No newline at end of file diff --git a/lsh-fast/_lsh.cpp b/lsh-fast/_lsh.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b830c25edbbcf80adc18e5822181a30e1b854b8d --- /dev/null +++ b/lsh-fast/_lsh.cpp @@ -0,0 +1,175 @@ +#include <Python.h> +#include <numpy/arrayobject.h> +#include "lsh.h" +#include <math.h> + +/* Docstrings */ +static char module_docstring[] = "This module implements fast nearest-neighbor retrieval using LSH."; +static char lsh_docstring[] = "Calculate the closest neightbours with distances given a query window."; + +/* Available functions */ +static PyObject* lsh_lsh(PyObject *self, PyObject *args); + +/* Module specification */ +static PyMethodDef module_methods[] = { { "lsh", lsh_lsh, METH_VARARGS, lsh_docstring }, { NULL, NULL, 0, NULL } }; + +/* Initialize the module */ +#if PY_MAJOR_VERSION >= 3 +#define MOD_ERROR_VAL NULL + #define MOD_SUCCESS_VAL(val) val + #define MOD_INIT(name) PyMODINIT_FUNC PyInit_##name(void) + #define MOD_DEF(ob, name, doc, methods) \ + static struct PyModuleDef moduledef = { \ + PyModuleDef_HEAD_INIT, name, doc, -1, methods, }; \ + ob = PyModule_Create(&moduledef); +#else +#define MOD_ERROR_VAL +#define MOD_SUCCESS_VAL(val) +#define MOD_INIT(name) void init##name(void) +#define MOD_DEF(ob, name, doc, methods) \ + ob = Py_InitModule3(name, methods, doc); +#endif + +MOD_INIT(_lsh) +{ + PyObject *m; + + MOD_DEF(m, "_lsh", module_docstring, + module_methods) + + if (m == NULL) + return MOD_ERROR_VAL; + + import_array(); + + return MOD_SUCCESS_VAL(m); +} + +static PyObject* lsh_lsh(PyObject* self, PyObject* args) { + PyObject* data_obj = NULL; + PyObject* query_obj = NULL; + PyObject* hash_obj = NULL; + double **** hashFunctions; + double ** weights = NULL; + double*** data; + double ** query; + int * candidates; + double * distances; + int nrOfCandidates; + double r; + double a; + double sd; + PyArray_Descr *descr; + descr = PyArray_DescrFromType(NPY_DOUBLE); + PyArray_Descr *descr_float; + descr_float = PyArray_DescrFromType(NPY_FLOAT64); + + /// Parse the input tuple + if (!PyArg_ParseTuple(args, "OOddd|O", &data_obj, &query_obj, &r, &a, &sd, &hash_obj)) { + return NULL; + } + + /// Get the dimensions of the data and query + int data_size = (long) PyArray_DIM(data_obj, 0); + int channel_size = (long) PyArray_DIM(data_obj, 2); + int query_size = (int) PyArray_DIM(query_obj, 0); + + /// Convert data, query and weights to C array + npy_intp dims0[3]; + if (PyArray_AsCArray(&query_obj, (void **)&query, dims0, 2, descr) < 0 || PyArray_AsCArray(&data_obj, (void ***)&data, dims0, 3, descr) < 0) { + PyErr_SetString(PyExc_TypeError, "error converting to c array"); + return NULL; + } + if (hash_obj != NULL) + { + printf("Using weights"); + if (PyArray_AsCArray(&hash_obj, (void **)&weights, dims0, 2, descr) < 0) { + PyErr_SetString(PyExc_TypeError, "error converting weights to c array"); + return NULL; + } + } + + int K = 
ceil(log((log(0.5))/log(1-exp(-2*(0.1)*(0.1)*query_size)))/log((1-exp(-2*(0.1)*(0.1)*query_size))/(0.5))); + int L = ceil(log(0.05)/(log(1-pow(1-exp(-2*(0.1)*(0.1)*query_size), K)))); + printf("K: %d\n", K); + printf("L: %d\n", L); + printf("Dim: %d\n", channel_size); + + /// Initialize output parameters + hashFunctions = (double ****)malloc(L*sizeof(double***)); + for (int l=0;l<L;l++) + { + hashFunctions[l] = (double ***)malloc(K*sizeof(double**)); + for (int k=0;k<K;k++) + { + hashFunctions[l][k] = (double **)malloc(query_size*sizeof(double*)); + for (int t=0;t<query_size;t++) + { + hashFunctions[l][k][t] = (double *)malloc(channel_size*sizeof(double)); + } + } + } + + int status = lsh(data, data_size, query_size, channel_size, query, L, K, r, a, sd, candidates, distances, hashFunctions, weights, nrOfCandidates); + if (status) { + PyErr_SetString(PyExc_RuntimeError, "lsh could not allocate memory"); + return NULL; + } + + npy_intp dimscandidates[1] = {nrOfCandidates}; + printf("Number of candidates: %d\n", nrOfCandidates); + npy_intp dims4[4] = {L, K, query_size, channel_size}; + +// PyArrayObject* numpy_candidates = (PyArrayObject*)PyArray_SimpleNewFromData(1, dimscandidates, NPY_INT, (void*)&candidates); +// PyArrayObject* numpy_distances = (PyArrayObject*)PyArray_SimpleNewFromData(1, dimsdistance, NPY_DOUBLE, (void*)&distances); + // https://github.com/suiyun0234/scipy-master/commit/da7dfc7aad8daa7a516e43f4c7001eea7c1a707e + // https://github.com/fjean/pymeanshift/commit/1ba90da647342184ea7df378d7f21eba257a51d9 + PyArrayObject* numpy_candidates = (PyArrayObject*)PyArray_SimpleNew(1, dimscandidates, NPY_INT); + memcpy(PyArray_DATA(numpy_candidates), candidates, dimscandidates[0]*sizeof(int)); + PyArrayObject* numpy_distances = (PyArrayObject*)PyArray_SimpleNew(1, dimscandidates, NPY_DOUBLE); + memcpy(PyArray_DATA(numpy_distances), distances, dimscandidates[0]*sizeof(double)); + PyArrayObject* TEST = (PyArrayObject*)PyArray_SimpleNew(1, dimscandidates, NPY_DOUBLE); + memcpy(PyArray_DATA(TEST), distances, dimscandidates[0]*sizeof(double)); +// PyArrayObject* numpy_hash_functions = (PyArrayObject*)PyArray_SimpleNew(4, dims4, NPY_DOUBLE); +// double* numpy_hash_functions_data = (double*)PyArray_DATA(numpy_hash_functions); +// for (int l=0;l<L;l++) +// { +// for (int k=0;k<K;k++) +// { +// for (int t=0;t<query_size;t++) +// { +// memcpy(numpy_hash_functions_data, hashFunctions[l][k][t], channel_size*sizeof(double)); +// numpy_hash_functions_data += channel_size; +// } +// } +// } + PyObject* ret = Py_BuildValue("NNN", PyArray_Return(numpy_candidates), PyArray_Return(numpy_distances), PyArray_Return(TEST));// PyArray_Return(numpy_hash_functions)); +// Py_XDECREF(data_obj); +// Py_XDECREF(query_obj); +// Py_XDECREF(hash_obj); +// Py_XDECREF(weights); +// Py_XDECREF(data); +// Py_XDECREF(query); +// Py_XDECREF(descr); +// Py_XDECREF(descr_float); +// Py_XDECREF(query); +// Py_XDECREF(numpy_candidates); +// Py_XDECREF(numpy_distances); +// Py_XDECREF(TEST); +// free(candidates); +// free(distances); +// for (int l=0;l<L;l++) +// { +// for (int k=0;k<K;k++) +// { +// for (int t=0;t<query_size;t++) +// { +// free(hashFunctions[l][k][t]); +// } +// free(hashFunctions[l][k]); +// } +// free(hashFunctions[l]); +// } +// free(hashFunctions); + return ret; +} diff --git a/lsh-fast/_ucrdtw.c b/lsh-fast/_ucrdtw.c new file mode 100644 index 0000000000000000000000000000000000000000..c8023fd40b8bd8f17a7222e223ef2434794e0a30 --- /dev/null +++ b/lsh-fast/_ucrdtw.c @@ -0,0 +1,130 @@ +#include <Python.h> 
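+/* NumPy C-API header; import_array() is called in the module init below, which must happen
+   before any PyArray_* call (PyArray_AsCArray, PyArray_SimpleNew, ...) is made. */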
+#include <numpy/arrayobject.h> +#include "ucrdtw.h" + +/* Docstrings */ +static char module_docstring[] = "This module implements fast nearest-neighbor retrieval under the dynamic time warping (DTW)."; +static char ucrdtw_docstring[] = "Calculate the nearest neighbor of a times series in a larger time series expressed as location and distance, " + "using the UCR suite optimizations."; + +/* Available functions */ +static PyObject* ucrdtw_ucrdtw(PyObject *self, PyObject *args); + +/* Module specification */ +static PyMethodDef module_methods[] = { { "ucrdtw", ucrdtw_ucrdtw, METH_VARARGS, ucrdtw_docstring }, { NULL, NULL, 0, NULL } }; + +/* Initialize the module */ +#if PY_MAJOR_VERSION >= 3 + #define MOD_ERROR_VAL NULL + #define MOD_SUCCESS_VAL(val) val + #define MOD_INIT(name) PyMODINIT_FUNC PyInit_##name(void) + #define MOD_DEF(ob, name, doc, methods) \ + static struct PyModuleDef moduledef = { \ + PyModuleDef_HEAD_INIT, name, doc, -1, methods, }; \ + ob = PyModule_Create(&moduledef); +#else + #define MOD_ERROR_VAL + #define MOD_SUCCESS_VAL(val) + #define MOD_INIT(name) void init##name(void) + #define MOD_DEF(ob, name, doc, methods) \ + ob = Py_InitModule3(name, methods, doc); +#endif + +MOD_INIT(_ucrdtw) +{ + PyObject *m; + + MOD_DEF(m, "_ucrdtw", module_docstring, + module_methods) + + if (m == NULL) + return MOD_ERROR_VAL; + + import_array(); + + return MOD_SUCCESS_VAL(m); +} +/* +Calculate the nearest neighbor of a times series in a larger time series expressed as location and distance, +using the UCR suite optimizations. + +Arguments: +data - a list of floats or an ndarray for the time series to process +query - a list of floats or an ndarray for the time series to search for +warp_width - Allowed warp width as a fraction of query size +verbose - Optional boolean to print stats +*/ +static PyObject* ucrdtw_ucrdtw(PyObject* self, PyObject* args) { + PyObject* data_obj = NULL; + PyObject* query_obj = NULL; + double warp_width = -1; + PyObject* verbose_obj = NULL; + + /* Parse the input tuple */ + if (!PyArg_ParseTuple(args, "OOd|O", &data_obj, &query_obj, &warp_width, &verbose_obj)) { + return NULL; + } + + /* Interpret the input objects as numpy arrays. */ + if (!PyList_Check(data_obj) && !PyArray_Check(data_obj)) { + PyErr_SetString(PyExc_TypeError, "Data argument must be a list or ndarray"); + return NULL; + } +// PyObject* data_array = PyArray_FROM_OTF(data_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY); +// if (data_array == NULL) { +// Py_XDECREF(data_array); +// PyErr_SetString(PyExc_TypeError, "Data argument must be a list or ndarray"); +// return NULL; +// } + double** data; + PyArray_Descr *descr; + descr = PyArray_DescrFromType(NPY_DOUBLE); + npy_intp dims0[3]; + if (PyArray_AsCArray(&data_obj, (void **)&data, dims0, 2, descr) < 0) { + PyErr_SetString(PyExc_TypeError, "error converting to c array"); + return NULL; + } + + if (!PyList_Check(query_obj) && !PyArray_Check(query_obj)) { + Py_XDECREF(data); + PyErr_SetString(PyExc_TypeError, "Query argument must be a list or ndarray"); + return NULL; + } + PyObject* query_array = PyArray_FROM_OTF(query_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY); + if (query_array == NULL) { + Py_XDECREF(data); + PyErr_SetString(PyExc_TypeError, "Query argument must be a list or ndarray"); + return NULL; + } + + + /* Get pointers to the data as C-types. */ + int data_size = (long) PyArray_DIM(data_obj, 0); + + double* query = (double*) PyArray_DATA(query_array); + int query_size = (int) PyArray_DIM(query_array, 0); + + int verbose = verbose_obj != NULL ? 
PyObject_IsTrue(verbose_obj) : 0; + + /* Call the external C function to compute the best DTW location and distance. */ + long long location = -1; + double distance = -1; + int status = 0; + for (int i=0; i<data_size; i++) + { + status = ucrdtw(data[i], query_size, query, query_size, warp_width, verbose, &location, &distance); + } + + /* Clean up. */ + Py_XDECREF(data); + Py_XDECREF(query_array); + + if (status) { + PyErr_SetString(PyExc_RuntimeError, "ucrdtw could not allocate memory"); + return NULL; + } + + /* Build the output tuple */ + PyObject* ret = Py_BuildValue("Ld", location, distance); + return ret; +} diff --git a/lsh-fast/cmake-build-debug/CMakeFiles/clion-log.txt b/lsh-fast/cmake-build-debug/CMakeFiles/clion-log.txt new file mode 100644 index 0000000000000000000000000000000000000000..a8a48d90caeed91ebf364d1bd3d815ec0e149eba --- /dev/null +++ b/lsh-fast/cmake-build-debug/CMakeFiles/clion-log.txt @@ -0,0 +1 @@ +CMakeLists.txt not found in /home/dev-laptop/Dylan/locality-sensitive-hashing-visual-analytics/lsh-fast Select CMakeLists.txt file... diff --git a/lsh-fast/lsh b/lsh-fast/lsh new file mode 100755 index 0000000000000000000000000000000000000000..480377051923b7d77a3d083c766b4a8135c1f754 Binary files /dev/null and b/lsh-fast/lsh differ diff --git a/lsh-fast/lsh.cpp b/lsh-fast/lsh.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d4a515a5fcb8f3e68f6cc20ff3c7846e55ce9171 --- /dev/null +++ b/lsh-fast/lsh.cpp @@ -0,0 +1,531 @@ +#include <iostream> // std::cout +#include <algorithm> // std::max +#include <fstream> +#include <string.h> +#include <tuple> +#include <stdlib.h> +#include <vector> +#include <math.h> +#include <random> + +#define INF 1e20 // Pseudo-infinite number for this code +#define dist(x,y) ((x-y)*(x-y)) + +using namespace std; + +/// This function is from the UCRDTW code +/// +/// Calculate Dynamic Time Wrapping distance +/// A,B: data and query, respectively +/// cb : cummulative bound used for early abandoning +/// r : size of Sakoe-Chiba warpping band +double dtw(double* A, double* B, double *cb, int m, int r, double best_so_far) { + double *cost; + double *cost_prev; + double *cost_tmp; + int i, j, k; + double x, y, z, min_cost; + + /// Instead of using matrix of size O(m^2) or O(mr), we will reuse two arrays of size O(r). + cost = (double*) calloc(2 * r + 1, sizeof(double)); + cost_prev = (double*) calloc(2 * r + 1, sizeof(double)); + for (k = 0; k < 2 * r + 1; k++) { + cost[k] = INF; + cost_prev[k] = INF; + } + + for (i = 0; i < m; i++) { + k = max(0, r - i); + min_cost = INF; + + for (j = max(0, i - r); j <= min(m - 1, i + r); j++, k++) { + /// Initialize all row and column + if ((i == 0) && (j == 0)) { + cost[k] = dist(A[0], B[0]); + min_cost = cost[k]; + continue; + } + + if ((j - 1 < 0) || (k - 1 < 0)) { + y = INF; + } else { + y = cost[k - 1]; + } + if ((i - 1 < 0) || (k + 1 > 2 * r)) { + x = INF; + } else { + x = cost_prev[k + 1]; + } + if ((i - 1 < 0) || (j - 1 < 0)) { + z = INF; + } else { + z = cost_prev[k]; + } + + /// Classic DTW calculation + cost[k] = min( min( x, y) , z) + dist(A[i], B[j]); + + /// Find minimum cost in row for early abandoning (possibly to use column instead of row). + if (cost[k] < min_cost) { + min_cost = cost[k]; + } + } + + /// Move current array to previous array. + cost_tmp = cost; + cost = cost_prev; + cost_prev = cost_tmp; + } + k--; + + /// the DTW distance is in the last cell in the matrix of size O(m^2) or at the middle of our array. 
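+    /// After the last row swap above, cost_prev holds the final computed row of the banded
+    /// matrix, and k (decremented once after the loop) indexes its last valid entry, i.e.
+    /// the cell for (i, j) = (m-1, m-1).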
+ double final_dtw = cost_prev[k]; + free(cost); + free(cost_prev); + return final_dtw; +} + +/// Sorting function for the 2nd element of two vectors +bool sortbysecasc(const pair<int,double> &a, + const pair<int,double> &b) +{ + return a.second<b.second; +} + +/// Sorting function for the 2nd element of two vectors +bool sortbysecdesc(const pair<int,double> &a, + const pair<int,double> &b) +{ + return a.second>b.second; +} + +/// Get random value from uniform distribution +double uniform(double a, double b) +{ + return (double) std::rand() / (RAND_MAX) * (b - a) + a; +} + +/// Algorithm for calculating similarity of (multivariate) time series data +int lsh(double *** D, int M, int T, int Dim, double ** Q, int L, int K, double r, double a, double sd, int * &candidates, double * &distances, double **** &hashFunctions, double ** weights, int &nrOfCandidates) +{ + clock_t begin_time = clock(); + std::random_device rd{}; + std::mt19937 gen{rd()}; + std::normal_distribution<> distribution0{0,1}; + srand(time(NULL)); + + int S = 2; //3 + double delta = r * 0.75; //0.07 => 20.000 + int threshold = T * 0.22; //0.8 => 18/67 + + /// Create hash functions + int value; + for (int l=0;l<L;l++) + { + for (int k=0;k<K;k++) + { + for (int d=0;d<Dim;d++) + { + value = distribution0(gen); + for (int t=0;t<T;t++) + { + if (weights == NULL) + { + hashFunctions[l][k][t][d] = value; + } else { + hashFunctions[l][k][t][d] = value;// * weights[t][d]; + } + } + } + } + } + + /// Calculate ratios for DTW approximation + double ** ratios = (double **)malloc(L*sizeof(double*)); + for (int l=0;l<L;l++) + { + ratios[l] = (double *)malloc(K*sizeof(double)); + for (int k=0;k<K;k++) + { + ratios[l][k] = uniform(a-sd, a+sd); + } + } + + /// Create hashSET + double temp0 = 0; + double **** hashSET = (double****)malloc(M*sizeof(double***)) ; // Initialize DataD + for (int m=0;m<M;m++) + { + hashSET[m] = (double***)malloc(L*sizeof(double**)) ; // Initialize Data + for (int l=0;l<L;l++) + { + hashSET[m][l] = (double**)malloc(K*sizeof(double*)) ; // Initialize Data + for (int k=0;k<K;k++) + { + hashSET[m][l][k] = (double*)malloc(T*sizeof(double)) ; // Initialize Data + for (int t=0;t<T;t++) + { + temp0 = 0; + for (int d=0;d<Dim;d++) { + temp0 += hashFunctions[l][k][t][d] * D[m][t][d]; + } + hashSET[m][l][k][t] = temp0; + } + } + } + } + + /// Create upper and lower bound on query + double ** Q_U = (double**)malloc((T)*sizeof(double*)); + double ** Q_L = (double**)malloc((T)*sizeof(double*)); + for (int t=0;t<T;t++) + { + Q_U[t] = (double*)malloc((Dim)*sizeof(double)); + Q_L[t] = (double*)malloc((Dim)*sizeof(double)); + } + double currentMax = -10000; + double currentMin = 10000; + for (int t = 0; t < T; t++) { + for (int d = 0; d < Dim; d++) { + for (int s = -S; s <= S; s++) { + if (t + s < 0 || t + s > T - 1) + continue; + currentMax = std::max(Q_U[t + s][d], currentMax); + currentMin = std::min(Q_L[t + s][d], currentMin); + } + Q_U[t][d] = currentMax; + Q_L[t][d] = currentMin; + currentMax = -10000; + currentMin = 10000; + } + } + double temp1 = 0; + double temp2 = 0; + double temp3 = 0; + + /// Create query hashes + double *** hashQ = (double***)malloc(L*sizeof(double**)) ; // Initialize Data + double *** hashQ_U = (double***)malloc(L*sizeof(double**)) ; // Initialize Data + double *** hashQ_L = (double***)malloc(L*sizeof(double**)) ; // Initialize Data + for (int l=0;l<L;l++) + { + hashQ[l] = (double**)malloc(K*sizeof(double*)) ; // Initialize Data + hashQ_U[l] = (double**)malloc(K*sizeof(double*)) ; // Initialize Data + 
hashQ_L[l] = (double**)malloc(K*sizeof(double*)) ; // Initialize Data + + for (int k=0;k<K;k++) + { + hashQ[l][k] = (double*)malloc(T*sizeof(double)) ; // Initialize Data + hashQ_U[l][k] = (double*)malloc(T*sizeof(double)) ; // Initialize Data + hashQ_L[l][k] = (double*)malloc(T*sizeof(double)) ; // Initialize Data + for (int t=0;t<T;t++) + { + temp1 = 0; + temp2 = 0; + temp3 = 0; + for (int d=0;d<Dim;d++) + { + temp1 += hashFunctions[l][k][t][d] * Q[t][d]; + temp2 += hashFunctions[l][k][t][d] * Q_U[t][d]; + temp3 += hashFunctions[l][k][t][d] * Q_L[t][d]; + } + hashQ[l][k][t] = temp1; + hashQ_U[l][k][t] = temp2; + hashQ_L[l][k][t] = temp3; + } + } + } + + /// Generate candidates + vector<int> v; + v.reserve(M); + bool group_collision; + int hash_collision; + int projection_collision; + for (int m=0;m<M;m++) + { + group_collision = false; + for (int l=0;l<L;l++) + { + hash_collision = 0; + for (int k=0;k<K;k++) + { + projection_collision = 0; + for (int t=0;t<T;t++) + { + if (std::abs(hashQ[l][k][t]-hashSET[m][l][k][t]) <= delta/2 || + std::abs(hashQ_U[l][k][t]-hashSET[m][l][k][t]) <= delta/2 || + std::abs(hashQ_L[l][k][t]-hashSET[m][l][k][t]) <= delta/2 + ) + { + projection_collision++; + } + } + if (projection_collision>=threshold) + { + hash_collision++; + } + else + break; + } + if (hash_collision==K) + { + group_collision = true; + } + } + if (group_collision) + { + v.emplace_back(m); + } + } + clock_t end_time = clock(); + double time_spent = (double)(end_time - begin_time) / CLOCKS_PER_SEC; + std::cout << "Hashing done in: " << time_spent << " seconds" << std::endl; + std::cout << "Number of candidates pruned: " << 100 - (100 * v.size() / M) << "%" << std::endl; + + /// Estimate ranking using L2/DTW ratios + vector<pair<int, double>> sim; + sim.reserve(v.size()); + double lb; + double ub; + double lb_it; + double ub_it; + double q_lb = 0; +// double q_ub = sqrt(T * pow(r, 2.0)) * K * L; + double q_ub = 0; + for (int l=0;l<L;l++) + { + for (int k=0;k<K;k++) + { + q_ub += ratios[l][k] * sqrt(T * pow(r, 2.0)); + q_ub += ratios[l][k] * sqrt(T * pow(r, 2.0)); + q_ub += ratios[l][k] * sqrt(T * pow(r, 2.0)); + } + } + for (int m=0;m<v.size();m++) + { + lb = 0; + ub = 0; + for (int l=0;l<L;l++) + { + for (int k=0;k<K;k++) + { + lb_it = 0; + ub_it = 0; + for (int t=0;t<T;t++) + { + lb_it += pow(((2 * std::abs(hashQ[l][k][t]-hashSET[v[m]][l][k][t])) / delta), 2) * pow(r,2); + ub_it += pow(((2 * std::abs(hashQ[l][k][t]-hashSET[v[m]][l][k][t])) / delta) + 1, 2) * pow(r,2); + } + lb += ratios[l][k] * sqrt(lb_it); + ub += ratios[l][k] * sqrt(ub_it); +// lb += sqrt(lb_it); +// ub += sqrt(ub_it); + } + } + sim.emplace_back(v[m], (q_ub - lb)/(ub - q_lb)); + } + sort(sim.begin(), sim.end(), sortbysecdesc); + end_time = clock(); + time_spent = (double)(end_time - begin_time) / CLOCKS_PER_SEC; + std::cout << "Approximated ranking done in: " << time_spent << " seconds" << std::endl; + + + /// Calculate DTW distance on compressed MST data + vector<pair<int, double>> dtwsim; + dtwsim.reserve(sim.size()); + + double distance = 0; + for (int m=0;m<sim.size();m++) + { + distance = 0; + for (int l=0;l<L;l++) + { + for (int k = 0; k < K; k++) + { + distance += dtw(hashSET[sim[m].first][l][k], hashQ[l][k], NULL, T, 0.05 * 120, INF); + } + } + dtwsim.emplace_back(sim[m].first, distance); + } + end_time = clock(); + time_spent = (double)(end_time - begin_time) / CLOCKS_PER_SEC; + std::cout << "DTW sim done in: " << time_spent << std::endl; + + /// Sort and return distances + sort(dtwsim.begin(), dtwsim.end(), 
sortbysecasc); + int count = 0; + for (int m=0;m<sim.size();m++) + { + for (int n=0;n<50;n++) + { + if (sim[m].first == dtwsim[n].first) { + count++; + } + } + } + candidates = (int*)malloc(dtwsim.size()*sizeof(int)); + distances = (double*)malloc(dtwsim.size()*sizeof(double)); + for (int i=0; i<dtwsim.size(); i++) + { + candidates[i] = dtwsim[i].first; + distances[i] = dtwsim[i].second; + } + nrOfCandidates = dtwsim.size(); + + /// Clean up + v.clear(); + sim.clear(); + dtwsim.clear(); + for (int t=0;t<T;t++) + { + free(Q_U[t]); + free(Q_L[t]); + } + free(Q_U); + free(Q_L); + for (int m=0;m<M;m++) + { + for (int l=0;l<L;l++) + { + for (int k=0;k<K;k++) + { + free(hashSET[m][l][k]); + } + free(hashSET[m][l]); + } + free(hashSET[m]); + } + free(hashSET); + for (int l=0;l<L;l++) + { + for (int k=0;k<K;k++) + { + free(hashQ[l][k]); + free(hashQ_L[l][k]); + free(hashQ_U[l][k]); + } + free(ratios[l]); + free(hashQ[l]); + free(hashQ_U[l]); + free(hashQ_L[l]); + } + free(ratios); + free(hashQ); + free(hashQ_U); + free(hashQ_L); + return 0; +} + +int main() { + /// Run this function for debugging + srand((unsigned int)time(NULL)); + clock_t begin_time = clock(); + FILE *data; + int M = 125621; + int K = 2; //ln((ln(0.5))/ln(1-exp(-2*(0.1)^2*120)))/ln((1-exp(-2*(0.1)^2*120))/(0.5)) + int L = 6; //ln(0.05)/(ln(1-(1-exp(-2(0.1)^2*120))^4))0.0653330009864134 + int T = 120; + int Dim = 1; + + /// Read data + double *** D = (double***)malloc(M*sizeof(double**)) ; // Initialize Data + for (int i=0;i<M;i++) + { + D[i] = (double**)malloc((T)*sizeof(double*)); + for (int t=0;t<T;t++) + { + D[i][t] = (double*)malloc((Dim)*sizeof(double)); + } + } + data = fopen("processed-data","r"); + if( data == NULL ) + exit(2); + char d[M]; int i; int j = 0; float f; + while(fscanf(data,"%[^\n]\n",d) != EOF && j< M) + { + char *p = strtok(d," "); + i = 0; + while (p != NULL && i<T) + { + f = atof(p); + for (int d=0;d<Dim;d++) + { + D[j][i][d] = f; + } + i++; + p = strtok (NULL, " "); + } + j++; + } + + /// Print time + clock_t end_time = clock(); + double time_spent = (double)(end_time - begin_time) / CLOCKS_PER_SEC; + std::cout << "Data read in: " << time_spent << std::endl; + begin_time = clock(); + + /// Create query + double ** Q = (double**)malloc((T)*sizeof(double*)); + for (int t=0;t<T;t++) + { + Q[t] = (double*)malloc((Dim)*sizeof(double)); + for (int d=0;d<Dim;d++) + { + Q[t][d] = D[80503][t][d]; + } + } + + int testt = 0; + while (testt < 5) { + /// Initialize output variables + int *candidates; + int nrOfCandidates; + double *distances; + double ****hashFunctions = (double ****) malloc(L * sizeof(double ***)); + for (int l = 0; l < L; l++) { + hashFunctions[l] = (double ***) malloc(K * sizeof(double **)); + for (int k = 0; k < K; k++) { + hashFunctions[l][k] = (double **) malloc(T * sizeof(double *)); + for (int t = 0; t < T; t++) { + hashFunctions[l][k][t] = (double *) malloc(Dim * sizeof(double)); + } + } + } + double a = 3.1549093441462044; + double sd = 0.514583453532817; + double r = 0.21020966674727637; + + lsh(D, M, T, Dim, Q, L, K, r, a, sd, candidates, distances, hashFunctions, NULL, nrOfCandidates); + + cout << "Top-10 candidates: "; + for (int i=0; i<10; i++) + { + cout << candidates[i] << ","; + } + cout << endl; + +// for (int l = 0; l < L; l++) { +// for (int k = 0; k < K; k++) { +// for (int t = 0; t < T; t++) { +// free(hashFunctions[l][k][t]); +// } +// free(hashFunctions[l][k]); +// } +// free(hashFunctions[l]); +// } +// free(hashFunctions); +// cout << endl; +// free(candidates); +// 
free(distances); + + testt++; + } + + + end_time = clock(); + time_spent = (double)(end_time - begin_time) / CLOCKS_PER_SEC; + std::cout << "Query done in: " << time_spent << std::endl; + + return 0; +} \ No newline at end of file diff --git a/lsh-fast/lsh.h b/lsh-fast/lsh.h new file mode 100644 index 0000000000000000000000000000000000000000..4150199c32fa65938b3cd1d88e4b9a71856a1e3c --- /dev/null +++ b/lsh-fast/lsh.h @@ -0,0 +1,2 @@ + +int lsh(double *** D, int M, int T, int Dim, double ** Q, int L, int K, double r, double a, double sd, int * &candidates, double * &distances, double **** &hashFunctions, double ** weights, int &nrOfCandidates); diff --git a/lsh-fast/processed-data b/lsh-fast/processed-data new file mode 100644 index 0000000000000000000000000000000000000000..2441eba86a5487c53e53b515d5f7de09703922e6 Binary files /dev/null and b/lsh-fast/processed-data differ diff --git a/lsh-fast/query b/lsh-fast/query new file mode 100644 index 0000000000000000000000000000000000000000..946acaba2be2067e7ce8a037504b2e468ac8c6c9 --- /dev/null +++ b/lsh-fast/query @@ -0,0 +1,120 @@ +0.065966 +0.071220 +0.017980 +0.000000 +0.001635 +0.004670 +0.000234 +0.001051 +0.022067 +0.068418 +0.068652 +0.053357 +0.062347 +0.088150 +0.077175 +0.060245 +0.048920 +0.046935 +0.073088 +0.107414 +0.140222 +0.092820 +0.138354 +0.250555 +0.223118 +0.352365 +0.706830 +1.000000 +0.752482 +0.250321 +0.091302 +0.107531 +0.107064 +0.069119 +0.066316 +0.048920 +0.065849 +0.034676 +0.047402 +0.052072 +0.058727 +0.037011 +0.014828 +0.036778 +0.041331 +0.043199 +0.040864 +0.040280 +0.002452 +0.000000 +0.009340 +0.032808 +0.050788 +0.233509 +0.367192 +0.142324 +0.039463 +0.043549 +0.076007 +0.046935 +0.036778 +0.049504 +0.042148 +0.021249 +0.023468 +0.029656 +0.040864 +0.042499 +0.027904 +0.010858 +0.026153 +0.024985 +0.011092 +0.024168 +0.009924 +0.000000 +0.000000 +0.001284 +0.002335 +0.004320 +0.033275 +0.062230 +0.070870 +0.157735 +0.202569 +0.200817 +0.134851 +0.425804 +0.857912 +0.747345 +0.467717 +0.175599 +0.256626 +0.303211 +0.232808 +0.196848 +0.653825 +1.000000 +0.979685 +0.296089 +0.605721 +0.438062 +0.199066 +0.178634 +0.263048 +0.904845 +0.787158 +0.143841 +0.115820 +0.129364 +0.147227 +0.108582 +0.108698 +0.113252 +0.156801 +0.103328 +0.098541 +0.094337 +0.090134 +0.046118 diff --git a/lsh-fast/ucr b/lsh-fast/ucr new file mode 100755 index 0000000000000000000000000000000000000000..d274d1f4d550994b2ec3d9d1ab20cf95a64a01ca Binary files /dev/null and b/lsh-fast/ucr differ diff --git a/lsh-fast/ucrdtw.c b/lsh-fast/ucrdtw.c new file mode 100644 index 0000000000000000000000000000000000000000..c13631e9e3b92eb812a06923c9dbbd16b9320fc0 --- /dev/null +++ b/lsh-fast/ucrdtw.c @@ -0,0 +1,1056 @@ +/***********************************************************************/ +/************************* DISCLAIMER **********************************/ +/***********************************************************************/ +/** This UCR Suite software is copyright protected (c) 2012 by **/ +/** Thanawin Rakthanmanon, Bilson Campana, Abdullah Mueen, **/ +/** Gustavo Batista and Eamonn Keogh. **/ +/** **/ +/** Unless stated otherwise, all software is provided free of charge. **/ +/** As well, all software is provided on an "as is" basis without **/ +/** warranty of any kind, express or implied. 
Under no circumstances **/ +/** and under no legal theory, whether in tort, contract,or otherwise,**/ +/** shall Thanawin Rakthanmanon, Bilson Campana, Abdullah Mueen, **/ +/** Gustavo Batista, or Eamonn Keogh be liable to you or to any other **/ +/** person for any indirect, special, incidental, or consequential **/ +/** damages of any character including, without limitation, damages **/ +/** for loss of goodwill, work stoppage, computer failure or **/ +/** malfunction, or for any and all other damages or losses. **/ +/** **/ +/** If you do not agree with these terms, then you you are advised to **/ +/** not use this software. **/ +/***********************************************************************/ +/***********************************************************************/ +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <string.h> +#include <time.h> + +#include "ucrdtw.h" + +#define min(x,y) ((x)<(y)?(x):(y)) +#define max(x,y) ((x)>(y)?(x):(y)) +#define dist(x,y) ((x-y)*(x-y)) + +#define INF 1e20 // Pseudo-infinite number for this code + +/// Data structure for sorting the query +typedef struct index { + double value; + int index; +} index_t; + +/// Sorting function for the query, sort by abs(z_norm(q[i])) from high to low +int index_comp(const void* a, const void* b) { + index_t* x = (index_t*) a; + index_t* y = (index_t*) b; + double v = fabs(y->value) - fabs(x->value); // high to low + if (v < 0) return -1; + if (v > 0) return 1; + return 0; +} + +/// Data structure (circular array) for finding minimum and maximum for LB_Keogh envolop +typedef struct deque { + int* dq; + int size, capacity; + int f, r; +} deque_t; + +/// Initial the queue at the beginning step of envelope calculation +void deque_init(deque_t* d, int capacity) { + d->capacity = capacity; + d->size = 0; + d->dq = (int*) calloc(d->capacity, sizeof(int)); + d->f = 0; + d->r = d->capacity - 1; +} + +/// Destroy the queue +void deque_free(deque_t* d) { + free(d->dq); +} + +/// Insert to the queue at the back +void deque_push_back(deque_t* d, int v) { + d->dq[d->r] = v; + d->r--; + if (d->r < 0) + d->r = d->capacity - 1; + d->size++; +} + +/// Delete the current (front) element from queue +void deque_pop_front(deque_t* d) { + d->f--; + if (d->f < 0) + d->f = d->capacity - 1; + d->size--; +} + +/// Delete the last element from queue +void deque_pop_back(deque_t* d) { + d->r = (d->r + 1) % d->capacity; + d->size--; +} + +/// Get the value at the current position of the circular queue +int deque_front(deque_t* d) { + int aux = d->f - 1; + + if (aux < 0) + aux = d->capacity - 1; + return d->dq[aux]; +} + +/// Get the value at the last position of the circular queue +int deque_back(deque_t* d) { + int aux = (d->r + 1) % d->capacity; + return d->dq[aux]; +} + +/// Check whether or not the queue is empty +int deque_empty(deque_t* d) { + return d->size == 0; +} + +/// Finding the envelope of min and max value for LB_Keogh +/// Implementation idea is introduced by Danial Lemire in his paper +/// "Faster Retrieval with a Two-Pass Dynamic-Time-Warping Lower Bound", Pattern Recognition 42(9), 2009. 
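+/// Two monotonic deques are maintained, du for the running maximum and dl for the running minimum,
+/// so that u[j] = max(t[j-r .. j+r]) and l[j] = min(t[j-r .. j+r]) (clipped at the array ends) are
+/// produced in O(len) total time instead of O(len * r).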
+void lower_upper_lemire(double *t, int len, int r, double *l, double *u) { + struct deque du, dl; + int i; + + deque_init(&du, 2 * r + 2); + deque_init(&dl, 2 * r + 2); + + deque_push_back(&du, 0); + deque_push_back(&dl, 0); + + for (i = 1; i < len; i++) { + if (i > r) { + u[i - r - 1] = t[deque_front(&du)]; + l[i - r - 1] = t[deque_front(&dl)]; + } + if (t[i] > t[i - 1]) { + deque_pop_back(&du); + while (!deque_empty(&du) && t[i] > t[deque_back(&du)]) { + deque_pop_back(&du); + } + } else { + deque_pop_back(&dl); + while (!deque_empty(&dl) && t[i] < t[deque_back(&dl)]) { + deque_pop_back(&dl); + } + } + deque_push_back(&du, i); + deque_push_back(&dl, i); + if (i == 2 * r + 1 + deque_front(&du)) { + deque_pop_front(&du); + } else if (i == 2 * r + 1 + deque_front(&dl)) { + deque_pop_front(&dl); + } + } + for (i = len; i < len + r + 1; i++) { + u[i - r - 1] = t[deque_front(&du)]; + l[i - r - 1] = t[deque_front(&dl)]; + if (i - deque_front(&du) >= 2 * r + 1) { + deque_pop_front(&du); + } + if (i - deque_front(&dl) >= 2 * r + 1) { + deque_pop_front(&dl); + } + } + deque_free(&du); + deque_free(&dl); +} + +/// Calculate quick lower bound +/// Usually, LB_Kim take time O(m) for finding top,bottom,fist and last. +/// However, because of z-normalization the top and bottom cannot give significant benefits. +/// And using the first and last points can be computed in constant time. +/// The pruning power of LB_Kim is non-trivial, especially when the query is not long, say in length 128. +double lb_kim_hierarchy(double *t, double *q, int j, int len, double mean, double std, double best_so_far) { + /// 1 point at front and back + double d, lb; + double x0 = (t[j] - mean) / std; + double y0 = (t[(len - 1 + j)] - mean) / std; + lb = dist(x0,q[0]) + dist(y0, q[len - 1]); + if (lb >= best_so_far) + return lb; + + /// 2 points at front + double x1 = (t[(j + 1)] - mean) / std; + d = min(dist(x1,q[0]), dist(x0,q[1])); + d = min(d, dist(x1,q[1])); + lb += d; + if (lb >= best_so_far) + return lb; + + /// 2 points at back + double y1 = (t[(len - 2 + j)] - mean) / std; + d = min(dist(y1,q[len-1]), dist(y0, q[len-2])); + d = min(d, dist(y1,q[len-2])); + lb += d; + if (lb >= best_so_far) + return lb; + + /// 3 points at front + double x2 = (t[(j + 2)] - mean) / std; + d = min(dist(x0,q[2]), dist(x1, q[2])); + d = min(d, dist(x2,q[2])); + d = min(d, dist(x2,q[1])); + d = min(d, dist(x2,q[0])); + lb += d; + if (lb >= best_so_far) + return lb; + + /// 3 points at back + double y2 = (t[(len - 3 + j)] - mean) / std; + d = min(dist(y0,q[len-3]), dist(y1, q[len-3])); + d = min(d, dist(y2,q[len-3])); + d = min(d, dist(y2,q[len-2])); + d = min(d, dist(y2,q[len-1])); + lb += d; + + return lb; +} + +/// LB_Keogh 1: Create Envelope for the query +/// Note that because the query is known, envelope can be created once at the beginning. +/// +/// Variable Explanation, +/// order : sorted indices for the query. +/// uo, lo: upper and lower envelops for the query, which already sorted. +/// t : a circular array keeping the current data. +/// j : index of the starting location in t +/// cb : (output) current bound at each position. It will be used later for early abandoning in DTW. 
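+/// Each candidate point x = (t[order[i] + j] - mean) / std is z-normalized on the fly and compared
+/// against the sorted query envelope: the squared excess (x - uo[i])^2 when x lies above it, or
+/// (x - lo[i])^2 when x lies below it, is added to lb and stored in cb[order[i]]; the scan stops
+/// as soon as lb reaches best_so_far.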
+double lb_keogh_cumulative(int* order, double *t, double *uo, double *lo, double *cb, int j, int len, double mean, double std, double best_so_far) { + double lb = 0; + double x, d; + int i; + + for (i = 0; i < len && lb < best_so_far; i++) { + x = (t[(order[i] + j)] - mean) / std; + d = 0; + if (x > uo[i]) { + d = dist(x, uo[i]); + } else if (x < lo[i]) { + d = dist(x, lo[i]); + } + lb += d; + cb[order[i]] = d; + } + return lb; +} + +/// LB_Keogh 2: Create Envelop for the data +/// Note that the envelops have been created (in main function) when each data point has been read. +/// +/// Variable Explanation, +/// tz: Z-normalized data +/// qo: sorted query +/// cb: (output) current bound at each position. Used later for early abandoning in DTW. +/// l,u: lower and upper envelope of the current data +double lb_keogh_data_cumulative(int* order, double *tz, double *qo, double *cb, double *l, double *u, int len, double mean, double std, double best_so_far) { + double lb = 0; + double uu, ll, d; + int i; + + for (i = 0; i < len && lb < best_so_far; i++) { + uu = (u[order[i]] - mean) / std; + ll = (l[order[i]] - mean) / std; + d = 0; + if (qo[i] > uu) { + d = dist(qo[i], uu); + } else { + if (qo[i] < ll) { + d = dist(qo[i], ll); + } + } + lb += d; + cb[order[i]] = d; + } + return lb; +} + +/// Calculate Dynamic Time Wrapping distance +/// A,B: data and query, respectively +/// cb : cummulative bound used for early abandoning +/// r : size of Sakoe-Chiba warpping band +double dtw(double* A, double* B, double *cb, int m, int r, double best_so_far) { + double *cost; + double *cost_prev; + double *cost_tmp; + int i, j, k; + double x, y, z, min_cost; + + /// Instead of using matrix of size O(m^2) or O(mr), we will reuse two arrays of size O(r). + cost = (double*) calloc(2 * r + 1, sizeof(double)); + cost_prev = (double*) calloc(2 * r + 1, sizeof(double)); + for (k = 0; k < 2 * r + 1; k++) { + cost[k] = INF; + cost_prev[k] = INF; + } + + for (i = 0; i < m; i++) { + k = max(0, r - i); + min_cost = INF; + + for (j = max(0, i - r); j <= min(m - 1, i + r); j++, k++) { + /// Initialize all row and column + if ((i == 0) && (j == 0)) { + cost[k] = dist(A[0], B[0]); + min_cost = cost[k]; + continue; + } + + if ((j - 1 < 0) || (k - 1 < 0)) { + y = INF; + } else { + y = cost[k - 1]; + } + if ((i - 1 < 0) || (k + 1 > 2 * r)) { + x = INF; + } else { + x = cost_prev[k + 1]; + } + if ((i - 1 < 0) || (j - 1 < 0)) { + z = INF; + } else { + z = cost_prev[k]; + } + + /// Classic DTW calculation + cost[k] = min( min( x, y) , z) + dist(A[i], B[j]); + + /// Find minimum cost in row for early abandoning (possibly to use column instead of row). + if (cost[k] < min_cost) { + min_cost = cost[k]; + } + } + +// /// We can abandon early if the current cummulative distance with lower bound together are larger than best_so_far +// if (i + r < m - 1 && min_cost + cb[i + r + 1] >= best_so_far) { +// free(cost); +// free(cost_prev); +// return min_cost + cb[i + r + 1]; +// } + + /// Move current array to previous array. + cost_tmp = cost; + cost = cost_prev; + cost_prev = cost_tmp; + } + k--; + + /// the DTW distance is in the last cell in the matrix of size O(m^2) or at the middle of our array. + double final_dtw = cost_prev[k]; + free(cost); + free(cost_prev); + return final_dtw; +} + + +/// Calculate the nearest neighbor of a times series in a larger time series expressed as location and distance, +/// using the UCR suite optimizations. 
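+/// warp_width <= 1 is treated as a fraction of the query length, larger values as an absolute band
+/// width in points; e.g. warp_width = 0.05 with a query of length 120 gives a Sakoe-Chiba band of
+/// r = floor(0.05 * 120) = 6 positions on either side of the diagonal.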
+int ucrdtw(double* data, long long data_size, double* query, long query_size, double warp_width, int verbose, long long* location, double* distance) { + long m = query_size; + int r = warp_width <= 1 ? floor(warp_width * m) : floor(warp_width); + + double best_so_far; /// best-so-far + double *q, *t; /// data array + int *order; ///new order of the query + double *u, *l, *qo, *uo, *lo, *tz, *cb, *cb1, *cb2, *u_d, *l_d; + + double d = 0.0; + long long i, j; + double ex, ex2, mean, std; + + long long loc = 0; + double t1, t2; + int kim = 0, keogh = 0, keogh2 = 0; + double dist = 0, lb_kim = 0, lb_k = 0, lb_k2 = 0; + double *buffer, *u_buff, *l_buff; + index_t *q_tmp; + + /// For every EPOCH points, all cumulative values, such as ex (sum), ex2 (sum square), will be restarted for reducing the floating point error. + int EPOCH = 100000; + + if (verbose) { + t1 = clock(); + } + + /// calloc everything here + q = (double*) calloc(m, sizeof(double)); + if (q == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + memcpy((void*)q, (void*)query, m * sizeof(double)); + + qo = (double*) calloc(m, sizeof(double)); + if (qo == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + uo = (double*) calloc(m, sizeof(double)); + if (uo == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + lo = (double*) calloc(m, sizeof(double)); + if (lo == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + order = (int *) calloc(m, sizeof(int)); + if (order == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + q_tmp = (index_t *) calloc(m, sizeof(index_t)); + if (q_tmp == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + u = (double*) calloc(m, sizeof(double)); + if (u == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + l = (double*) calloc(m, sizeof(double)); + if (l == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + cb = (double*) calloc(m, sizeof(double)); + if (cb == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + } + + cb1 = (double*) calloc(m, sizeof(double)); + if (cb1 == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + cb2 = (double*) calloc(m, sizeof(double)); + if (cb2 == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + u_d = (double*) calloc(m, sizeof(double)); + if (u == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + l_d = (double*) calloc(m, sizeof(double)); + if (l == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + t = (double*) calloc(m, sizeof(double) * 2); + if (t == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + tz = (double*) calloc(m, sizeof(double)); + if (tz == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + buffer = (double*) calloc(EPOCH, sizeof(double)); + if (buffer == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + u_buff = (double*) calloc(EPOCH, sizeof(double)); + if (u_buff == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + l_buff = (double*) calloc(EPOCH, sizeof(double)); + if (l_buff == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + /// Read query + best_so_far = INF; + ex = ex2 = 0; + for (i = 0; i < m; i++) { + d = q[i]; + ex += d; + ex2 += d * d; + } + /// Do z-normalize the query, keep in same array, 
q + mean = ex / m; + std = ex2 / m; + std = sqrt(std - mean * mean); + for (i = 0; i < m; i++) + q[i] = (q[i] - mean) / std; + + /// Create envelope of the query: lower envelope, l, and upper envelope, u + lower_upper_lemire(q, m, r, l, u); + + /// Sort the query one time by abs(z-norm(q[i])) + for (i = 0; i < m; i++) { + q_tmp[i].value = q[i]; + q_tmp[i].index = i; + } + qsort(q_tmp, m, sizeof(index_t), index_comp); + + /// also create another arrays for keeping sorted envelope + for (i = 0; i < m; i++) { + int o = q_tmp[i].index; + order[i] = o; + qo[i] = q[o]; + uo[i] = u[o]; + lo[i] = l[o]; + } + + /// Initial the cummulative lower bound + for (i = 0; i < m; i++) { + cb[i] = 0; + cb1[i] = 0; + cb2[i] = 0; + } + + i = 0; /// current index of the data in current chunk of size EPOCH + j = 0; /// the starting index of the data in the circular array, t + ex = ex2 = 0; + int done = 0; + int it = 0, ep = 0, k = 0; + long long I; /// the starting index of the data in current chunk of size EPOCH + long long data_index = 0; + while (!done) { + /// Read first m-1 points + ep = 0; + if (it == 0) { + for (k = 0; k < m - 1 && data_index < data_size; k++) { + buffer[k] = data[data_index++]; + } + } else { + for (k = 0; k < m - 1; k++) + buffer[k] = buffer[EPOCH - m + 1 + k]; + } + + /// Read buffer of size EPOCH or when all data has been read. + ep = m - 1; + while (ep < EPOCH && data_index < data_size) { + buffer[ep] = data[data_index++]; + ep++; + } + + /// Data are read in chunk of size EPOCH. + /// When there is nothing to read, the loop is end. + if (ep <= m - 1) { + done = 1; + } else { + lower_upper_lemire(buffer, ep, r, l_buff, u_buff); + /// Do main task here.. + ex = 0; + ex2 = 0; + for (i = 0; i < ep; i++) { + /// A bunch of data has been read and pick one of them at a time to use + d = buffer[i]; + + /// Calcualte sum and sum square + ex += d; + ex2 += d * d; + + /// t is a circular array for keeping current data + t[i % m] = d; + + /// Double the size for avoiding using modulo "%" operator + t[(i % m) + m] = d; + + /// Start the task when there are more than m-1 points in the current chunk + if (i >= m - 1) { + mean = ex / m; + std = ex2 / m; + std = sqrt(std - mean * mean); + + /// compute the start location of the data in the current circular array, t + j = (i + 1) % m; + /// the start location of the data in the current chunk + I = i - (m - 1); + + /// Use a constant lower bound to prune the obvious subsequence + lb_kim = lb_kim_hierarchy(t, q, j, m, mean, std, best_so_far); + + if (lb_kim < best_so_far) { + /// Use a linear time lower bound to prune; z_normalization of t will be computed on the fly. + /// uo, lo are envelope of the query. + lb_k = lb_keogh_cumulative(order, t, uo, lo, cb1, j, m, mean, std, best_so_far); + if (lb_k < best_so_far) { + /// Take another linear time to compute z_normalization of t. + /// Note that for better optimization, this can merge to the previous function. + for (k = 0; k < m; k++) { + tz[k] = (t[(k + j)] - mean) / std; + } + + /// Use another lb_keogh to prune + /// qo is the sorted query. tz is unsorted z_normalized data. + /// l_buff, u_buff are big envelope for all data in this chunk + lb_k2 = lb_keogh_data_cumulative(order, tz, qo, cb2, l_buff + I, u_buff + I, m, mean, std, best_so_far); + if (lb_k2 < best_so_far) { + /// Choose better lower bound between lb_keogh and lb_keogh2 to be used in early abandoning DTW + /// Note that cb and cb2 will be cumulative summed here. 
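+                            /// Summing from the back gives cb[k] = sum of the per-position bounds
+                            /// for positions k..m-1, i.e. the cumulative bound that dtw() documents
+                            /// as cb for early abandoning.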
+ if (lb_k > lb_k2) { + cb[m - 1] = cb1[m - 1]; + for (k = m - 2; k >= 0; k--) + cb[k] = cb[k + 1] + cb1[k]; + } else { + cb[m - 1] = cb2[m - 1]; + for (k = m - 2; k >= 0; k--) + cb[k] = cb[k + 1] + cb2[k]; + } + + /// Compute DTW and early abandoning if possible + dist = dtw(tz, q, cb, m, r, best_so_far); + + if (dist < best_so_far) { /// Update best_so_far + /// loc is the real starting location of the nearest neighbor in the file + best_so_far = dist; + loc = (it) * (EPOCH - m + 1) + i - m + 1; + } + } else + keogh2++; + } else + keogh++; + } else + kim++; + + /// Reduce absolute points from sum and sum square + ex -= t[j]; + ex2 -= t[j] * t[j]; + } + } + + /// If the size of last chunk is less then EPOCH, then no more data and terminate. + if (ep < EPOCH) + done = 1; + else + it++; + } + } + + i = (it) * (EPOCH - m + 1) + ep; + free(q); + free(qo); + free(uo); + free(lo); + free(order); + free(q_tmp); + free(u); + free(l); + free(cb); + free(cb1); + free(cb2); + free(u_d); + free(l_d); + free(t); + free(tz); + free(buffer); + free(u_buff); + free(l_buff); + + if (verbose) { + t2 = clock(); + printf("Location : %lld\n", loc); + printf("Distance : %.6f\n", sqrt(best_so_far)); + printf("Data Scanned : %lld\n", i); + printf("Total Execution Time : %.4f secs\n", (t2 - t1) / CLOCKS_PER_SEC); + printf("\n"); + printf("Pruned by LB_Kim : %6.2f%%\n", ((double) kim / i) * 100); + printf("Pruned by LB_Keogh : %6.2f%%\n", ((double) keogh / i) * 100); + printf("Pruned by LB_Keogh2 : %6.2f%%\n", ((double) keogh2 / i) * 100); + printf("DTW Calculation : %6.2f%%\n", 100 - (((double) kim + keogh + keogh2) / i * 100)); + } + *location = loc; + *distance = sqrt(best_so_far); + return 0; +} + +/// Calculate the nearest neighbor of a times series in a larger time series expressed as location and distance, +/// using the UCR suite optimizations. This function supports streaming the data and the query to search. +int ucrdtwf(FILE* data_file, FILE* query_file, long query_length, double warp_width, int verbose, long long* location, double* distance) { + FILE* fp = data_file; + FILE* qp = query_file; + long m = query_length; + int r = warp_width <= 1 ? floor(warp_width * m) : floor(warp_width); + + double best_so_far; /// best-so-far + double *t, *q; /// data array and query array + int *order; ///new order of the query + double *u, *l, *qo, *uo, *lo, *tz, *cb, *cb1, *cb2, *u_d, *l_d; + + double d; + long long i, j; + double ex, ex2, mean, std; + + long long loc = 0; + double t1, t2; + int kim = 0, keogh = 0, keogh2 = 0; + double dist = 0, lb_kim = 0, lb_k = 0, lb_k2 = 0; + double *buffer, *u_buff, *l_buff; + index_t *q_tmp; + + /// For every EPOCH points, all cumulative values, such as ex (sum), ex2 (sum square), will be restarted for reducing the floating point error. 
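+    /// The subsequence mean and standard deviation are derived incrementally from these running sums
+    /// (mean = ex / m, std = sqrt(ex2 / m - mean * mean)), so each candidate window is z-normalized
+    /// without being rescanned.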
+ int EPOCH = 100000; + + if (verbose) { + t1 = clock(); + } + + /// calloc everything here + q = (double*) calloc(m, sizeof(double)); + if (q == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + qo = (double*) calloc(m, sizeof(double)); + if (qo == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + uo = (double*) calloc(m, sizeof(double)); + if (uo == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + lo = (double*) calloc(m, sizeof(double)); + if (lo == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + order = (int*) calloc(m, sizeof(int)); + if (order == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + q_tmp = (index_t*) calloc(m, sizeof(index_t)); + if (q_tmp == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + u = (double*) calloc(m, sizeof(double)); + if (u == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + l = (double*) calloc(m, sizeof(double)); + if (l == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + cb = (double*) calloc(m, sizeof(double)); + if (cb == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + } + + cb1 = (double*) calloc(m, sizeof(double)); + if (cb1 == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + cb2 = (double*) calloc(m, sizeof(double)); + if (cb2 == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + u_d = (double*) calloc(m, sizeof(double)); + if (u == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + l_d = (double*) calloc(m, sizeof(double)); + if (l == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + t = (double*) calloc(m, sizeof(double) * 2); + if (t == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + tz = (double*) calloc(m, sizeof(double)); + if (tz == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + buffer = (double*) calloc(EPOCH, sizeof(double)); + if (buffer == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + u_buff = (double*) calloc(EPOCH, sizeof(double)); + if (u_buff == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + l_buff = (double*) calloc(EPOCH, sizeof(double)); + if (l_buff == NULL) { + printf("ERROR: Memory can't be allocated!\n"); + return -1; + } + + /// Read query file + best_so_far = INF; + i = 0; + j = 0; + ex = ex2 = 0; + + while (fscanf(qp, "%lf", &d) != EOF && i < m) { + ex += d; + ex2 += d * d; + q[i] = d; + i++; + } + + /// Do z-normalize the query, keep in same array, q + mean = ex / m; + std = ex2 / m; + std = sqrt(std - mean * mean); + for (i = 0; i < m; i++) + q[i] = (q[i] - mean) / std; + + /// Create envelope of the query: lower envelope, l, and upper envelope, u + lower_upper_lemire(q, m, r, l, u); + + /// Sort the query one time by abs(z-norm(q[i])) + for (i = 0; i < m; i++) { + q_tmp[i].value = q[i]; + q_tmp[i].index = i; + } + qsort(q_tmp, m, sizeof(index_t), index_comp); + + /// also create another arrays for keeping sorted envelope + for (i = 0; i < m; i++) { + int o = q_tmp[i].index; + order[i] = o; + qo[i] = q[o]; + uo[i] = u[o]; + lo[i] = l[o]; + } + + /// Initial the cummulative lower bound + for (i = 0; i < m; i++) { + cb[i] = 0; + cb1[i] = 0; + cb2[i] = 0; + } + + i = 0; /// current index of the data in current chunk of size EPOCH + j = 0; /// the starting 
index of the data in the circular array, t + ex = ex2 = 0; + int done = 0; + int it = 0, ep = 0, k = 0; + long long I; /// the starting index of the data in current chunk of size EPOCH + + while (!done) { + /// Read first m-1 points + ep = 0; + if (it == 0) { + for (k = 0; k < m - 1; k++) + if (fscanf(fp, "%lf", &d) != EOF) + buffer[k] = d; + } else { + for (k = 0; k < m - 1; k++) + buffer[k] = buffer[EPOCH - m + 1 + k]; + } + + /// Read buffer of size EPOCH or when all data has been read. + ep = m - 1; + while (ep < EPOCH) { + if (fscanf(fp, "%lf", &d) == EOF) + break; + buffer[ep] = d; + ep++; + } + + /// Data are read in chunk of size EPOCH. + /// When there is nothing to read, the loop is end. + if (ep <= m - 1) { + done = 1; + } else { + lower_upper_lemire(buffer, ep, r, l_buff, u_buff); + /// Do main task here.. + ex = 0; + ex2 = 0; + for (i = 0; i < ep; i++) { + /// A bunch of data has been read and pick one of them at a time to use + d = buffer[i]; + + /// Calcualte sum and sum square + ex += d; + ex2 += d * d; + + /// t is a circular array for keeping current data + t[i % m] = d; + + /// Double the size for avoiding using modulo "%" operator + t[(i % m) + m] = d; + + /// Start the task when there are more than m-1 points in the current chunk + if (i >= m - 1) { + mean = ex / m; + std = ex2 / m; + std = sqrt(std - mean * mean); + + /// compute the start location of the data in the current circular array, t + j = (i + 1) % m; + /// the start location of the data in the current chunk + I = i - (m - 1); + + /// Use a constant lower bound to prune the obvious subsequence + lb_kim = lb_kim_hierarchy(t, q, j, m, mean, std, best_so_far); + + if (lb_kim < best_so_far) { + /// Use a linear time lower bound to prune; z_normalization of t will be computed on the fly. + /// uo, lo are envelope of the query. + lb_k = lb_keogh_cumulative(order, t, uo, lo, cb1, j, m, mean, std, best_so_far); + if (lb_k < best_so_far) { + /// Take another linear time to compute z_normalization of t. + /// Note that for better optimization, this can merge to the previous function. + for (k = 0; k < m; k++) { + tz[k] = (t[(k + j)] - mean) / std; + } + + /// Use another lb_keogh to prune + /// qo is the sorted query. tz is unsorted z_normalized data. + /// l_buff, u_buff are big envelope for all data in this chunk + lb_k2 = lb_keogh_data_cumulative(order, tz, qo, cb2, l_buff + I, u_buff + I, m, mean, std, best_so_far); + if (lb_k2 < best_so_far) { + /// Choose better lower bound between lb_keogh and lb_keogh2 to be used in early abandoning DTW + /// Note that cb and cb2 will be cumulative summed here. + if (lb_k > lb_k2) { + cb[m - 1] = cb1[m - 1]; + for (k = m - 2; k >= 0; k--) + cb[k] = cb[k + 1] + cb1[k]; + } else { + cb[m - 1] = cb2[m - 1]; + for (k = m - 2; k >= 0; k--) + cb[k] = cb[k + 1] + cb2[k]; + } + + /// Compute DTW and early abandoning if possible + dist = dtw(tz, q, cb, m, r, best_so_far); + + if (dist < best_so_far) { /// Update best_so_far + /// loc is the real starting location of the nearest neighbor in the file + best_so_far = dist; + loc = (it) * (EPOCH - m + 1) + i - m + 1; + } + } else + keogh2++; + } else + keogh++; + } else + kim++; + + /// Reduce absolute points from sum and sum square + ex -= t[j]; + ex2 -= t[j] * t[j]; + } + } + + /// If the size of last chunk is less then EPOCH, then no more data and terminate. 
+ if (ep < EPOCH) + done = 1; + else + it++; + } + } + + i = (it) * (EPOCH - m + 1) + ep; + + free(q); + free(qo); + free(uo); + free(lo); + free(order); + free(q_tmp); + free(u); + free(l); + free(cb); + free(cb1); + free(cb2); + free(u_d); + free(l_d); + free(t); + free(buffer); + free(u_buff); + free(l_buff); + + if (verbose) { + t2 = clock(); + printf("Data Scanned : %lld\n", i); + printf("Total Execution Time : %.4f secs\n", (t2 - t1) / CLOCKS_PER_SEC); + printf("\n"); + printf("Pruned by LB_Kim : %6.2f%%\n", ((double) kim / i) * 100); + printf("Pruned by LB_Keogh : %6.2f%%\n", ((double) keogh / i) * 100); + printf("Pruned by LB_Keogh2 : %6.2f%%\n", ((double) keogh2 / i) * 100); + printf("DTW Calculation : %6.2f%%\n", 100 - (((double) kim + keogh + keogh2) / i * 100)); + } + *location = loc; + *distance = sqrt(best_so_far); + return 0; +} + +//int main(int argc, char** argv) { +// if (argc < 5) { +// printf("Error: Invalid arguments\n"); +// printf("Command usage: dtwfind <data-file> <query-file> <query-length> <warp-width>\n\n"); +// printf("For example: dtwfind data.txt query.txt 128 0.05\n"); +// return -2; +// } +// +// FILE* data_file = fopen(argv[1], "r"); +// if (data_file == NULL) { +// printf("ERROR: File not found: %s\n", argv[1]); +// } +// +// FILE* query_file = fopen(argv[2], "r"); +// if (query_file == NULL) { +// printf("ERROR: File not found: %s\n", argv[2]); +// } +// +// long long location = -1; +// double distance = -1; +// int ret = ucrdtwf(data_file, query_file, atoll(argv[3]), atof(argv[4]), 1, &location, &distance); +// printf("Location: %lld\nDistance: %.6f\n", location, distance); +// return ret; +//} diff --git a/lsh-fast/ucrdtw.h b/lsh-fast/ucrdtw.h new file mode 100644 index 0000000000000000000000000000000000000000..58acfe82cd51180f37d9f70b4b8fa904907c35af --- /dev/null +++ b/lsh-fast/ucrdtw.h @@ -0,0 +1,6 @@ + +int ucrdtw(double* data, long long data_size, double* query, long query_size, double warp_width, int verbose, long long* location, double* distance); + +int ucrdtwf(FILE* data_stream, FILE* query_stream, long query_length, double warp_width, int verbose, long long* location, double* distance); + +double dtw(double* A, double* B, double *cb, int m, int r, double best_so_far);
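As a reading aid for the K and L expressions at the top of lsh_lsh() in _lsh.cpp (also echoed in the comments of lsh.cpp's main()), the following is a minimal standalone C++ sketch of the same formulas with the repeated sub-expression factored out. The constants 0.5, 0.05 and 0.1 are copied verbatim from the source; T = 120 is only an illustrative choice matching the bundled query file.

#include <cmath>
#include <cstdio>

// Sketch of the K/L formulas from lsh_lsh() in _lsh.cpp, with the repeated
// sub-expression p = 1 - exp(-2 * 0.1 * 0.1 * T) factored out for readability.
int main() {
    const int T = 120;  // query length; the bundled query file holds 120 samples
    const double p = 1.0 - std::exp(-2.0 * 0.1 * 0.1 * T);
    const int K = (int) std::ceil(std::log(std::log(0.5) / std::log(p)) / std::log(p / 0.5));
    const int L = (int) std::ceil(std::log(0.05) / std::log(1.0 - std::pow(p, K)));
    std::printf("T=%d  p=%.4f  K=%d  L=%d\n", T, p, K, L);
    return 0;
}

Compiled with g++ and run, the sketch prints the derived K and L, which can be compared with the values hard-coded in lsh.cpp's main() (K = 2, L = 6).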