Commit 822a152f authored by Kruyff,D.L.W. (Dylan)'s avatar Kruyff,D.L.W. (Dylan)
Browse files

Fix lsh-fast folder not working

parent 9353b768
lsh-fast @ 994e1549
Subproject commit 994e1549030ad71d7164ed559b5cd1ab8733ec1a
# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/
<?xml version="1.0" encoding="UTF-8"?>
<module classpath="CMake" type="CPP_MODULE" version="4" />
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="CMakeWorkspace" PROJECT_DIR="$PROJECT_DIR$" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/lsh-fast.iml" filepath="$PROJECT_DIR$/.idea/lsh-fast.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$/.." vcs="Git" />
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
\ No newline at end of file
OBJ = clean lsh
CC = g++
all: clean matmult execute
matmult:
$(CC) lsh.cpp -o lsh -O3
clean:
/bin/rm -rf $(OBJ) core*
veryclean: clean
/bin/rn -f *~
execute:
./lsh
\ No newline at end of file
#include <Python.h>
#include <numpy/arrayobject.h>
#include "lsh.h"
#include <math.h>
/* Docstrings */
static char module_docstring[] = "This module implements fast nearest-neighbor retrieval using LSH.";
static char lsh_docstring[] = "Calculate the closest neightbours with distances given a query window.";
/* Available functions */
static PyObject* lsh_lsh(PyObject *self, PyObject *args);
/* Module specification */
static PyMethodDef module_methods[] = { { "lsh", lsh_lsh, METH_VARARGS, lsh_docstring }, { NULL, NULL, 0, NULL } };
/* Initialize the module */
#if PY_MAJOR_VERSION >= 3
#define MOD_ERROR_VAL NULL
#define MOD_SUCCESS_VAL(val) val
#define MOD_INIT(name) PyMODINIT_FUNC PyInit_##name(void)
#define MOD_DEF(ob, name, doc, methods) \
static struct PyModuleDef moduledef = { \
PyModuleDef_HEAD_INIT, name, doc, -1, methods, }; \
ob = PyModule_Create(&moduledef);
#else
#define MOD_ERROR_VAL
#define MOD_SUCCESS_VAL(val)
#define MOD_INIT(name) void init##name(void)
#define MOD_DEF(ob, name, doc, methods) \
ob = Py_InitModule3(name, methods, doc);
#endif
MOD_INIT(_lsh)
{
PyObject *m;
MOD_DEF(m, "_lsh", module_docstring,
module_methods)
if (m == NULL)
return MOD_ERROR_VAL;
import_array();
return MOD_SUCCESS_VAL(m);
}
static PyObject* lsh_lsh(PyObject* self, PyObject* args) {
PyObject* data_obj = NULL;
PyObject* query_obj = NULL;
PyObject* hash_obj = NULL;
double **** hashFunctions;
double ** weights = NULL;
double*** data;
double ** query;
int * candidates;
double * distances;
int nrOfCandidates;
double r;
double a;
double sd;
PyArray_Descr *descr;
descr = PyArray_DescrFromType(NPY_DOUBLE);
PyArray_Descr *descr_float;
descr_float = PyArray_DescrFromType(NPY_FLOAT64);
/// Parse the input tuple
if (!PyArg_ParseTuple(args, "OOddd|O", &data_obj, &query_obj, &r, &a, &sd, &hash_obj)) {
return NULL;
}
/// Get the dimensions of the data and query
int data_size = (long) PyArray_DIM(data_obj, 0);
int channel_size = (long) PyArray_DIM(data_obj, 2);
int query_size = (int) PyArray_DIM(query_obj, 0);
/// Convert data, query and weights to C array
npy_intp dims0[3];
if (PyArray_AsCArray(&query_obj, (void **)&query, dims0, 2, descr) < 0 || PyArray_AsCArray(&data_obj, (void ***)&data, dims0, 3, descr) < 0) {
PyErr_SetString(PyExc_TypeError, "error converting to c array");
return NULL;
}
if (hash_obj != NULL)
{
printf("Using weights");
if (PyArray_AsCArray(&hash_obj, (void **)&weights, dims0, 2, descr) < 0) {
PyErr_SetString(PyExc_TypeError, "error converting weights to c array");
return NULL;
}
}
int K = ceil(log((log(0.5))/log(1-exp(-2*(0.1)*(0.1)*query_size)))/log((1-exp(-2*(0.1)*(0.1)*query_size))/(0.5)));
int L = ceil(log(0.05)/(log(1-pow(1-exp(-2*(0.1)*(0.1)*query_size), K))));
printf("K: %d\n", K);
printf("L: %d\n", L);
printf("Dim: %d\n", channel_size);
/// Initialize output parameters
hashFunctions = (double ****)malloc(L*sizeof(double***));
for (int l=0;l<L;l++)
{
hashFunctions[l] = (double ***)malloc(K*sizeof(double**));
for (int k=0;k<K;k++)
{
hashFunctions[l][k] = (double **)malloc(query_size*sizeof(double*));
for (int t=0;t<query_size;t++)
{
hashFunctions[l][k][t] = (double *)malloc(channel_size*sizeof(double));
}
}
}
int status = lsh(data, data_size, query_size, channel_size, query, L, K, r, a, sd, candidates, distances, hashFunctions, weights, nrOfCandidates);
if (status) {
PyErr_SetString(PyExc_RuntimeError, "lsh could not allocate memory");
return NULL;
}
npy_intp dimscandidates[1] = {nrOfCandidates};
printf("Number of candidates: %d\n", nrOfCandidates);
npy_intp dims4[4] = {L, K, query_size, channel_size};
// PyArrayObject* numpy_candidates = (PyArrayObject*)PyArray_SimpleNewFromData(1, dimscandidates, NPY_INT, (void*)&candidates);
// PyArrayObject* numpy_distances = (PyArrayObject*)PyArray_SimpleNewFromData(1, dimsdistance, NPY_DOUBLE, (void*)&distances);
// https://github.com/suiyun0234/scipy-master/commit/da7dfc7aad8daa7a516e43f4c7001eea7c1a707e
// https://github.com/fjean/pymeanshift/commit/1ba90da647342184ea7df378d7f21eba257a51d9
PyArrayObject* numpy_candidates = (PyArrayObject*)PyArray_SimpleNew(1, dimscandidates, NPY_INT);
memcpy(PyArray_DATA(numpy_candidates), candidates, dimscandidates[0]*sizeof(int));
PyArrayObject* numpy_distances = (PyArrayObject*)PyArray_SimpleNew(1, dimscandidates, NPY_DOUBLE);
memcpy(PyArray_DATA(numpy_distances), distances, dimscandidates[0]*sizeof(double));
PyArrayObject* TEST = (PyArrayObject*)PyArray_SimpleNew(1, dimscandidates, NPY_DOUBLE);
memcpy(PyArray_DATA(TEST), distances, dimscandidates[0]*sizeof(double));
// PyArrayObject* numpy_hash_functions = (PyArrayObject*)PyArray_SimpleNew(4, dims4, NPY_DOUBLE);
// double* numpy_hash_functions_data = (double*)PyArray_DATA(numpy_hash_functions);
// for (int l=0;l<L;l++)
// {
// for (int k=0;k<K;k++)
// {
// for (int t=0;t<query_size;t++)
// {
// memcpy(numpy_hash_functions_data, hashFunctions[l][k][t], channel_size*sizeof(double));
// numpy_hash_functions_data += channel_size;
// }
// }
// }
PyObject* ret = Py_BuildValue("NNN", PyArray_Return(numpy_candidates), PyArray_Return(numpy_distances), PyArray_Return(TEST));// PyArray_Return(numpy_hash_functions));
// Py_XDECREF(data_obj);
// Py_XDECREF(query_obj);
// Py_XDECREF(hash_obj);
// Py_XDECREF(weights);
// Py_XDECREF(data);
// Py_XDECREF(query);
// Py_XDECREF(descr);
// Py_XDECREF(descr_float);
// Py_XDECREF(query);
// Py_XDECREF(numpy_candidates);
// Py_XDECREF(numpy_distances);
// Py_XDECREF(TEST);
// free(candidates);
// free(distances);
// for (int l=0;l<L;l++)
// {
// for (int k=0;k<K;k++)
// {
// for (int t=0;t<query_size;t++)
// {
// free(hashFunctions[l][k][t]);
// }
// free(hashFunctions[l][k]);
// }
// free(hashFunctions[l]);
// }
// free(hashFunctions);
return ret;
}
#include <Python.h>
#include <numpy/arrayobject.h>
#include "ucrdtw.h"
/* Docstrings */
static char module_docstring[] = "This module implements fast nearest-neighbor retrieval under the dynamic time warping (DTW).";
static char ucrdtw_docstring[] = "Calculate the nearest neighbor of a times series in a larger time series expressed as location and distance, "
"using the UCR suite optimizations.";
/* Available functions */
static PyObject* ucrdtw_ucrdtw(PyObject *self, PyObject *args);
/* Module specification */
static PyMethodDef module_methods[] = { { "ucrdtw", ucrdtw_ucrdtw, METH_VARARGS, ucrdtw_docstring }, { NULL, NULL, 0, NULL } };
/* Initialize the module */
#if PY_MAJOR_VERSION >= 3
#define MOD_ERROR_VAL NULL
#define MOD_SUCCESS_VAL(val) val
#define MOD_INIT(name) PyMODINIT_FUNC PyInit_##name(void)
#define MOD_DEF(ob, name, doc, methods) \
static struct PyModuleDef moduledef = { \
PyModuleDef_HEAD_INIT, name, doc, -1, methods, }; \
ob = PyModule_Create(&moduledef);
#else
#define MOD_ERROR_VAL
#define MOD_SUCCESS_VAL(val)
#define MOD_INIT(name) void init##name(void)
#define MOD_DEF(ob, name, doc, methods) \
ob = Py_InitModule3(name, methods, doc);
#endif
MOD_INIT(_ucrdtw)
{
PyObject *m;
MOD_DEF(m, "_ucrdtw", module_docstring,
module_methods)
if (m == NULL)
return MOD_ERROR_VAL;
import_array();
return MOD_SUCCESS_VAL(m);
}
/*
Calculate the nearest neighbor of a times series in a larger time series expressed as location and distance,
using the UCR suite optimizations.
Arguments:
data - a list of floats or an ndarray for the time series to process
query - a list of floats or an ndarray for the time series to search for
warp_width - Allowed warp width as a fraction of query size
verbose - Optional boolean to print stats
*/
static PyObject* ucrdtw_ucrdtw(PyObject* self, PyObject* args) {
PyObject* data_obj = NULL;
PyObject* query_obj = NULL;
double warp_width = -1;
PyObject* verbose_obj = NULL;
/* Parse the input tuple */
if (!PyArg_ParseTuple(args, "OOd|O", &data_obj, &query_obj, &warp_width, &verbose_obj)) {
return NULL;
}
/* Interpret the input objects as numpy arrays. */
if (!PyList_Check(data_obj) && !PyArray_Check(data_obj)) {
PyErr_SetString(PyExc_TypeError, "Data argument must be a list or ndarray");
return NULL;
}
// PyObject* data_array = PyArray_FROM_OTF(data_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
// if (data_array == NULL) {
// Py_XDECREF(data_array);
// PyErr_SetString(PyExc_TypeError, "Data argument must be a list or ndarray");
// return NULL;
// }
double** data;
PyArray_Descr *descr;
descr = PyArray_DescrFromType(NPY_DOUBLE);
npy_intp dims0[3];
if (PyArray_AsCArray(&data_obj, (void **)&data, dims0, 2, descr) < 0) {
PyErr_SetString(PyExc_TypeError, "error converting to c array");
return NULL;
}
if (!PyList_Check(query_obj) && !PyArray_Check(query_obj)) {
Py_XDECREF(data);
PyErr_SetString(PyExc_TypeError, "Query argument must be a list or ndarray");
return NULL;
}
PyObject* query_array = PyArray_FROM_OTF(query_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
if (query_array == NULL) {
Py_XDECREF(data);
PyErr_SetString(PyExc_TypeError, "Query argument must be a list or ndarray");
return NULL;
}
/* Get pointers to the data as C-types. */
int data_size = (long) PyArray_DIM(data_obj, 0);
double* query = (double*) PyArray_DATA(query_array);
int query_size = (int) PyArray_DIM(query_array, 0);
int verbose = verbose_obj != NULL ? PyObject_IsTrue(verbose_obj) : 0;
/* Call the external C function to compute the best DTW location and distance. */
long long location = -1;
double distance = -1;
int status = 0;
for (int i=0; i<data_size; i++)
{
status = ucrdtw(data[i], query_size, query, query_size, warp_width, verbose, &location, &distance);
}
/* Clean up. */
Py_XDECREF(data);
Py_XDECREF(query_array);
if (status) {
PyErr_SetString(PyExc_RuntimeError, "ucrdtw could not allocate memory");
return NULL;
}
/* Build the output tuple */
PyObject* ret = Py_BuildValue("Ld", location, distance);
return ret;
}
CMakeLists.txt not found in /home/dev-laptop/Dylan/locality-sensitive-hashing-visual-analytics/lsh-fast Select CMakeLists.txt file...
File added
#include <iostream> // std::cout
#include <algorithm> // std::max
#include <fstream>
#include <string.h>
#include <tuple>
#include <stdlib.h>
#include <vector>
#include <math.h>
#include <random>
#define INF 1e20 // Pseudo-infinite number for this code
#define dist(x,y) ((x-y)*(x-y))
using namespace std;
/// This function is from the UCRDTW code
///
/// Calculate Dynamic Time Wrapping distance
/// A,B: data and query, respectively
/// cb : cummulative bound used for early abandoning
/// r : size of Sakoe-Chiba warpping band
double dtw(double* A, double* B, double *cb, int m, int r, double best_so_far) {
double *cost;
double *cost_prev;
double *cost_tmp;
int i, j, k;
double x, y, z, min_cost;
/// Instead of using matrix of size O(m^2) or O(mr), we will reuse two arrays of size O(r).
cost = (double*) calloc(2 * r + 1, sizeof(double));
cost_prev = (double*) calloc(2 * r + 1, sizeof(double));
for (k = 0; k < 2 * r + 1; k++) {
cost[k] = INF;
cost_prev[k] = INF;
}
for (i = 0; i < m; i++) {
k = max(0, r - i);
min_cost = INF;
for (j = max(0, i - r); j <= min(m - 1, i + r); j++, k++) {
/// Initialize all row and column
if ((i == 0) && (j == 0)) {
cost[k] = dist(A[0], B[0]);
min_cost = cost[k];
continue;
}
if ((j - 1 < 0) || (k - 1 < 0)) {
y = INF;
} else {
y = cost[k - 1];
}
if ((i - 1 < 0) || (k + 1 > 2 * r)) {
x = INF;
} else {
x = cost_prev[k + 1];
}
if ((i - 1 < 0) || (j - 1 < 0)) {
z = INF;
} else {
z = cost_prev[k];
}
/// Classic DTW calculation
cost[k] = min( min( x, y) , z) + dist(A[i], B[j]);
/// Find minimum cost in row for early abandoning (possibly to use column instead of row).
if (cost[k] < min_cost) {
min_cost = cost[k];
}
}
/// Move current array to previous array.
cost_tmp = cost;
cost = cost_prev;
cost_prev = cost_tmp;
}
k--;
/// the DTW distance is in the last cell in the matrix of size O(m^2) or at the middle of our array.
double final_dtw = cost_prev[k];
free(cost);
free(cost_prev);
return final_dtw;
}
/// Sorting function for the 2nd element of two vectors
bool sortbysecasc(const pair<int,double> &a,
const pair<int,double> &b)
{
return a.second<b.second;
}
/// Sorting function for the 2nd element of two vectors
bool sortbysecdesc(const pair<int,double> &a,
const pair<int,double> &b)
{
return a.second>b.second;
}
/// Get random value from uniform distribution
double uniform(double a, double b)
{
return (double) std::rand() / (RAND_MAX) * (b - a) + a;
}
/// Algorithm for calculating similarity of (multivariate) time series data
int lsh(double *** D, int M, int T, int Dim, double ** Q, int L, int K, double r, double a, double sd, int * &candidates, double * &distances, double **** &hashFunctions, double ** weights, int &nrOfCandidates)
{
clock_t begin_time = clock();
std::random_device rd{};
std::mt19937 gen{rd()};
std::normal_distribution<> distribution0{0,1};
srand(time(NULL));
int S = 2; //3
double delta = r * 0.75; //0.07 => 20.000
int threshold = T * 0.22; //0.8 => 18/67
/// Create hash functions
int value;
for (int l=0;l<L;l++)
{
for (int k=0;k<K;k++)
{
for (int d=0;d<Dim;d++)
{
value = distribution0(gen);
for (int t=0;t<T;t++)
{
if (weights == NULL)
{
hashFunctions[l][k][t][d] = value;
} else {
hashFunctions[l][k][t][d] = value;// * weights[t][d];
}
}
}
}
}
/// Calculate ratios for DTW approximation
double ** ratios = (double **)malloc(L*sizeof(double*));
for (int l=0;l<L;l++)
{
ratios[l] = (double *)malloc(K*sizeof(double));
for (int k=0;k<K;k++)
{
ratios[l][k] = uniform(a-sd, a+sd);
}
}
/// Create hashSET
double temp0 = 0;
double **** hashSET = (double****)malloc(M*sizeof(double***)) ; // Initialize DataD
for (int m=0;m<M;m++)
{
hashSET[m] = (double***)malloc(L*sizeof(double**)) ; // Initialize Data
for (int l=0;l<L;l++)
{
hashSET[m][l] = (double**)malloc(K*sizeof(double*)) ; // Initialize Data
for (int k=0;k<K;k++)
{
hashSET[m][l][k] = (double*)malloc(T*sizeof(double)) ; // Initialize Data
for (int t=0;t<T;t++)
{
temp0 = 0;
for (int d=0;d<Dim;d++) {
temp0 += hashFunctions[l][k][t][d] * D[m][t][d];
}
hashSET[m][l][k][t] = temp0;
}
}
}
}
/// Create upper and lower bound on query
double ** Q_U = (double**)malloc((T)*sizeof(double*));
double ** Q_L = (double**)malloc((T)*sizeof(double*));
for (int t=0;t<T;t++)
{
Q_U[t] = (double*)malloc((Dim)*sizeof(double));
Q_L[t] = (double*)malloc((Dim)*sizeof(double));
}
double currentMax = -10000;
double currentMin = 10000;
for (int t = 0; t < T; t++) {
for (int d = 0; d < Dim; d++) {
for (int s = -S; s <= S; s++) {
if (t + s < 0 || t + s > T - 1)
continue;
currentMax = std::max(Q_U[t + s][d], currentMax);
currentMin = std::min(Q_L[t + s][d], currentMin);
}
Q_U[t][d] = currentMax;
Q_L[t][d] = currentMin;
currentMax = -10000;
currentMin = 10000;
}
}
double temp1 = 0;
double temp2 = 0;
double temp3 = 0;
/// Create query hashes
double *** hashQ = (double***)malloc(L*sizeof(double**)) ; // Initialize Data
double *** hashQ_U = (double***)malloc(L*sizeof(double**)) ; // Initialize Data
double *** hashQ_L = (double***)malloc(L*sizeof(double**)) ; // Initialize Data
for (int l=0;l<L;l++)
{
hashQ[l] = (double**)malloc(K*sizeof(double*)) ; // Initialize Data
hashQ_U[l] = (double**)malloc(K*sizeof(double*)) ; // Initialize Data
hashQ_L[l] = (double**)malloc(K*sizeof(double*)) ; // Initialize Data
for (int k=0;k<K;k++)
{
hashQ[l][k] = (double*)malloc(T*sizeof(double)) ; // Initialize Data
hashQ_U[l][k] = (double*)malloc(T*sizeof(double)) ; // Initialize Data
hashQ_L[l][k] = (double*)malloc(T*sizeof(double)) ; // Initialize Data
for (int t=0;t<T;t++)
{
temp1 = 0;
temp2 = 0;
temp3 = 0;
for (int d=0;d<Dim;d++)
{
temp1 += hashFunctions[l][k][t][d] * Q[t][d];
temp2 += hashFunctions[l][k][t][d] * Q_U[t][d];
temp3 += hashFunctions[l][k][t][d] * Q_L[t][d];
}
hashQ[l][k][t] = temp1;
hashQ_U[l][k][t] = temp2;
hashQ_L[l][k][t] = temp3;
}
}