# main.py
from flask import Flask, jsonify, request
import pandas as pd
import numpy as np
from flask_cors import CORS
from collections import defaultdict, Counter
from time import time
import os.path
import json
from sklearn import preprocessing
import orjson
import dask.dataframe as dd

# Flask application serving the time-series LSH endpoints below; CORS is
# enabled so a frontend on a different origin can call them.
app = Flask(__name__)
CORS(app)

@app.route('/', methods=['GET'])
def index():
    """Health-check endpoint: confirms the server is up."""
    return "hi"

@app.route('/read-data', methods=['GET'])
def read_data():
    """Return station 14066001's temperature series as JSON.

    On the first request the raw CSV is filtered with dask and cached as a
    pickle; subsequent requests read the cache directly.
    NOTE(review): "index" and "values" are json.dumps'd strings embedded in
    a jsonify'd object (double encoding) — the client presumably re-parses
    them, so this is kept as-is; confirm before changing.
    """
    cache = 'data.pkl'
    if not os.path.isfile(cache):
        print("start")
        frame = dd.read_csv("NW_Ground_Stations_2016.csv", usecols=['number_sta', 'date', 't'])
        print("read file")
        frame = frame.loc[frame['number_sta'] == 14066001]
        print("split rows")
        frame.compute().to_pickle(cache)
        print("to_pandas")
    series = pd.read_pickle(cache)
    series.dropna(subset=['t'], inplace=True)
    dates = series.loc[:, 'date'].values.astype(str).tolist()
    temps = series.loc[:, 't'].values.astype(str).tolist()
    response = {
        "index": json.dumps(dates),
        "values": json.dumps(temps),
    }
    print("response ready")
    return jsonify(response)

@app.route('/read-mts-data', methods=['GET'])
def read_mts_data():
    """Return station 14066001's multivariate series (t, hu, td) as JSON.

    Fix: this endpoint shares its cache file ('data.pkl') with /read-data,
    which stores only the 't' column. If that endpoint ran first, the
    'hu'/'td' lookups below raised KeyError. The cache is now rebuilt
    whenever a required column is missing.
    """
    filename = 'data.pkl'
    df = pd.read_pickle(filename) if os.path.isfile(filename) else None
    if df is None or not {'date', 't', 'hu', 'td'}.issubset(df.columns):
        print("start")
        lazy = dd.read_csv("NW_Ground_Stations_2016.csv", usecols=['number_sta', 'date', 't', 'hu', 'td'])
        print("read file")
        lazy = lazy.loc[lazy['number_sta'] == 14066001]
        print("split rows")
        df = lazy.compute()
        df.to_pickle(filename)
        print("to_pandas")
    df.dropna(subset=['t'], inplace=True)
    # The date index is repeated per variable to match the response shape
    # the frontend already expects.
    dates = df.loc[:, 'date'].values.astype(str).tolist()
    response = [
        {"index": dates, "values": df.loc[:, column].values.tolist()}
        for column in ('t', 'hu', 'td')
    ]
    print("response ready")
    return orjson.dumps(response)

@app.route('/create-windows', methods=['POST'])
def create_windows():
    """Slide a window over the cached 't' series, scale each window to
    [-1, 1], and cache the result as 'processed-data.npy'.

    Fix: the series was converted to *strings* (astype(str)) before
    minmax scaling, forcing sklearn to re-parse every value; values are
    now kept as floats.
    NOTE(review): the cache is keyed only on file existence — a request
    with a different windowsize gets stale windows; confirm intended.
    NOTE(review): range(len - window_size) drops the final window (the
    conventional count is len - window_size + 1) — kept as-is because
    downstream window indices depend on it.
    """
    t0 = time()
    if not os.path.isfile('processed-data.npy'):
        df = pd.read_pickle('data.pkl')
        values = df.loc[:, 't'].values.astype(float)
        print("Data read: " + str(time() - t0))
        raw_data = request.json
        window_size = int(raw_data['parameters']["windowsize"])
        print("Processing: " + str(time() - t0))
        windows = [values[i:i + window_size] for i in range(len(values) - window_size)]
        windows = preprocessing.minmax_scale(windows, (-1, 1), axis=1)
        print("Preprocessed: " + str(time() - t0))
        np.save('processed-data', windows)
    print("Sending response: " + str(time() - t0))
    return '1'
@app.route('/create-mts-windows', methods=['POST'])
def create_mts_windows():
    """Slide a window over the cached 't' series, scale each window to
    [-1, 1], and cache the result as 'processed-data.npy'.

    Fix: values were cast to strings before scaling; kept as floats now.
    NOTE(review): despite the 'mts' name this is a byte-copy of
    /create-windows and ignores the 'hu'/'td' columns — multivariate
    windowing appears unimplemented; confirm intent before extending.
    NOTE(review): shares its cache file with /create-windows, so whichever
    endpoint runs first wins.
    """
    t0 = time()
    if not os.path.isfile('processed-data.npy'):
        df = pd.read_pickle('data.pkl')
        values = df.loc[:, 't'].values.astype(float)
        print("Data read: " + str(time() - t0))
        raw_data = request.json
        window_size = int(raw_data['parameters']["windowsize"])
        print("Processing: " + str(time() - t0))
        windows = [values[i:i + window_size] for i in range(len(values) - window_size)]
        windows = preprocessing.minmax_scale(windows, (-1, 1), axis=1)
        print("Preprocessed: " + str(time() - t0))
        np.save('processed-data', windows)
    print("Sending response: " + str(time() - t0))
    return '1'

@app.route('/create-tables', methods=['POST'])
def create_tables():
    """Build `tablesize` LSH tables over the cached windows.

    Each table draws a random (window_size x hash_size) projection; a
    window's signature is the bit string of its positive projections, and
    the table maps signature -> list of window indices.

    Fix: removed a redundant `np.array(data)` copy of the freshly loaded
    array (np.load already returns an ndarray).
    """
    t0 = time()
    print("loading")
    data = np.load('processed-data.npy')
    print(time() - t0)
    raw_data = orjson.loads(request.data)
    print(time() - t0)
    window_size = int(raw_data['parameters']["windowsize"])
    hash_size = int(raw_data['parameters']["hashsize"])
    table_size = int(raw_data['parameters']["tablesize"])
    print('Starting: ' + str(time() - t0))
    tables_hash_function = [np.random.uniform(-1, 1, size=(window_size, hash_size))
                            for _ in range(table_size)]
    print('Init time: ' + str(time() - t0))
    tables = []
    for index in range(table_size):
        t1 = time()
        table = defaultdict(list)
        # Positive projections become '1' bits of the signature string.
        signatures_bool = np.dot(data, tables_hash_function[index]) > 0
        signatures = [''.join(['1' if x else '0' for x in lst]) for lst in signatures_bool]
        for i, signature in enumerate(signatures):
            table[signature].append(i)
        print(time() - t1)
        tables.append(table)
    print('Creation time: ' + str(time() - t0))
    hash_functions = np.array(tables_hash_function).tolist()
    response = {
        str(table_index): {
            "hash": hash_functions[table_index],
            "entries": tables[table_index],
        }
        for table_index in range(table_size)
    }
    return orjson.dumps(response)

@app.route('/query', methods=['POST'])
def query():
    """Scale the posted query window to [-1, 1] and return it as JSON."""
    t0 = time()
    payload = orjson.loads(request.data)
    scaled = preprocessing.minmax_scale(payload['window'], (-1, 1))
    result = orjson.dumps(scaled.tolist())
    print("Query done: " + str(time() - t0))
    return result

@app.route('/similarity', methods=['POST'])
def similarity():
    """Look the query window up in every LSH table and group the matching
    window indices by how many tables they appeared in (frequency)."""
    t0 = time()
    payload = orjson.loads(request.data)
    window = payload['query']
    tables = payload["tables"]
    candidates = []
    for table in tables.values():
        bits = np.dot(window, table["hash"]) > 0
        signature = ''.join('1' if bit else '0' for bit in bits)
        # NOTE(review): raises KeyError if the signature has no bucket in
        # this table — behavior kept from the original; confirm the client
        # always sends tables built from this query.
        candidates.extend(table["entries"][signature])
    grouped = defaultdict(list)
    for index, frequency in Counter(candidates).items():
        grouped[str(frequency)].append(index)
    response = orjson.dumps(grouped)
    print("Similarity done: " + str(time() - t0))
    return response

@app.route('/average-progress', methods=['POST'])
def average_progress():
    """For each successive batch of window indices, report max / min /
    average curves over every window seen so far (cumulative progress)."""
    t0 = time()
    payload = orjson.loads(request.data)
    data = np.load('processed-data.npy')
    results = []
    seen = []
    print("Initialized: " + str(time() - t0))
    for batch in payload['windows']:
        t1 = time()
        seen.extend(data[batch])
        if not seen:
            results.append([])
            continue
        # NOTE(review): reducing over the whole cumulative list each batch
        # is quadratic overall; an incremental running max/min/sum would be
        # linear but would reorder float additions — left as-is.
        results.append({
            'average': (np.sum(seen, 0) / len(seen)).tolist(),
            'max': np.maximum.reduce(seen).tolist(),
            'min': np.minimum.reduce(seen).tolist(),
        })
        print("Average calculated: " + str(time() - t1))
    response = orjson.dumps(results)
    print("Averages calculated: " + str(time() - t0))
    return response

@app.route('/average-table', methods=['POST'])
def average_table():
    """For each batch of window indices, return the mean curve plus/minus
    one standard deviation (reported as the 'max'/'min' bands)."""
    t0 = time()
    payload = orjson.loads(request.data)
    data = np.load('processed-data.npy')
    results = []
    print("Initialized: " + str(time() - t0))
    for batch in payload['windows']:
        t1 = time()
        windows = data[batch]
        print(len(windows))
        mean = np.average(windows, 0)
        spread = np.std(windows, 0)
        results.append({
            'average': mean.tolist(),
            'max': (mean + spread).tolist(),
            'min': (mean - spread).tolist(),
        })
        print("Average calculated: " + str(time() - t1))
    response = orjson.dumps(results)
    print("Averages calculated: " + str(time() - t0))
    return response

@app.route('/update', methods=['POST'])
def update():
    """Rebuild the LSH tables so they agree with the user's labels.

    Tables whose query bucket already contains every correctly labelled
    window index and none of the incorrectly labelled ones are kept; the
    remainder are replaced with freshly sampled projections that separate
    the labelled windows.

    Improvements: labelled-index membership tests now use a set instead of
    repeated list scans, and table filtering / generation are factored
    into private helpers.
    """
    t0 = time()
    print("Start")
    raw_data = orjson.loads(request.data)
    print("Data loaded: " + str(time() - t0))
    data = np.load('processed-data.npy')
    label_data = raw_data["labelData"]
    tables = raw_data["tables"]
    window = raw_data["query"]
    window_size = int(raw_data['parameters']["windowsize"])
    hash_size = int(raw_data['parameters']["hashsize"])
    table_size = int(raw_data['parameters']["tablesize"])

    correct_indices = [int(index) for index, value in label_data.items() if value is True]
    incorrect_indices = [int(index) for index, value in label_data.items() if value is False]
    print("Initialized: " + str(time() - t0))

    new_tables = _filter_consistent_tables(tables, window, correct_indices, incorrect_indices)
    print("Filtered good tables: " + str(time() - t0))

    for _ in range(table_size - len(new_tables)):
        new_tables.append(
            _sample_separating_table(data, window_size, hash_size,
                                     correct_indices, incorrect_indices))

    print('Update time: ' + str(time() - t0))
    response = {index: {"hash": table["hash"], "entries": table["entries"]}
                for index, table in enumerate(new_tables)}
    return jsonify(response)


def _signature_of(vector, hash_function):
    # Bit-string LSH signature: '1' where the projection is positive.
    return ''.join((np.dot(vector, hash_function) > 0).astype('int').astype('str'))


def _filter_consistent_tables(tables, window, correct_indices, incorrect_indices):
    """Keep the tables whose bucket for `window` contains every correctly
    labelled index and none of the incorrectly labelled ones."""
    kept = []
    for table in tables.values():
        # NOTE(review): raises KeyError when the query's signature has no
        # bucket in the table — behavior inherited from the original.
        bucket = set(table["entries"][_signature_of(window, table["hash"])])
        if (all(i in bucket for i in correct_indices)
                and not any(i in bucket for i in incorrect_indices)):
            kept.append(table)
    return kept


def _sample_separating_table(data, window_size, hash_size, correct_indices, incorrect_indices):
    """Sample random projections until one hashes every correct index to a
    single bucket containing no incorrect index, then build that table.

    NOTE(review): inherited from the original — this loops forever if no
    separating projection exists, and raises IndexError when
    `correct_indices` is empty; confirm the caller guarantees at least one
    positive label.
    """
    t1 = time()
    while True:
        hash_function = np.random.randn(window_size, hash_size)
        correct_signatures = [_signature_of(data[i], hash_function) for i in correct_indices]
        incorrect_signatures = [_signature_of(data[i], hash_function) for i in incorrect_indices]
        target = correct_signatures[0]
        if (correct_signatures.count(target) == len(correct_signatures)
                and incorrect_signatures.count(target) == 0):
            break
    print("first: " + str(time() - t1))
    t2 = time()
    entries = defaultdict(list)
    bits = np.dot(data, hash_function) > 0
    for i, row in enumerate(bits):
        entries[''.join(['1' if x else '0' for x in row])].append(i)
    print("second: " + str(time() - t2))
    return {"hash": hash_function.tolist(), "entries": entries}