from flask import Flask, jsonify, request
import pandas as pd
import numpy as np
from flask_cors import CORS
from collections import defaultdict, Counter
from time import time
import os.path
import json
from sklearn import preprocessing
import orjson
import dask.dataframe as dd

app = Flask(__name__)
CORS(app)
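
# Flask backend for a sliding-window time-series exploration tool: the routes
# below load ground-station weather data, cut it into per-window normalized
# segments, build random-hyperplane LSH tables over those segments, and answer
# similarity and aggregation queries against them.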

@app.route('/', methods=['GET'])
def index():
    return "hi"

@app.route('/read-data', methods=['GET'])
def read_data():
    filename = 'data.pkl'
    if not os.path.isfile(filename):
        print("start")
        df = dd.read_csv("NW_Ground_Stations_2016.csv", usecols=['number_sta', 'date', 't'])
        print("read file")
        df = df.loc[df['number_sta'] == 14066001]
        print("split rows")
        df = df.compute()
        df.to_pickle(filename)
        print("to_pandas")
    df = pd.read_pickle(filename)
    df.dropna(subset=['t'], inplace=True)
    response = {
        "index": json.dumps(df.loc[:, 'date'].values.astype(str).tolist()),
        "values": json.dumps(df.loc[:, 't'].values.astype(str).tolist())
    }
    print("response ready")
    response = jsonify(response)
    return response
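
# A minimal client-side sketch (the host/port are assumptions; the response
# shape follows the code above). Note both fields are JSON-encoded twice,
# once by json.dumps and once by jsonify, so the client decodes each field again:
#
#   import json, requests
#   r = requests.get('http://localhost:5000/read-data')
#   dates = json.loads(r.json()['index'])
#   temperatures = json.loads(r.json()['values'])  # values arrive as strings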

@app.route('/read-mts-data', methods=['GET'])
def read_mts_data():
    filename = 'data.pkl'
    if not os.path.isfile(filename):
        print("start")
        df = dd.read_csv("NW_Ground_Stations_2016.csv", usecols=['number_sta', 'date', 't', 'hu', 'td'])
        print("read file")
        df = df.loc[df['number_sta'] == 14066001]
        print("split rows")
        df = df.compute()
        df.to_pickle(filename)
        print("to_pandas")
    df = pd.read_pickle(filename)
    df.dropna(subset=['t'], inplace=True)
    response = [
        {
            "index": df.loc[:, 'date'].values.astype(str).tolist(),
            "values": df.loc[:, 't'].values.tolist()
        },
        {
            "index": df.loc[:, 'date'].values.astype(str).tolist(),
            "values": df.loc[:, 'hu'].values.tolist()
        },
        {
            "index": df.loc[:, 'date'].values.astype(str).tolist(),
            "values": df.loc[:, 'td'].values.tolist()
        }
    ]
    print("response ready")
    response = orjson.dumps(response)
    return response
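
# Unlike /read-data, this endpoint serializes once with orjson and returns a
# plain JSON list of three {index, values} series (t, hu, td), so a client
# can consume response.json() directly without a second json.loads per field.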

@app.route('/create-windows', methods=['POST'])
def create_windows():
    t0 = time()
    if not os.path.isfile('processed-data.npy'):
        filename = 'data.pkl'
        df = pd.read_pickle(filename)
        values = df.loc[:, 't'].values.tolist()
        print("Data read: " + str(time()-t0))
        raw_data = request.json
        window_size = int(raw_data['parameters']["windowsize"])
        print("Processing: " + str(time()-t0))
        data = [values[i:i+window_size] for i in range(len(values) - window_size)]
        data = preprocessing.minmax_scale(data, (-1, 1), axis=1)
        print("Preprocessed: " + str(time()-t0))
        np.save('processed-data', data)
    print("Sending response: " + str(time()-t0))
    return '1'
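
# Example request (the port and window size are illustrative; the payload
# shape matches the request.json access above):
#
#   import requests
#   requests.post('http://localhost:5000/create-windows',
#                 json={'parameters': {'windowsize': 120}})
#
# Each window is rescaled to [-1, 1] on its own (axis=1), so the LSH tables
# built later compare window shapes rather than absolute temperature levels.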

@app.route('/create-mts-windows', methods=['POST'])
def create_mts_windows():
    t0 = time()
    if not os.path.isfile('processed-data.npy'):
        filename = 'data.pkl'
        df = pd.read_pickle(filename)
        channels = [
            df.loc[:, 't'].values.tolist(),
            df.loc[:, 'hu'].values.tolist(),
            df.loc[:, 'td'].values.tolist(),
        ]
        print("Data read: " + str(time()-t0))
        raw_data = request.json
        window_size = int(raw_data['parameters']["windowsize"])
        print("Processing: " + str(time()-t0))
        data = [[values[i:i+window_size] for values in channels] for i in range(len(channels[0]) - window_size)]
        print("Raw windows: " + str(time()-t0))
        windows = []
        for i in range(len(data)):
            if i % 5000 == 0:
                print(i)
            windows.append(preprocessing.minmax_scale(data[i], (-1, 1), axis=1))
        print("Preprocessed: " + str(time()-t0))
        np.save('processed-data', windows)
    print("Sending response: " + str(time()-t0))
    return '1'
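
# The array saved here has shape (n_windows, 3, window_size): one row of
# three channels (t, hu, td) per window, each channel rescaled to [-1, 1]
# independently of the others.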

@app.route('/create-tables', methods=['POST'])
def create_tables():
    t0 = time()
    print("loading")
    data = np.load('processed-data.npy')
    print(time()-t0)
    raw_data = orjson.loads(request.data)
    print(time()-t0)
    window_size = int(raw_data['parameters']["windowsize"])
    hash_size = int(raw_data['parameters']["hashsize"])
    table_size = int(raw_data['parameters']["tablesize"])
    data = np.array(data)
    print('Starting: ' + str(time()-t0))
    tables_hash_function = [np.random.uniform(-1, 1, size=(window_size, hash_size)) for _ in range(table_size)]
    print('Init time: ' + str(time() - t0))
    tables = []
    for index in range(table_size):
        t1 = time()
        table = defaultdict(list)
        signatures_bool = np.dot(data, tables_hash_function[index]) > 0
        signatures = [''.join(['1' if x else '0' for x in lst]) for lst in signatures_bool]
        for i in range(len(signatures)):
            table[signatures[i]].append(i)
        print(time()-t1)
        tables.append(table)

    print('Creation time: ' + str(time() - t0))
    hash_functions = np.array(tables_hash_function).tolist()
    response = {}
    for table_index in range(table_size):
        response[str(table_index)] = {
            "hash": hash_functions[table_index],
            "entries": tables[table_index]
        }
    response = orjson.dumps(response)
    return response
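
# Signature sketch: each table draws hash_size random hyperplanes (the
# columns of a (window_size, hash_size) matrix), and a window's signature is
# the bit string recording which side of each hyperplane it falls on.
# An illustrative helper mirroring the loop above (not called anywhere):
#
#   def lsh_signature(window, hash_function):
#       return ''.join('1' if x else '0' for x in np.dot(window, hash_function) > 0)
#
# Windows with identical signatures share a bucket in the table's
# defaultdict, so similar windows tend to collide in at least one table.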

@app.route('/create-mts-tables', methods=['POST'])
def create_mts_tables():
    t0 = time()
    print("loading")
    data = np.load('processed-data.npy')
    print(time()-t0)
    raw_data = orjson.loads(request.data)
    print(time()-t0)
    window_size = int(raw_data['parameters']["windowsize"])
    hash_size = int(raw_data['parameters']["hashsize"])
    table_size = int(raw_data['parameters']["tablesize"])
    data = np.array(data)
    print(data.shape)
    print('Starting: ' + str(time()-t0))
    tables_hash_function = [np.random.uniform(-1, 1, size=(window_size, hash_size)) for _ in range(table_size)]
    print('Init time: ' + str(time() - t0))
    tables = []
    for index in range(table_size):
        t1 = time()
        table = defaultdict(list)
        # signatures_bool = []
        # for window in data:
        #     signatures_bool.append(np.dot([1, 1, 1], np.dot(window, tables_hash_function[index])) > 0)
        signatures_bool = np.dot([1, 1, 1], np.dot(data, tables_hash_function[index])) > 0
        signatures = [''.join(['1' if x else '0' for x in lst]) for lst in signatures_bool]
        for i in range(len(signatures)):
            table[signatures[i]].append(i)
        print(time()-t1)
        tables.append(table)

    print('Creation time: ' + str(time() - t0))
    hash_functions = np.array(tables_hash_function).tolist()
    response = {}
    for table_index in range(table_size):
        response[str(table_index)] = {
            "hash": hash_functions[table_index],
            "entries": tables[table_index]
        }
    response = orjson.dumps(response)
    return response
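
# For multivariate windows, data has shape (n_windows, 3, window_size), so the
# inner np.dot(data, hash) projects each channel separately and the outer
# np.dot([1, 1, 1], ...) sums the three channel projections before
# thresholding: the signature hashes the channel-summed projection.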

@app.route('/query', methods=['POST'])
def query():
    t0 = time()
    raw_data = orjson.loads(request.data)
    window = raw_data['window']
    output = preprocessing.minmax_scale(window, (-1, 1), axis=1)
    response = orjson.dumps(output.tolist())
    print("Query done: " + str(time()-t0))
    return response
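
# Example payload (toy values; the window is a list of per-channel rows,
# each of which is rescaled to [-1, 1] on its own because of axis=1):
#
#   requests.post('http://localhost:5000/query',
#                 json={'window': [[1.0, 2.0, 3.0],
#                                  [0.5, 0.4, 0.3],
#                                  [7.0, 8.0, 9.0]]})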

@app.route('/similarity', methods=['POST'])
def similarity():
    t0 = time()
    raw_data = orjson.loads(request.data)
    window = raw_data['query']
    tables = raw_data["tables"]
    neighbours = []

    output = defaultdict(list)

    for t in tables.values():
        signature_bool = np.dot([1, 1, 1], np.dot(window, t["hash"])) > 0
        signature = ''.join(['1' if x else '0' for x in signature_bool])
        neighbours.extend(t["entries"][signature])
    neighbours_with_frequency = dict(Counter(neighbours))
    for index, frequency in neighbours_with_frequency.items():
        output[str(frequency)].append(index)
    response = orjson.dumps(output)
    print("Similarity done: " + str(time()-t0))
    return response
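
# The response groups candidate window indices by how many tables hashed them
# into the same bucket as the query (their collision count), so keys run from
# "1" up to the number of tables and higher keys mean stronger candidates.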

@app.route('/average-progress', methods=['POST'])
def average_progress():
    t0 = time()
    raw_data = orjson.loads(request.data)
    all_windows = raw_data['windows']
    data = np.load('processed-data.npy')
    output = []
    actual_windows = []
    print("Initialized: " + str(time() - t0))
    for windows in all_windows:
        t1 = time()
        actual_windows.extend(data[windows])
        if len(actual_windows) == 0:
            output.append([])
            continue
        max_values = np.maximum.reduce(actual_windows).tolist()
        min_values = np.minimum.reduce(actual_windows).tolist()
        average_values = (np.sum(actual_windows, 0)/len(actual_windows)).tolist()
        output.append({
            'average': average_values,
            'max': max_values,
            'min': min_values
        })
        print("Average calculated: " + str(time() - t1))
    response = orjson.dumps(output)
    print("Averages calculated: " + str(time() - t0))
    return response
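
# Note that actual_windows is extended across iterations, so the i-th entry
# aggregates every window group seen so far (hence "progress"): min/max are
# exact element-wise extrema and 'average' is the running element-wise mean.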

@app.route('/average-table', methods=['POST'])
def average_table():
    t0 = time()
    raw_data = orjson.loads(request.data)
    all_windows = raw_data['windows']
    data = np.load('processed-data.npy')
    output = []
    print("Initialized: " + str(time() - t0))
    for windows in all_windows:
        t1 = time()
        actual_windows = data[windows]
        print(len(actual_windows))
        average_values = np.average(actual_windows, 0)
        # average_values = (np.sum(actual_windows, 0) / len(actual_windows))
        std_values = np.std(actual_windows, 0)
        max_values = average_values + std_values
        min_values = average_values - std_values
        # max_values = np.maximum.reduce(actual_windows).tolist()
        # min_values = np.minimum.reduce(actual_windows).tolist()
        output.append({
            'average': average_values.tolist(),
            'max': max_values.tolist(),
            'min': min_values.tolist()
        })
        print("Average calculated: " + str(time() - t1))
    response = orjson.dumps(output)
    print("Averages calculated: " + str(time() - t0))
    return response
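
# Here 'max' and 'min' are the element-wise mean plus/minus one standard
# deviation (a band around the average), not exact extrema as in
# /average-progress; the commented-out lines above keep the exact alternative.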

@app.route('/update', methods=['POST'])
def update():
    t0 = time()
    print("Start")
    raw_data = orjson.loads(request.data)
    print("Data loaded: " + str(time() - t0))
    data = np.load('processed-data.npy')
    label_data = raw_data["labelData"]
    tables = raw_data["tables"]
    window = raw_data["query"]

    window_size = int(raw_data['parameters']["windowsize"])
    hash_size = int(raw_data['parameters']["hashsize"])
    table_size = int(raw_data['parameters']["tablesize"])
    new_tables = []

    correct_indices = [int(index) for index, value in label_data.items() if value is True]
    incorrect_indices = [int(index) for index, value in label_data.items() if value is False]

    print("Initialized: " + str(time() - t0))
    for t in tables.values():
        valid = True
        signature = ''.join((np.dot(window, t["hash"]) > 0).astype('int').astype('str'))
        neighbours = t["entries"][signature]
        for index in correct_indices:
            if index not in neighbours:
                valid = False
                break
        for index in incorrect_indices:
            if index in neighbours:
                valid = False
                break
        if valid:
            new_tables.append(t)
    print("Filtered good tables: " + str(time() - t0))
    for index in range(table_size - len(new_tables)):
        entries = defaultdict(list)
        t1 = time()
        while True:
            hash_function = np.random.randn(window_size, hash_size)
            correct_signatures = [''.join((np.dot(data[i], hash_function) > 0).astype('int').astype('str'))
                                  for i in correct_indices]
            incorrect_signatures = [''.join((np.dot(data[i], hash_function) > 0).astype('int').astype('str'))
                                    for i in incorrect_indices]
            if (correct_signatures.count(correct_signatures[0]) == len(correct_signatures)
                    and incorrect_signatures.count(correct_signatures[0]) == 0):
                break
        print("first: " + str(time() - t1))
        t2 = time()
        signatures_bool = np.dot(data, hash_function) > 0
        signatures = [''.join(['1' if x else '0' for x in lst]) for lst in signatures_bool]
        for i in range(len(signatures)):
            entries[signatures[i]].append(i)
        print("second: " + str(time() - t2))
        new_tables.append({
            "hash": hash_function.tolist(),
            "entries": entries
        })

    print('Update time: ' + str(time() - t0))
    response = {}
    for table_index in range(len(new_tables)):
        response[table_index] = {
            "hash": new_tables[table_index]["hash"],
            "entries": new_tables[table_index]["entries"]
        }
    response = jsonify(response)
    return response
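
if __name__ == '__main__':
    # Minimal entry point, assuming the Flask development server on its
    # default port (5000); the original file does not say how it is launched.
    app.run(debug=True)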