from flask import Flask, jsonify, request
import pandas as pd
import numpy as np
from flask_cors import CORS
from collections import defaultdict, Counter
from time import time
import os.path
import json
from sklearn import preprocessing
import orjson
import dask.dataframe as dd

# Flask application object with CORS enabled so a browser front-end on
# another origin can call these endpoints.
app = Flask(__name__)
CORS(app)

@app.route('/', methods=['GET'])
def index():
    """Trivial liveness endpoint; responds with a constant greeting."""
    return "hi"

@app.route('/read-data', methods=['GET'])
def read_data():
    """Return the temperature series for station 14066001 as JSON.

    On first call the large CSV is filtered down to the single station with
    dask and cached as a pickle; later calls read only the pickle.

    Returns:
        A jsonify'd dict with "index" (dates) and "values" (temperatures).
    """
    filename = 'data.pkl'
    if not os.path.isfile(filename):
        print("start")
        # Only the needed columns; dask keeps memory bounded for the big CSV.
        df = dd.read_csv("NW_Ground_Stations_2016.csv", usecols=['number_sta', 'date', 't'])
        print("read file")
        df = df.loc[df['number_sta'] == 14066001]
        print("split rows")
        df = df.compute()
        df.to_pickle(filename)
        print("to_pandas")
    df = pd.read_pickle(filename)
    df.dropna(subset=['t'], inplace=True)
    # NOTE(review): json.dumps inside a dict that is then jsonify'd
    # double-encodes both lists, so the client must JSON-parse each field
    # again. Kept for compatibility with the existing front-end — confirm.
    response = {
        "index": json.dumps(df.loc[:, 'date'].values.astype(str).tolist()),
        "values": json.dumps(df.loc[:, 't'].values.astype(str).tolist())
    }
    print("response ready")
    response = jsonify(response)
    return response

@app.route('/read-mts-data', methods=['GET'])
def read_mts_data():
    """Return the t, hu and td series for station 14066001.

    Same caching scheme as read_data, but with three channels. The response
    is an orjson-encoded list of three {index, values} records that share
    the same date index.
    """
    filename = 'data.pkl'
    if not os.path.isfile(filename):
        print("start")
        df = dd.read_csv("NW_Ground_Stations_2016.csv", usecols=['number_sta', 'date', 't', 'hu', 'td'])
        print("read file")
        df = df.loc[df['number_sta'] == 14066001]
        print("split rows")
        df = df.compute()
        df.to_pickle(filename)
        print("to_pandas")
    df = pd.read_pickle(filename)
    # NOTE(review): only 't' is dropna'd; 'hu'/'td' may still contain NaN —
    # confirm the client tolerates nulls in those channels.
    df.dropna(subset=['t'], inplace=True)
    # Compute the shared date index once instead of three times.
    dates = df.loc[:, 'date'].values.astype(str).tolist()
    response = [
        {
            "index": dates,
            "values": df.loc[:, 't'].values.tolist()
        },
        {
            "index": dates,
            "values": df.loc[:, 'hu'].values.tolist()
        },
        {
            "index": dates,
            "values": df.loc[:, 'td'].values.tolist()
        }
    ]
    print("response ready")
    response = orjson.dumps(response)
    return response

@app.route('/create-windows', methods=['POST'])
def create_windows():
    """Slide a fixed-size window over the cached univariate series, scale
    each window to [-1, 1], and cache the result as processed-data.npy.

    The window size comes from the POSTed JSON body
    (parameters.windowsize). Returns '1' as a plain acknowledgement.
    """
    t0 = time()
    if not os.path.isfile('processed-data.npy'):
        filename = 'data.pkl'
        df = pd.read_pickle(filename)
        # NOTE(review): values are stringified here and coerced back to float
        # by minmax_scale; NaNs (no dropna in this route) would break scaling
        # — confirm data.pkl is always pre-cleaned.
        values = df.loc[:, 't'].values.astype(str).tolist()
        print("Data read: " + str(time()-t0))
        raw_data = request.json
        window_size = int(raw_data['parameters']["windowsize"])
        print("Processing: " + str(time()-t0))
        # NOTE(review): range(len - size) omits the final window; confirm
        # whether the off-by-one is intentional.
        data = [values[i:i+window_size] for i in range(len(values) - window_size)]
        data = preprocessing.minmax_scale(data, (-1, 1), axis=1)
        print("Preprocessed: " + str(time()-t0))
        np.save('processed-data', data)
    print("Sending response: " + str(time()-t0))
    return '1'

@app.route('/create-mts-windows', methods=['POST'])
def create_mts_windows():
    """Slide a fixed-size window over the three cached channels (t, hu, td),
    scale each window's channels to [-1, 1], and cache the result.

    Window size comes from the POSTed JSON (parameters.windowsize).
    Returns '1' as a plain acknowledgement.
    """
    t0 = time()
    if not os.path.isfile('processed-data.npy'):
        filename = 'data.pkl'
        df = pd.read_pickle(filename)
        channels = list()
        channels.append(df.loc[:, 't'].values.tolist())
        channels.append(df.loc[:, 'hu'].values.tolist())
        channels.append(df.loc[:, 'td'].values.tolist())
        print("Data read: " + str(time()-t0))
        raw_data = request.json
        window_size = int(raw_data['parameters']["windowsize"])
        print("Processing: " + str(time()-t0))
        # One (3, window_size) slice per starting position.
        data = [([values[i:i+window_size] for values in channels]) for i in range(len(channels[0]) - window_size)]
        print("Raw windows: " + str(time()-t0))
        windows = []
        for i in range(len(data)):
            if i % 5000 == 0:
                print(i)  # progress heartbeat for long runs
            windows.append(preprocessing.minmax_scale(data[i], (-1, 1), axis=1))
        print("Preprocessed: " + str(time()-t0))
        np.save('processed-data', windows)
    print("Sending response: " + str(time()-t0))
    return '1'

@app.route('/create-tables', methods=['POST'])
def create_tables():
    """Build LSH hash tables for the univariate windows in processed-data.npy.

    For each of `tablesize` tables a random (window_size x hash_size)
    projection is drawn; the sign pattern of each window's projection is its
    bucket signature. Responds with orjson-encoded
    {table_index: {hash, entries}}.
    """
    t0 = time()
    print("loading")
    data = np.load('processed-data.npy')
    print(time()-t0)
    raw_data = orjson.loads(request.data)
    print(time()-t0)
    window_size = int(raw_data['parameters']["windowsize"])
    hash_size = int(raw_data['parameters']["hashsize"])
    table_size = int(raw_data['parameters']["tablesize"])
    print('Starting: ' + str(time()-t0))
    # One random projection matrix per table.
    tables_hash_function = [np.random.uniform(-1, 1, size=(window_size, hash_size)) for _ in range(table_size)]
    print('Init time: ' + str(time() - t0))
    tables = []
    for index in range(table_size):
        t1 = time()
        table = defaultdict(list)
        # Sign of each projected component becomes one signature bit.
        signatures_bool = np.dot(data, tables_hash_function[index]) > 0
        signatures = [''.join(['1' if x else '0' for x in lst]) for lst in signatures_bool]
        for i in range(len(signatures)):
            table[signatures[i]].append(i)
        print(time()-t1)
        tables.append(table)

    print('Creation time: ' + str(time() - t0))
    hash_functions = np.array(tables_hash_function).tolist()
    response = {}
    for table_index in range(table_size):
        response[str(table_index)] = {
            "hash": hash_functions[table_index],
            "entries": tables[table_index]
        }
    response = orjson.dumps(response)
    return response

@app.route('/create-mts-tables', methods=['POST'])
def create_mts_tables():
    """Build LSH hash tables for the multivariate windows in
    processed-data.npy; the three channels are collapsed with a [1, 1, 1]
    weighting before thresholding into signature bits."""
    start = time()
    print("loading")
    windows = np.load('processed-data.npy')
    print(time() - start)
    payload = orjson.loads(request.data)
    print(time() - start)
    window_size = int(payload['parameters']["windowsize"])
    hash_size = int(payload['parameters']["hashsize"])
    table_size = int(payload['parameters']["tablesize"])
    windows = np.array(windows)
    print(windows.shape)
    print('Starting: ' + str(time() - start))
    projections = [np.random.uniform(-1, 1, size=(window_size, hash_size))
                   for _ in range(table_size)]
    print('Init time: ' + str(time() - start))
    tables = []
    for projection in projections:
        round_start = time()
        table = defaultdict(list)
        # Project every window, sum the channels, threshold at zero.
        bits = np.dot([1, 1, 1], np.dot(windows, projection)) > 0
        signatures = [''.join('1' if bit else '0' for bit in row) for row in bits]
        for position, signature in enumerate(signatures):
            table[signature].append(position)
        print(time() - round_start)
        tables.append(table)

    print('Creation time: ' + str(time() - start))
    serializable = np.array(projections).tolist()
    response = {}
    for table_index in range(table_size):
        response[str(table_index)] = {
            "hash": serializable[table_index],
            "entries": tables[table_index]
        }
    response = orjson.dumps(response)
    return response

@app.route('/query', methods=['POST'])
def query():
    """Scale the client's query window to [-1, 1] per row and echo it back.

    Expects a POSTed JSON body with a 'window' entry (2-D: channels x
    samples). Returns the scaled window, orjson-encoded.
    """
    t0 = time()
    raw_data = orjson.loads(request.data)
    window = raw_data['window']
    # Same normalisation applied to the stored windows, so distances match.
    output = preprocessing.minmax_scale(window, (-1, 1), axis=1)
    response = orjson.dumps(output.tolist())
    print("Query done: " + str(time()-t0))
    return response

@app.route('/similarity', methods=['POST'])
def similarity():
    """Look up the query window in every LSH table and group the candidate
    indices by how many tables agree.

    Returns an orjson-encoded {frequency: [indices]} mapping.
    """
    t0 = time()
    raw_data = orjson.loads(request.data)
    window = raw_data['query']
    tables = raw_data["tables"]
    neighbours = []

    output = defaultdict(list)

    for t in tables.values():
        # Same [1, 1, 1] channel collapse used when the tables were built.
        signature_bool = np.dot([1, 1, 1], np.dot(window, t["hash"])) > 0
        signature = ''.join(['1' if x else '0' for x in signature_bool])
        # NOTE(review): raises KeyError if a table has no bucket for this
        # signature — confirm the client always sends complete tables.
        neighbours.extend(t["entries"][signature])
    neighbours_with_frequency = dict(Counter(neighbours))
    for index, frequency in neighbours_with_frequency.items():
        output[str(frequency)].append(index)
    response = orjson.dumps(output)
    print("Similarity done: " + str(time()-t0))
    return response

@app.route('/average-progress', methods=['POST'])
def average_progress():
    """For each batch of window indices, return min/max/average curves over
    the pool of windows accumulated so far.

    NOTE(review): actual_windows is intentionally never reset, so every
    output entry summarises ALL windows seen up to that batch ("progress")
    — confirm this cumulative behaviour is what the client expects.
    """
    t0 = time()
    raw_data = orjson.loads(request.data)
    all_windows = raw_data['windows']
    data = np.load('processed-data.npy')
    output = []
    actual_windows = []
    print("Initialized: " + str(time() - t0))
    for windows in all_windows:
        t1 = time()
        # Keep only the first channel of each stored window.
        actual_windows.extend([item[0] for item in data[windows]])
        if len(actual_windows) == 0:
            output.append([])
            continue
        max_values = np.maximum.reduce(actual_windows).tolist()
        min_values = np.minimum.reduce(actual_windows).tolist()
        average_values = (np.sum(actual_windows, 0)/len(actual_windows)).tolist()
        output.append({
            'average': average_values,
            'max': max_values,
            'min': min_values
        })
        print("Average calculated: " + str(time() - t1))
    response = orjson.dumps(output)
    print("Averages calculated: " + str(time() - t0))
    return response

@app.route('/average-table', methods=['POST'])
def average_table():
    """For each group of window indices, return the mean curve and a
    mean +/- one-standard-deviation band of the windows' first channel.

    Unlike average_progress, each group is summarised independently.
    """
    t0 = time()
    raw_data = orjson.loads(request.data)
    all_windows = raw_data['windows']
    data = np.load('processed-data.npy')
    output = []
    print("Initialized: " + str(time() - t0))
    for windows in all_windows:
        t1 = time()
        # First channel only of each selected window.
        actual_windows = [item[0] for item in data[windows]]
        print(len(actual_windows))
        average_values = np.average(actual_windows, 0)
        std_values = np.std(actual_windows, 0)
        # The "band" sent to the client is mean +/- std, not true min/max.
        max_values = average_values + std_values
        min_values = average_values - std_values
        output.append({
            'average': average_values.tolist(),
            'max': max_values.tolist(),
            'min': min_values.tolist()
        })
        print("Average calculated: " + str(time() - t1))
    response = orjson.dumps(output)
    print("Averages calculated: " + str(time() - t0))
    return response

@app.route('/update', methods=['POST'])
def update():
    """Re-learn the LSH tables from user labels.

    Tables already consistent with every label (all positively-labelled
    indices in the query's bucket, no negatively-labelled index in it) are
    kept. The remainder are regenerated by rejection-sampling new hash
    functions until the labelled windows separate.

    Returns a jsonify'd {table_index: {hash, entries}} mapping.
    """
    t0 = time()
    print("Start")
    raw_data = orjson.loads(request.data)
    print("Data loaded: " + str(time() - t0))
    data = np.load('processed-data.npy')
    label_data = raw_data["labelData"]
    tables = raw_data["tables"]
    window = raw_data["query"]

    window_size = int(raw_data['parameters']["windowsize"])
    hash_size = int(raw_data['parameters']["hashsize"])
    table_size = int(raw_data['parameters']["tablesize"])
    new_tables = []

    correct_indices = [int(index) for index, value in label_data.items() if value is True]
    incorrect_indices = [int(index) for index, value in label_data.items() if value is False]

    print("Initialized: " + str(time() - t0))
    # Keep tables whose bucket for the query agrees with every label.
    for t in tables.values():
        valid = True
        signature_bool = np.dot([1, 1, 1], np.dot(window, t["hash"])) > 0
        signature = ''.join(['1' if x else '0' for x in signature_bool])
        neighbours = t["entries"][signature]
        for index in correct_indices:
            if index not in neighbours:
                valid = False
                break
        for index in incorrect_indices:
            if index in neighbours:
                valid = False
                break
        if valid:
            new_tables.append(t)

    print("Filtered good tables: " + str(time() - t0))
    # Replace the rejected tables with freshly sampled hash functions.
    for index in range(table_size - len(new_tables)):
        entries = defaultdict(list)
        t1 = time()
        # NOTE(review): this loops forever if the labels cannot be separated
        # and raises IndexError when correct_indices is empty — confirm the
        # client always sends at least one positive label.
        while True:
            hash_function = np.random.randn(window_size, hash_size)
            correct_signatures = [''.join(np.dot([1, 1, 1], (np.dot(data[i], hash_function) > 0)).astype('int').astype('str')) for
                                  i in
                                  correct_indices]
            # BUG FIX: previously built WITHOUT the [1, 1, 1] channel
            # collapse, which produced 2-D arrays and made ''.join raise
            # TypeError; now constructed exactly like correct_signatures.
            incorrect_signatures = [''.join(np.dot([1, 1, 1], (np.dot(data[i], hash_function) > 0)).astype('int').astype('str')) for
                                    i
                                    in incorrect_indices]
            if correct_signatures.count(correct_signatures[0]) == len(
                    correct_signatures) and incorrect_signatures.count(
                    correct_signatures[0]) == 0:
                break
        print("first: " + str(time() - t1))
        t2 = time()
        # Bucket every stored window under the accepted hash function.
        signatures_bool = np.dot([1, 1, 1], np.dot(data, hash_function)) > 0
        signatures = [''.join(['1' if x else '0' for x in lst]) for lst in signatures_bool]
        for i in range(len(signatures)):
            entries[signatures[i]].append(i)
        print("second: " + str(time() - t2))
        new_tables.append({
            "hash": hash_function.tolist(),
            "entries": entries
        })

    print('Update time: ' + str(time() - t0))
    response = {}
    for table_index in range(len(new_tables)):
        response[table_index] = {
            "hash": new_tables[table_index]["hash"],
            "entries": new_tables[table_index]["entries"]
        }
    response = jsonify(response)
    return response