-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract_util.py
78 lines (74 loc) · 3.15 KB
/
extract_util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import pandas as pd
import time_util
from importlib import reload
reload(time_util)
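"""
Helpers for extracting input and target tensors from the solar power generation
dataset (https://www.kaggle.com/anikannal/solar-power-generation-data).
columns_of_interrest returns the production columns of interest together with the
weather readings recorded at the same DATE_TIME.
offset specifies the lookback interval (in rows) for the energy quantities.
"""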
def columns_of_interrest(original_production_data, original_weather_data):
    """Join production rows with the weather readings taken at the same DATE_TIME.

    The production columns DATE_TIME, SOURCE_KEY, DC_POWER, AC_POWER and
    DAILY_YIELD are kept, AMBIENT_TEMPERATURE and IRRADIATION are looked up in
    the weather data by DATE_TIME, and DATE_TIME is finally replaced by the
    columns DATE, TIME and SECONDS (seconds since midnight).

    Production rows whose DATE_TIME has no weather reading are dropped, and
    the result is re-indexed from 0.

    Args:
        original_production_data: DataFrame holding the production columns
            listed above; DATE_TIME values must be "<date> <HH:MM>" strings.
        original_weather_data: DataFrame with DATE_TIME, AMBIENT_TEMPERATURE
            and IRRADIATION columns. Its DATE_TIME column is rewritten in
            place by _convert_date_format.

    Returns:
        A new DataFrame with the columns SOURCE_KEY, DC_POWER, AC_POWER,
        DAILY_YIELD, AMBIENT_TEMPERATURE, IRRADIATION, DATE, TIME, SECONDS.
    """
    production_columns = ["DATE_TIME", "SOURCE_KEY", "DC_POWER", "AC_POWER", "DAILY_YIELD"]
    weather_columns = ["DATE_TIME", "AMBIENT_TEMPERATURE", "IRRADIATION"]

    # Bring the weather timestamps to the production format (mutates the argument).
    _convert_date_format(original_weather_data)

    # If a timestamp occurs more than once in the weather data, the last
    # reading wins.
    weather = original_weather_data[weather_columns].drop_duplicates(
        subset="DATE_TIME", keep="last"
    )

    # Inner join: production rows without a weather reading are removed, the
    # production row order is preserved and the result gets a fresh 0..n-1 index.
    df = original_production_data[production_columns].merge(
        weather, on="DATE_TIME", how="inner"
    )

    # Split DATE_TIME into DATE and TIME columns and derive SECONDS from TIME.
    dates, times, seconds = [], [], []
    for stamp in df["DATE_TIME"]:
        date, time = stamp.split(" ")
        dates.append(date)
        times.append(time)
        seconds.append(_time_to_seconds(time))
    df["DATE"] = dates
    df["TIME"] = times
    # Stored as float64, the dtype callers have been receiving for this column.
    df["SECONDS"] = pd.Series(seconds, index=df.index, dtype="float64")

    del df["DATE_TIME"]
    return df
def add_offset_columns(data, offset):
    """Add lagged copies of DC_POWER and AC_POWER taken `offset` rows earlier.

    Two columns are added to `data` in place: PREVIOUS_DAY_DC and
    PREVIOUS_DAY_AC hold, for every row, the DC_POWER / AC_POWER value found
    `offset` rows above it. The first `offset` rows have no such value (NaN)
    and are left out of the returned frame.

    NOTE(review): the column names say "previous day", but the lag is exactly
    `offset` rows; whether that is one day depends on the caller's sampling.

    Args:
        data: DataFrame with DC_POWER and AC_POWER columns; modified in place.
        offset: non-negative number of rows to look back.

    Returns:
        ``data[offset:]``, i.e. the rows that have a lagged value.

    Raises:
        ValueError: if `offset` is negative.
    """
    if offset < 0:
        raise ValueError("offset must be non-negative, got {}".format(offset))

    # shift() is positional, so this works for any index and always creates
    # the columns, even when offset >= len(data).
    data['PREVIOUS_DAY_DC'] = data['DC_POWER'].shift(offset)
    data['PREVIOUS_DAY_AC'] = data['AC_POWER'].shift(offset)
    return data[offset:]
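"""
The 2 datasets (production and weather) have different formats for the DATE_TIME column.
This function rewrites the column in place: the first and third "-"-separated date fields
are swapped and the time is truncated to "HH:MM"
(e.g. "yyyy-mm-dd HH:MM:SS" becomes "dd-mm-yyyy HH:MM").
"""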
def _convert_date_format(date_time):
for i in range(len(date_time)):
e = date_time['DATE_TIME'][i]
date, time = e.split(" ")
split_date = date.split("-")
split_date[0], split_date[2] = split_date[2], split_date[0]
time = time[0:5] # Eliminam secundele
date_time.loc[i, "DATE_TIME"] = "-".join(split_date) + " " + time
# Time format: HH:MM
def _time_to_seconds(time):
hours, minutes = time.split(":")
return int(hours) * 3600 + int(minutes) * 60