Big Query Analytics Improve Query Performance
Pretty new for large query and timeouts on a 100M point dataset. I am trying to find the points at which we reach a consistent series of values ββaround 0 (stop) and the points at which we consistently get above 0 (starts).
I kept a nested subquery that determines the start time of a file for its own dataset, but that didn't help. (seconds increase across multiple "files".
The part causing the problem is the initial aggregation of the previous pts and the next pts.
WITH test AS
(SELECT 'A' as ACM, CAST('2017-01-01' AS DATE) as file_date, CAST('10:10:10' AS TIME) as file_time , 0.0 as value, 0.1 as seconds
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.2 #start
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 2000, 0.3
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 1000, 0.4
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.5
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', -1000, 0.6
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', -2000, 0.7
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.8 #stop
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.9
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.0 #start
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 1000, 1.1
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 1000, 1.2
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 2000, 1.3
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.4
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.5
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', -1000, 1.6
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', -2000, 1.7
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.8
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 1000, 1.9
UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 2000, 2.0
UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 1000, 2.1
UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 0, 2.2 #stop
UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 20, 2.3
UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 0, 2.4
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.1
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.2 #start
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 2000, 0.3
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 1000, 0.4
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.5
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', -1000, 0.6
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', -2000, 0.7
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.8 #stop
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.9
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.0 #start
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 1000, 1.1
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 1000, 1.2
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 2000, 1.3
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.4
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.5
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', -1000, 1.6
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', -2000, 1.7
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.8
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 1000, 1.9
UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 2000, 2.0
UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 1000, 2.1
UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 0, 2.2 #stop
UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 20, 2.3
UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 0, 2.4 )
SELECT
acm,
file_date,
start_file_time,
file_times,
agg_sec as start_stop
FROM (
SELECT
acm,
file_date,
start_file_time,
file_times,
ARRAY_AGG(kind) OVER w AS agg_kind,
ARRAY_AGG(seconds) OVER w AS agg_sec
FROM (
SELECT
acm,
file_date,
start_file_time,
ARRAY(SELECT DISTINCT x FROM UNNEST(file_times) as x) AS file_times,
seconds,
CASE
WHEN (ABS(prev_val) < 50 and ABS(next_val) >= 50 and next_avg >= 50 and prev_avg < 50 ) THEN 'start'
WHEN (ABS(next_val) < 50 and ABS(prev_val) >= 50 and prev_avg >= 50 and next_avg < 50 ) THEN 'stop'
END as kind,
prev_val, next_val, prev_avg, next_avg
FROM (
SELECT
s.acm as acm,
s.file_date as file_date,
s.start_file_time as start_file_time,
seconds,
value,
ARRAY_AGG(s.file_time) OVER (PARTITION BY s.acm, s.file_date, s.start_file_time) as file_times,
AVG(ABS(value)) OVER prev as prev_avg,
NTH_VALUE(value, 2) OVER prev as prev_val,
AVG(ABS(value)) OVER next as next_avg,
NTH_VALUE(value, 2) OVER next as next_val
FROM test v
JOIN (
SELECT
acm,
file_date,
file_time,
TIME_SUB(file_time, INTERVAL CAST(FLOOR(MIN(seconds)) AS INT64) SECOND) as start_file_time
FROM test
GROUP BY acm, file_date, file_time
) s ON s.acm = v.acm AND s.file_date = v.file_date AND s.file_time = v.file_time
WINDOW prev AS (PARTITION BY s.acm, s.file_date, s.start_file_time ORDER BY seconds ROWS 2 PRECEDING), next AS (PARTITION BY s.acm, s.file_date, s.start_file_time ORDER BY seconds ROWS BETWEEN CURRENT ROW AND 2 FOLLOWING)
)
WHERE value = 0)
WHERE kind IN ('start', 'stop')
WINDOW w AS (PARTITION BY acm, file_date, start_file_time ORDER BY seconds ROWS 1 PRECEDING))
WHERE ARRAY_LENGTH(agg_kind) = 2 AND agg_kind[ORDINAL(1)] = 'start' AND agg_kind[ORDINAL(2)] = 'stop'
;
source to share
Check if the version below will change
I tried to keep the source as much as possible
#standardSQL
WITH test AS
(SELECT 'A' AS ACM, CAST('2017-01-01' AS DATE) AS file_date, CAST('10:10:10' AS TIME) AS file_time , 0.0 AS value, 0.1 AS seconds
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.2 #start
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 2000, 0.3
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 1000, 0.4
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.5
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', -1000, 0.6
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', -2000, 0.7
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.8 #stop
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.9
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.0 #start
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 1000, 1.1
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 1000, 1.2
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 2000, 1.3
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.4
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.5
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', -1000, 1.6
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', -2000, 1.7
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.8
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 1000, 1.9
UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 2000, 2.0
UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 1000, 2.1
UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 0, 2.2 #stop
UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 20, 2.3
UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 0, 2.4
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.1
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.2 #start
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 2000, 0.3
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 1000, 0.4
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.5
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', -1000, 0.6
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', -2000, 0.7
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.8 #stop
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.9
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.0 #start
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 1000, 1.1
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 1000, 1.2
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 2000, 1.3
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.4
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.5
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', -1000, 1.6
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', -2000, 1.7
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.8
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 1000, 1.9
UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 2000, 2.0
UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 1000, 2.1
UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 0, 2.2 #stop
UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 20, 2.3
UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 0, 2.4
), temp1 AS (
SELECT acm, file_date, value, seconds,
TIME_SUB(file_time, INTERVAL CAST(FLOOR(seconds) AS INT64) SECOND) AS start_file_time
FROM test
), temp2 AS (
SELECT
acm, file_date, start_file_time, seconds,
AVG(ABS(value)) OVER prev AS prev_avg,
NTH_VALUE(value, 2) OVER prev AS prev_val,
AVG(ABS(value)) OVER next AS next_avg,
NTH_VALUE(value, 2) OVER next AS next_val
FROM temp1 WINDOW
prev AS (PARTITION BY acm, file_date, start_file_time ORDER BY seconds ROWS 2 PRECEDING),
next AS (PARTITION BY acm, file_date, start_file_time ORDER BY seconds ROWS BETWEEN CURRENT ROW AND 2 FOLLOWING)
), temp3 AS (
SELECT
acm, file_date, start_file_time, seconds,
CASE
WHEN (ABS(prev_val) < 50 AND ABS(next_val) >= 50 AND next_avg >= 50 AND prev_avg < 50 ) THEN 'start'
WHEN (ABS(next_val) < 50 AND ABS(prev_val) >= 50 AND prev_avg >= 50 AND next_avg < 50 ) THEN 'stop'
END AS kind
FROM temp2
), temp4 AS (
SELECT *,
COUNTIF(kind = 'start') OVER (PARTITION BY acm, file_date, start_file_time ORDER BY seconds) +
COUNTIF(kind = 'stop') OVER (PARTITION BY acm, file_date, start_file_time ORDER BY seconds ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) AS grp
FROM temp3
)
SELECT
acm, file_date, start_file_time,
MIN(seconds) AS start_seconds,
MAX(seconds) AS stop_seconds
FROM temp4
GROUP BY acm, file_date, start_file_time, grp
HAVING MIN(kind) != MAX(kind)
-- ORDER BY 1, 2, 3, 4
source to share
Hopefully this query will give you the results you are looking for and is able to successfully process your dataset:
SELECT
* EXCEPT(file_data),
ARRAY(SELECT STRUCT(seconds, kind) FROM UNNEST(file_data) WHERE kind IS NOT NULL) file_data
FROM(
SELECT
ACM,
file_date,
start_file_time,
ARRAY(SELECT DISTINCT file_time FROM UNNEST(file_data)) file_times,
ARRAY(SELECT STRUCT(seconds, IF(value = 0, (CASE WHEN ABS(NTH_VALUE(value, 2) OVER(prev)) < 50 AND ABS(NTH_VALUE(value, 2) OVER(next)) >= 50 AND AVG(ABS(value)) OVER(next) >= 50 and AVG(ABS(value)) OVER(prev) < 50 THEN 'start'
WHEN ABS(NTH_VALUE(value, 2) OVER(next)) < 50 AND ABS(NTH_VALUE(value, 2) OVER(prev)) >= 50 AND AVG(ABS(value)) OVER(prev) >= 50 and AVG(ABS(value)) OVER(next) < 50 THEN 'stop' END), NULL) as kind)
FROM UNNEST(file_data) WINDOW prev AS (ORDER BY seconds ROWS 2 PRECEDING), next as(ORDER BY seconds ROWS BETWEEN CURRENT ROW AND 2 FOLLOWING)) file_data
FROM(
SELECT
ACM,
file_date,
TIME_SUB(file_time, INTERVAL CAST(FLOOR(seconds) AS INT64) SECOND) AS start_file_time,
ARRAY_AGG(STRUCT(file_time, value, seconds)) file_data
FROM test
GROUP BY ACM, file_date, start_file_time
)
)
Its results are exactly what you describe as "start" and "stop" in your data test
.
A few notes:
- I avoided costly surgery
JOIN
. - Make the most of ARRAY and STRUCT, which not only increases storage efficiency, but also improves query performance when processing only the desired data, i.e. no need to handle duplicate data.
- There are only 2
WINDOW
, each of which is used inside the corresponding ARRAY structure, which improves performance. Note that this is possible because I have aggregated everything in STRUCT arrays, so there is no need for more complex occlusal clauses as the data is already "sorted". - There is no data duplication in this request.
- Note that the results are in some different structure now, I would recommend using this new one as it is more efficient at data storage and further processing.
Let me know if this works for you.
source to share