-
-
Notifications
You must be signed in to change notification settings - Fork 25
Expand file tree
/
Copy pathhtmlElementPopularity.sql
More file actions
47 lines (46 loc) · 963 Bytes
/
htmlElementPopularity.sql
File metadata and controls
47 lines (46 loc) · 963 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
-- getElements: returns the top-level keys of a JSON object payload.
--   payload: JSON text. Anything that is not a plain JSON object (arrays,
--            scalars, null, or text that fails to parse) yields an empty array.
--   returns: ARRAY<STRING> of the object's own keys.
CREATE TEMPORARY FUNCTION getElements(payload STRING)
RETURNS ARRAY<STRING>
LANGUAGE js AS '''
  var parsed;
  try {
    parsed = JSON.parse(payload);
  } catch (err) {
    // Malformed JSON: treat as "no elements" instead of failing the query.
    return [];
  }

  // typeof null is 'object' too, so null must be excluded explicitly.
  var isPlainObject =
    parsed !== null && typeof parsed == 'object' && !Array.isArray(parsed);

  return isPlainObject ? Object.keys(parsed) : [];
''';
-- Popularity of each key found in custom_metrics.element_count, per client:
-- how many distinct root pages have at least one page reporting the key, and
-- what share of that client's distinct root pages this represents.
-- '${YYYY-MM-DD}' is a template placeholder for the date to analyze.
WITH totals AS (
  -- Denominator: distinct root pages per client on the target date.
  SELECT
    date,
    client,
    COUNT(DISTINCT root_page) AS total
  FROM
    `httparchive.crawl.pages`
  WHERE
    date = '${YYYY-MM-DD}'
  GROUP BY
    date,
    client
)

SELECT
  client,
  element,
  COUNT(DISTINCT root_page) AS pages,
  total,
  COUNT(DISTINCT root_page) / total AS pct,
  -- Up to five distinct example page URLs, space-separated.
  ARRAY_TO_STRING(ARRAY_AGG(DISTINCT page LIMIT 5), ' ') AS sample_urls
FROM
  `httparchive.crawl.pages`
INNER JOIN
  totals
USING (date, client)
CROSS JOIN
  -- One output row per key returned by getElements for each page.
  UNNEST(getElements(TO_JSON_STRING(custom_metrics.element_count))) AS element
WHERE
  date = '${YYYY-MM-DD}'
GROUP BY
  client,
  element,
  total
HAVING
  -- Drop the long tail: keep keys seen on at least 10 distinct root pages.
  COUNT(DISTINCT root_page) >= 10
ORDER BY
  pct DESC,
  client