NoteThis transformation contains intermediate data preparation and creates a table ‘_temp_sessions’. This table you will use in the following transformations.
Use the SQL code below and change {{projectId}}
and {{datasetName}}
to your own.
#Sessionization of GA4 data
WITH
input AS
(
SELECT
'{{start_suffix}}' AS date_start,
'{{end_suffix}}' AS date_end,
{{session_time}} as session_time,
),
events AS
(
SELECT
--parameters
PARSE_DATE('%Y%m%d', _TABLE_SUFFIX) AS date,
e.event_timestamp AS timestamp,
'owox-office.analytics_327985429.events_' as dataSource,
e.user_pseudo_id,
e.user_id,
e.event_name,
(SELECT IF(STRPOS(value.string_value,'?')>0,SUBSTR(value.string_value,1,STRPOS(value.string_value,'?')-1),value.string_value) FROM UNNEST (event_params) WHERE key = 'page_location') as fullPath,
(SELECT MAX(IF(ep.key='ga_session_id', ep.value.int_value, null)) FROM UNNEST(e.event_params) AS ep) AS ga_session_id,
FIRST_VALUE((SELECT value.string_value FROM UNNEST(event_params) WHERE key = 'source' ) IGNORE nulls) OVER (PARTITION BY user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = 'ga_session_id') ORDER BY event_timestamp ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS session_source,
FIRST_VALUE((SELECT value.string_value FROM UNNEST(event_params) WHERE key = 'medium' ) IGNORE nulls) OVER (PARTITION BY user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = 'ga_session_id') ORDER BY event_timestamp ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS session_medium,
FIRST_VALUE((SELECT value.string_value FROM UNNEST(event_params) WHERE key = 'campaign' ) IGNORE nulls) OVER (PARTITION BY user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = 'ga_session_id') ORDER BY event_timestamp ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS session_campaign,
FIRST_VALUE((SELECT value.string_value FROM UNNEST(event_params) WHERE key = 'term' ) IGNORE nulls) OVER (PARTITION BY user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = 'ga_session_id') ORDER BY event_timestamp ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS session_keyword,
FIRST_VALUE((SELECT value.string_value FROM UNNEST(event_params) WHERE key = 'content' ) IGNORE nulls) OVER (PARTITION BY user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = 'ga_session_id') ORDER BY event_timestamp ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS session_adcontent,
(SELECT MAX(IF(ep.key='ga_session_number', value.int_value, null)) FROM UNNEST (e.event_params) AS ep) as visitNumber,
IF(event_name='session_start',CONCAT(user_pseudo_id,'_',event_timestamp),null) AS session_start_id,
IF(event_name='session_start',1,0) as true_session,
e.platform,
e.device.category AS deviceCategory,
e.device.operating_system AS operatingSystem,
e.device.operating_system_version as operatingSystemInfo,
e.device.browser as browser,
e.device.browser_version as browserVersion,
e.geo.country as country,
e.geo.region as region,
e.geo.city as city,
IF(traffic_source.source is null, '(not set)', traffic_source.source) as user_source,
IF(traffic_source.medium is null, '(not set)', traffic_source.medium) user_medium,
IF(traffic_source.name is null, '(not set)', traffic_source.medium) user_campaign,
--utm markup
(SELECT MAX(IF(ep.key='source', ep.value.string_value, null)) FROM UNNEST(e.event_params) AS ep) AS source,
(SELECT MAX(IF(ep.key='medium', ep.value.string_value, null)) FROM UNNEST(e.event_params) AS ep) AS medium,
(SELECT MAX(IF(ep.key='campaign', ep.value.string_value, null)) FROM UNNEST(e.event_params) AS ep) AS campaign,
(SELECT MAX(IF(ep.key='term', ep.value.string_value, null)) FROM UNNEST(e.event_params) AS ep) AS keyword,
(SELECT MAX(IF(ep.key='content', ep.value.string_value, null)) FROM UNNEST(e.event_params) AS ep) AS adContent,
(SELECT MAX(IF(ep.key='gclid', ep.value.string_value, null)) FROM UNNEST(e.event_params) AS ep) AS gclid,
1 as hits,
IF(event_name='page_view',1,0) as pageviews,
IF(event_name<>'page_view',1,0) as events,
IF(event_name='purchase',e.ecommerce.transaction_id,null) as transaction_id,
IF(event_name='purchase',e.ecommerce.purchase_revenue,0.0) as transactionRevenue,
IF(event_name='user_engagement',1,0) as not_bounces,
0.0 as cost,
0 as impressions,
0 as clicks
FROM
input AS i, `{{project_id}}.{{dataset_id}}.events_*` AS e
WHERE
_TABLE_SUFFIX BETWEEN i.date_start AND i.date_end
--AND
--e.event_name NOT IN ('first_open') --'app_remove','app_clear_data','os_update','app_update'
),
events_utm AS
(
SELECT
* EXCEPT(fullPath),
IF(event_name='session_start',SUBSTR(fullPath,LENGTH(CONCAT((SPLIT(fullPath,'/')[ORDINAL(1)]),(SPLIT(fullPath,'/')[ORDINAL(2)]),(SPLIT(fullPath,'/')[ORDINAL(3)])))+3,LENGTH(fullPath)),null) as landingPage,
IF(event_name='session_start',SPLIT(fullPath,'/')[ORDINAL(3)],null) as landingHost,
SUBSTR(fullPath,LENGTH(CONCAT((SPLIT(fullPath,'/')[ORDINAL(1)]),(SPLIT(fullPath,'/')[ORDINAL(2)]),(SPLIT(fullPath,'/')[ORDINAL(3)])))+3,LENGTH(fullPath)) as page,
(SPLIT(fullPath,'/')[ORDINAL(3)]) as hostName,
CASE
WHEN gclid IS NOT null THEN 1
WHEN source IS NOT null OR medium IS NOT null THEN 1
ELSE null
END AS utm
FROM
events
),
sessions_markup AS --setting session_id for events
(
SELECT
* EXCEPT(prev_timestamp, session_start_mark, session_number),
CONCAT(user_pseudo_id, '_', session_number) AS session_id
FROM
(
SELECT
*,
SUM(session_start_mark) OVER W AS session_number
FROM
(
SELECT
*,
CASE --here you can add as many conditions as you like to interrupt the timeline
WHEN event_name='session_start' THEN 1
WHEN prev_timestamp IS null THEN 1
WHEN timestamp-prev_timestamp>session_time*60*1000*1000 THEN 1 -- the maximum interval between hits without breaking the session - 30 minutes by 60 seconds by 1000 ms by 1000 mcs
WHEN utm=1 THEN 1
ELSE null
END AS session_start_mark
FROM
(
SELECT
*, LAG(timestamp) OVER (PARTITION BY date, user_pseudo_id ORDER BY timestamp ASC, IF(event_name='session_start', 1, 2) ASC) AS prev_timestamp
FROM
events_utm
), input
)
WINDOW W AS
(
PARTITION BY date, user_pseudo_id
ORDER BY timestamp ASC, IF(event_name='session_start', 1, 2) ASC
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
)
)
),
sessions_array AS --main table level - session for one date
(
SELECT
* EXCEPT(session_source, session_medium, session_campaign, session_keyword, session_adContent, source, medium, campaign, keyword, adContent, gclid, events, u),
CASE
WHEN session_id is not null THEN IFNULL(session_source,'(direct)')
ELSE IF(gclid IS NOT null, 'google', IFNULL(source, '(direct)'))
END AS session_source,
CASE
WHEN session_id is not null THEN IFNULL(session_medium,'(none)')
ELSE IF(gclid IS NOT null, 'google', IFNULL(medium, '(none)'))
END AS session_medium,
CASE
WHEN session_id is not null THEN IFNULL(session_campaign,null)
ELSE IF(gclid IS NOT null, IF(source='google' AND medium='cpc', campaign, null), campaign)
END AS session_campaign,
CASE
WHEN session_id is not null THEN IFNULL(session_keyword,null)
ELSE IF(gclid IS NOT null, IF(source='google' AND medium='cpc', keyword, null), keyword)
END AS session_keyword,
CASE
WHEN session_id is not null THEN IFNULL(session_adContent,null)
ELSE IF(gclid IS NOT null, IF(source='google' AND medium='cpc', adContent, null), adContent)
END AS session_adContent,
gclid,
u, events
FROM
(
SELECT
* EXCEPT(events),
--utm markup
(SELECT source FROM UNNEST(events) AS e WHERE e.utm=1) AS source,
(SELECT medium FROM UNNEST(events) AS e WHERE e.utm=1) AS medium,
(SELECT campaign FROM UNNEST(events) AS e WHERE e.utm=1) AS campaign,
(SELECT keyword FROM UNNEST(events) AS e WHERE e.utm=1) AS keyword,
(SELECT adContent FROM UNNEST(events) AS e WHERE e.utm=1) AS adContent,
(SELECT gclid FROM UNNEST(events) AS e WHERE e.utm=1) AS gclid,
(SELECT deviceCategory FROM (SELECT deviceCategory, ROW_NUMBER() OVER(ORDER BY timestamp DESC) AS RN FROM UNNEST(events) AS e WHERE deviceCategory IS NOT null) WHERE RN=1) AS deviceCategory,
(SELECT operatingSystem FROM (SELECT operatingSystem, ROW_NUMBER() OVER(ORDER BY timestamp DESC) AS RN FROM UNNEST(events) AS e WHERE operatingSystem IS NOT null) WHERE RN=1) AS operatingSystem,
(SELECT operatingSystemInfo FROM (SELECT operatingSystemInfo, ROW_NUMBER() OVER(ORDER BY timestamp DESC) AS RN FROM UNNEST(events) AS e WHERE operatingSystemInfo IS NOT null) WHERE RN=1) AS operatingSystemInfo,
(SELECT browser FROM (SELECT browser, ROW_NUMBER() OVER(ORDER BY timestamp DESC) AS RN FROM UNNEST(events) AS e WHERE browser IS NOT null) WHERE RN=1) AS browser,
(SELECT browserVersion FROM (SELECT browserVersion, ROW_NUMBER() OVER(ORDER BY timestamp DESC) AS RN FROM UNNEST(events) AS e WHERE browserVersion IS NOT null) WHERE RN=1) AS browserVersion,
(SELECT country FROM (SELECT country, ROW_NUMBER() OVER(ORDER BY timestamp DESC) AS RN FROM UNNEST(events) AS e WHERE country IS NOT null) WHERE RN=1) AS country,
(SELECT region FROM (SELECT region, ROW_NUMBER() OVER(ORDER BY timestamp DESC) AS RN FROM UNNEST(events) AS e WHERE region IS NOT null) WHERE RN=1) AS region,
(SELECT city FROM (SELECT city, ROW_NUMBER() OVER(ORDER BY timestamp DESC) AS RN FROM UNNEST(events) AS e WHERE city IS NOT null) WHERE RN=1) AS city,
ARRAY(
SELECT AS STRUCT
*
FROM
UNNEST(events)
ORDER BY timestamp ASC
) AS events
FROM
(
SELECT
date,
dataSource,
user_pseudo_id,
session_id as pseudo_session_id,
MAX(landingHost) as landingHost,
MAX(landingPage) as landingPage,
MAX(session_start_id) as session_id,
MAX(session_source) as session_source,
MAX(session_medium) as session_medium,
MAX(session_campaign) as session_campaign,
MAX(session_keyword) as session_keyword,
MAX(session_adContent) as session_adContent,
MAX(true_session) as isTrueSession,
MIN(timestamp) AS sStart,
MAX(timestamp) AS sEnd,
MAX(user_source) as user_source,
MAX(user_medium) as user_medium,
MAX(user_campaign) as user_campaign,
ARRAY(--here is the full list of user_ids that occur in sessions
SELECT AS STRUCT
user_id AS id
FROM
sessions_markup AS t2
WHERE
t2.date=t1.date
AND
t2.user_pseudo_id=t1.user_pseudo_id
AND
t2.session_id=t1.session_id
AND
user_id IS NOT null
GROUP BY 1
) AS u,
ARRAY(
SELECT AS STRUCT
* EXCEPT(date, user_pseudo_id, session_id,session_start_id,true_session,landingPage,landingHost,dataSource, user_source, user_medium, user_campaign, session_source, session_medium, session_campaign, session_keyword, session_adContent)
FROM
sessions_markup AS t2
WHERE
t2.date=t1.date
AND
t2.user_pseudo_id=t1.user_pseudo_id
AND
t2.session_id=t1.session_id
) AS events
FROM
sessions_markup AS t1
GROUP BY 1,2,3,4
)
)
)
SELECT * FROM sessions_array
You will get the data in the table '_temp_sessions' with the schema below:
Field name | Type | |
---|---|---|
date |
DATE | |
dataSource |
STRING | |
user_pseudo_id |
STRING | |
pseudo_session_id |
STRING | |
landingHost |
STRING | |
landingPage |
STRING | |
session_id |
STRING | |
isTrueSession |
INTEGER | |
sStart |
INTEGER | |
sEnd |
INTEGER | |
user_source |
STRING | |
user_medium |
STRING | |
user_campaign |
STRING | |
deviceCategory |
STRING | |
operatingSystem |
STRING | |
operatingSystemInfo |
STRING | |
browser |
STRING | |
browserVersion |
STRING | |
country |
STRING | |
region |
STRING | |
city |
STRING | |
session_source |
STRING | |
session_medium |
STRING | |
session_campaign |
STRING | |
session_keyword |
STRING | |
session_adContent |
STRING | |
gclid |
STRING | |
u |
RECORD | |
u.id |
STRING | |
events |
RECORD | |
events.timestamp |
INTEGER | |
events.user_id |
STRING | |
events.event_name |
STRING | |
events.ga_session_id |
INTEGER | |
events.visitNumber |
INTEGER | |
events.platform |
STRING | |
events.deviceCategory |
STRING | |
events.operatingSystem |
STRING | |
events.operatingSystemInfo |
STRING | |
events.browser |
STRING | |
events.browserVersion |
STRING | |
events.country |
STRING | |
events.region |
STRING | |
events.city |
STRING | |
events.source |
STRING | |
events.medium |
STRING | |
events.campaign |
STRING | |
events.keyword |
STRING | |
events.adContent |
STRING | |
events.gclid |
STRING | |
events.hits |
INTEGER | |
events.pageviews |
INTEGER | |
events.events |
INTEGER | |
events.not_bounces |
INTEGER | |
events.transaction_id |
STRING | |
events.transactionRevenue |
FLOAT | |
events.cost |
FLOAT | |
events.impressions |
INTEGER | |
events.clicks |
INTEGER | |
events.page |
STRING | |
events.hostName |
STRING | |
events.utm |
INTEGER | |
events.date_start |
STRING | |
events.date_end |
STRING | |
events.session_time |
INTEGER |
5 Comments