From e76810ce63d40caf69cef38cd0f69bc9cc5a29b2 Mon Sep 17 00:00:00 2001 From: Sushank Sinha Date: Wed, 1 Jul 2026 01:23:33 +0530 Subject: [PATCH 1/3] Dashboard: tab reconciliation, Google Ads channel/tab, per-channel growth charts --- server/index.js | 199 ++++++++++++++++++++++++----- src/App.tsx | 33 +++++ src/components/GoogleAdsTab.tsx | 133 +++++++++++++++++++ src/components/GoogleTab.tsx | 13 +- src/components/LinkedInRawTab.tsx | 13 +- src/components/OverviewTab.tsx | 9 ++ src/components/Sidebar.tsx | 4 +- src/components/UserGrowthChart.tsx | 23 +++- src/components/YouTubeRawTab.tsx | 13 +- src/shared/hooks/useFetchData.ts | 3 +- src/shared/types.ts | 4 +- 11 files changed, 399 insertions(+), 48 deletions(-) create mode 100644 src/components/GoogleAdsTab.tsx diff --git a/server/index.js b/server/index.js index a8191e5..f440737 100644 --- a/server/index.js +++ b/server/index.js @@ -199,18 +199,25 @@ function getPreviousPeriod(start, end) { // Source-classification rules applied to columns named utm_source / utm_medium / referrer_domain. // Reused inside multiple CTEs so all attribution decisions stay consistent. +// Google Ads: utm_source=google (paid: cpc / demand_gen). Checked BEFORE organic so a paid +// click is 'google_ads' even when its referrer is google.com. // Google: organic search only (google.com + country variants, search.google.com, -// Android quick-search). Excludes accounts.google.com (OAuth callback -// that fires on every Google-OAuth signup), mail/gemini/notebooklm/etc. -// YouTube: utm_source=youtube OR YouTube domains. -// LinkedIn: utm_source=linkedin (excluding utm_medium=bio) OR LinkedIn domains. +// Android quick-search). The accounts.google.com OAuth callback (no utm_source) +// stays 'other'; mail/gemini/notebooklm/etc are excluded. +// YouTube: utm_source=youtube OR utm_medium=youtube OR YouTube domains. +// LinkedIn: utm_source=linkedin (excluding utm_medium=bio) OR utm_medium=linkedin OR LinkedIn domains. 
+// utm_medium catches links mis-tagged utm_source=description (a video-description link), +// where utm_medium carries the real channel and utm_campaign the video id. const SOURCE_CASE_SQL = ` CASE WHEN lower(utm_source) = 'linkedin' AND coalesce(lower(utm_medium),'') <> 'bio' THEN 'linkedin' WHEN referrer_domain IN ('linkedin.com','com.linkedin.android','lnkd.in') THEN 'linkedin' + WHEN lower(utm_medium) = 'linkedin' THEN 'linkedin' WHEN lower(utm_source) = 'youtube' THEN 'youtube' WHEN referrer_domain IN ('youtube.com','m.youtube.com','com.google.android.youtube','youtu.be') THEN 'youtube' + WHEN lower(utm_medium) = 'youtube' THEN 'youtube' + WHEN lower(utm_source) = 'google' THEN 'google_ads' WHEN referrer_domain ~* '^(www\\.)?google\\.[a-z.]+$' OR referrer_domain = 'search.google.com' OR referrer_domain = 'com.google.android.googlequicksearchbox' THEN 'google' @@ -357,12 +364,15 @@ attributed_signups AS ( FROM signup_source )`; -// Per-session all-time first-pageview classification, scoped to sessions active in the range. -// "Active" = had ANY umami event during the range. We classify by the session's TRUE first -// pageview (which may predate the range — sessions linger across days), so that the same -// session has the same source/landing_page in attributed_signups and in classified_sessions. -// This guarantees every converted session shows up in source_redirects, so conversion rates -// are well-defined per landing page. Composes after ATTRIBUTED_SIGNUPS_CTES via comma. +// Per-session classification, scoped to sessions active in the range. +// "Active" = had ANY umami event during the range. The landing page (url_path) comes from the +// session's TRUE first pageview (which may predate the range, since sessions linger across days), +// but the SOURCE is resolved the same way as attributed_signups: the earliest source-bearing +// event of the session (session_range_any_source), falling back to the first pageview. 
This keeps +// redirect classification consistent with conversion attribution, so a session whose real source +// sits on a later or custom event is classified by that source here too (not by its first, often +// "Direct", pageview), and every converted session shows up in source_redirects. +// Composes after ATTRIBUTED_SIGNUPS_CTES via comma. const RANGE_CLASSIFIED_SESSIONS_CTES = ` range_active_sessions AS ( SELECT DISTINCT session_id @@ -385,6 +395,45 @@ session_all_time_first_view AS ( AND uwe.event_type = 1 ORDER BY uwe.session_id, uwe.created_at ASC ), +session_range_any_source AS ( + -- The earliest source-bearing event of each active session (any event type). Mirrors + -- session_any_source but spans ALL range-active sessions, not just converting ones. Skips + -- sourceless events and auth-callback / internal referrers. + SELECT DISTINCT ON (uwe.session_id) + uwe.session_id, + uwe.utm_source, + uwe.utm_medium, + uwe.utm_campaign, + uwe.referrer_domain + FROM umami_website_event uwe + WHERE uwe.session_id IN (SELECT session_id FROM range_active_sessions) + AND ( + coalesce(uwe.utm_source, '') <> '' + OR ( + coalesce(uwe.referrer_domain, '') <> '' + AND uwe.referrer_domain NOT IN ( + 'accounts.google.com', 'login.microsoftonline.com', 'login.live.com', + 'appleid.apple.com', 'github.com', 'medblocks.com' + ) + ) + ) + ORDER BY uwe.session_id, uwe.created_at ASC +), +session_range_resolved AS ( + -- Source columns resolved to (earliest source-bearing event, else first pageview); landing + -- fields (url_path/url_query/utm_term) stay from the first pageview. 
+ SELECT + fv.session_id, + fv.url_path, + fv.url_query, + fv.utm_term, + COALESCE(NULLIF(sas.utm_source,''), fv.utm_source) AS utm_source, + COALESCE(NULLIF(sas.utm_medium,''), fv.utm_medium) AS utm_medium, + COALESCE(NULLIF(sas.utm_campaign,''), fv.utm_campaign) AS utm_campaign, + COALESCE(NULLIF(sas.referrer_domain,''), fv.referrer_domain) AS referrer_domain + FROM session_all_time_first_view fv + LEFT JOIN session_range_any_source sas ON sas.session_id = fv.session_id +), classified_sessions AS ( SELECT session_id, @@ -395,7 +444,7 @@ classified_sessions AS ( utm_campaign, utm_term, ${SOURCE_CASE_SQL} AS source - FROM session_all_time_first_view + FROM session_range_resolved )`; // The compact form used by /api/totals — only the attributed_signups CTE. @@ -572,6 +621,7 @@ SELECT (count(*) FILTER (WHERE source = 'linkedin'))::int AS linkedin_conversions, (count(*) FILTER (WHERE source = 'youtube'))::int AS youtube_conversions, (count(*) FILTER (WHERE source = 'google'))::int AS google_conversions, + (count(*) FILTER (WHERE source = 'google_ads'))::int AS google_ads_conversions, (count(*) FILTER (WHERE source = 'other'))::int AS other_conversions FROM attributed_signups`; @@ -586,12 +636,14 @@ FROM attributed_signups`; const linkedinConversions = cur[0]?.linkedin_conversions ?? 0; const youtubeConversions = cur[0]?.youtube_conversions ?? 0; const googleConversions = cur[0]?.google_conversions ?? 0; + const googleAdsConversions = cur[0]?.google_ads_conversions ?? 0; const otherConversions = cur[0]?.other_conversions ?? 0; const prevTotalUsers = prev[0]?.total_users ?? 0; const prevLinkedinConversions = prev[0]?.linkedin_conversions ?? 0; const prevYoutubeConversions = prev[0]?.youtube_conversions ?? 0; const prevGoogleConversions = prev[0]?.google_conversions ?? 0; + const prevGoogleAdsConversions = prev[0]?.google_ads_conversions ?? 0; const prevOtherConversions = prev[0]?.other_conversions ?? 
0; res.json({ @@ -599,11 +651,13 @@ FROM attributed_signups`; linkedinConversions, youtubeConversions, googleConversions, + googleAdsConversions, otherConversions, prevTotalUsers, prevLinkedinConversions, prevYoutubeConversions, prevGoogleConversions, + prevGoogleAdsConversions, prevOtherConversions, }); } finally { @@ -629,10 +683,13 @@ source_redirects AS ( GROUP BY url_path ), source_conversions AS ( - SELECT landing_page AS url_path, count(*)::int AS user_converted + -- Conversions come straight from attributed_signups (same source-of-truth as the Overview). + -- Page-less signups (no entry pageview) fold into one '(no entry page)' bucket so the tab + -- total ties out exactly to the Overview count for this source. + SELECT COALESCE(landing_page, '(no entry page)') AS url_path, count(*)::int AS user_converted FROM attributed_signups - WHERE source = '${source}' AND landing_page IS NOT NULL - GROUP BY landing_page + WHERE source = '${source}' + GROUP BY 1 )${includeQueries ? `, queries_by_path AS ( SELECT @@ -654,13 +711,13 @@ top_queries_by_path AS ( GROUP BY url_path )` : ''} SELECT - r.url_path AS post, - r.redirect_count, + COALESCE(r.url_path, c.url_path) AS post, + COALESCE(r.redirect_count, 0)::int AS redirect_count, COALESCE(c.user_converted, 0)::int AS user_converted${includeQueries ? `, COALESCE(array_to_json(q.queries), '[]'::json) AS queries` : ''} FROM source_redirects r -LEFT JOIN source_conversions c ON c.url_path = r.url_path${includeQueries ? ` -LEFT JOIN top_queries_by_path q ON q.url_path = r.url_path` : ''} +FULL OUTER JOIN source_conversions c ON c.url_path = r.url_path${includeQueries ? ` +LEFT JOIN top_queries_by_path q ON q.url_path = COALESCE(r.url_path, c.url_path)` : ''} ORDER BY user_converted DESC, redirect_count DESC`; } @@ -685,6 +742,78 @@ app.get("/api/google", async (req, res) => { } }); +// Google Ads — landing-page rollup of paid Google signups (utm_source=google, cpc/demand_gen). 
+// Separate from /api/google (organic search). No organic search-query join here, paid clicks +// are not organic queries; the campaign lives in utm_campaign instead. +app.get("/api/google-ads", async (req, res) => { + try { + const { start, end } = asRange(req); + const { prevStart, prevEnd } = getPreviousPeriod(start, end); + const sql = buildSourceLandingPageQuery('google_ads'); + const client = await pool.connect(); + try { + const [{ rows }, { rows: prevRows }] = await Promise.all([ + client.query(sql, [start, end]), + client.query(sql, [prevStart, prevEnd]), + ]); + res.json({ rows, prevRows }); + } finally { + client.release(); + } + } catch (e) { + console.error("Error in /api/google-ads:", e); + res.status(500).json({ error: e.message || "Internal server error" }); + } +}); + +// Per-source signup growth: daily conversions over the last 30 days (bars) + the ALL-TIME +// cumulative total of that channel up to each date (the green line), matching the Overview chart. +// Same shape as /api/user-growth (date / daily_count / cumulative_count) so it reuses the chart. +app.get("/api/source-growth", async (req, res) => { + try { + const source = String(req.query.source || ''); + if (!['linkedin', 'youtube', 'google', 'google_ads'].includes(source)) { + return res.status(400).json({ error: "source must be one of linkedin, youtube, google, google_ads" }); + } + const now = new Date(); + const end = now.toISOString(); + const windowStart = new Date(now.getTime() - 30 * 24 * 60 * 60 * 1000).toISOString(); + const epoch = '2000-01-01'; + // $1=epoch, $2=now -> attributed_signups classifies ALL signups (needed for the all-time + // cumulative). $3=source, $4=windowStart -> the 30-day day axis for the bars. 
+ const growthQuery = `WITH ${ATTRIBUTED_SIGNUPS_CTES}, +channel AS ( + SELECT date_trunc('day', s.date_created) AS d + FROM attributed_signups asg + JOIN signups s ON s.user_id = asg.user_id + WHERE asg.source = $3 +), +day_series AS ( + SELECT generate_series( + date_trunc('day', $4::timestamptz), + date_trunc('day', $2::timestamptz), + '1 day'::interval + ) AS d +) +SELECT + ds.d AS date, + (SELECT count(*) FROM channel c WHERE c.d = ds.d)::int AS daily_count, + (SELECT count(*) FROM channel c WHERE c.d <= ds.d)::int AS cumulative_count +FROM day_series ds +ORDER BY ds.d ASC`; + const client = await pool.connect(); + try { + const { rows } = await client.query(growthQuery, [epoch, end, source, windowStart]); + res.json(rows); + } finally { + client.release(); + } + } catch (e) { + console.error("Error in /api/source-growth:", e); + res.status(500).json({ error: e.message || "Internal server error" }); + } +}); + // Other-source signups — drill-down for everything not LinkedIn/YouTube/Google. // Each row = (landing_page, sub_source) where sub_source labels which kind of "other" // (direct, Brevo email, Google-OAuth callback, Bing/DuckDuckGo, AI chat, internal, etc.). 
@@ -698,10 +827,13 @@ app.get("/api/other", async (req, res) => { CASE WHEN asg.landing_page IS NULL THEN 'No entry pageview' WHEN lower(asg.utm_source) = 'brevo' - OR asg.referrer_domain ILIKE '%sendibm%' THEN 'Brevo / Email' + OR asg.referrer_domain ILIKE '%sendibm%' + OR asg.referrer_domain ILIKE '%brevo%' THEN 'Brevo / Email' WHEN asg.referrer_domain = 'accounts.google.com' THEN 'Google OAuth callback' - WHEN asg.referrer_domain IN ('bing.com','duckduckgo.com','search.brave.com','search.yahoo.com') THEN 'Other search engine' - WHEN asg.referrer_domain IN ('chatgpt.com','perplexity.ai','claude.ai','gemini.google.com','notebooklm.google.com') THEN 'AI chat' + WHEN asg.referrer_domain IN ('bing.com','duckduckgo.com','search.brave.com','search.yahoo.com','ecosia.org') + OR asg.referrer_domain ILIKE '%.search.yahoo.com' THEN 'Other search engine' + WHEN asg.referrer_domain IN ('chatgpt.com','perplexity.ai','claude.ai','gemini.google.com','notebooklm.google.com','copilot.microsoft.com') + OR lower(asg.utm_source) IN ('chatgpt.com','chatgpt','perplexity','perplexity.ai','claude.ai','claude','gemini','copilot') THEN 'AI chat' WHEN asg.referrer_domain = 'medblocks.com' THEN 'Internal' WHEN coalesce(asg.referrer_domain,'') = '' AND coalesce(asg.utm_source,'') = '' THEN 'Direct' ELSE COALESCE(NULLIF(asg.referrer_domain,''), NULLIF(asg.utm_source,''), 'Other') @@ -1061,8 +1193,7 @@ ORDER BY COALESCE(c.user_converted, 0) DESC, r.redirect_count DESC;` // LinkedIn Raw — landing-page rollup of LinkedIn-attributed signups. // Each row = (landing_page) for sessions classified as `linkedin`. Schema mirrors // /api/google's response (post, redirect_count, user_converted) so the existing -// LinkedInRawTab continues to render. content_id is no longer included; with -// last-touch landing-page attribution it is no longer the row key. +// LinkedInRawTab continues to render. content_id is no longer included. 
app.get("/api/linkedin-raw", async (req, res) => { try { const { start, end } = asRange(req); @@ -1145,7 +1276,7 @@ yt_redirects AS ( ), yt_conversions AS ( SELECT - asg.landing_page AS url_path, + COALESCE(asg.landing_page, '(no entry page)') AS url_path, COALESCE( CASE WHEN substring(asg.utm_campaign, '^([A-Za-z0-9_-]{11})(?:[^A-Za-z0-9_-]|$)') !~ '^[0-9]+$' THEN substring(asg.utm_campaign, '^([A-Za-z0-9_-]{11})(?:[^A-Za-z0-9_-]|$)') END, @@ -1157,7 +1288,6 @@ yt_conversions AS ( count(*)::int AS user_converted FROM attributed_signups asg WHERE asg.source = 'youtube' - AND asg.landing_page IS NOT NULL AND coalesce(lower(asg.utm_medium),'') NOT IN ('cpc','paid_video') GROUP BY 1, 2 ), @@ -1172,35 +1302,34 @@ yt_paid_redirects AS ( ), yt_paid_conversions AS ( SELECT - asg.landing_page AS url_path, + COALESCE(asg.landing_page, '(no entry page)') AS url_path, asg.utm_campaign, count(*)::int AS user_converted FROM attributed_signups asg WHERE asg.source = 'youtube' - AND asg.landing_page IS NOT NULL AND coalesce(lower(asg.utm_medium),'') IN ('cpc','paid_video') GROUP BY 1, 2 ) SELECT 'organic'::text AS bucket, - r.url_path AS post, - r.video_id, - r.redirect_count, + COALESCE(r.url_path, c.url_path) AS post, + COALESCE(r.video_id, c.video_id) AS video_id, + COALESCE(r.redirect_count, 0)::int AS redirect_count, COALESCE(c.user_converted, 0)::int AS user_converted, NULL::text AS utm_campaign FROM yt_redirects r -LEFT JOIN yt_conversions c +FULL OUTER JOIN yt_conversions c ON c.url_path = r.url_path AND c.video_id IS NOT DISTINCT FROM r.video_id UNION ALL SELECT 'paid'::text AS bucket, - r.url_path AS post, + COALESCE(r.url_path, c.url_path) AS post, NULL::text AS video_id, - r.redirect_count, + COALESCE(r.redirect_count, 0)::int AS redirect_count, COALESCE(c.user_converted, 0)::int AS user_converted, - r.utm_campaign + COALESCE(r.utm_campaign, c.utm_campaign) AS utm_campaign FROM yt_paid_redirects r -LEFT JOIN yt_paid_conversions c +FULL OUTER JOIN yt_paid_conversions c 
ON c.url_path = r.url_path AND c.utm_campaign IS NOT DISTINCT FROM r.utm_campaign ORDER BY user_converted DESC, redirect_count DESC`; diff --git a/src/App.tsx b/src/App.tsx index 5d4337f..1de9e57 100644 --- a/src/App.tsx +++ b/src/App.tsx @@ -10,6 +10,7 @@ import { Sidebar } from './components/Sidebar' import { DashboardLayout } from './components/DashboardLayout' import { OverviewTab } from './components/OverviewTab' import { GoogleTab } from './components/GoogleTab' +import { GoogleAdsTab } from './components/GoogleAdsTab' import { OtherTab } from './components/OtherTab' import { SearchQueriesTab } from './components/SearchQueriesTab' import { YTSearchRankingTab } from './components/YTSearchRankingTab' @@ -29,6 +30,7 @@ const tabTitles: Record = { 'linkedin-raw': 'LinkedIn Raw Analytics', 'youtube-raw': 'YouTube Raw Analytics', google: 'Google Analytics', + 'google-ads': 'Google Ads', other: 'Other / Unattributed', 'search-queries': 'Search Queries', 'yt-search-ranking': 'YouTube Search Ranking', @@ -52,10 +54,21 @@ function App() { const { data: userGrowthData } = useFetchData('/user-growth', []) // New Endpoint const { data: googleData, loading: googleLoading, error: googleError } = useFetchData<{rows: Row[], prevRows: Row[]}>(`/google${query}`, [query]) + const { data: googleAdsData, loading: googleAdsLoading, error: googleAdsError } = useFetchData<{rows: Row[], prevRows: Row[]}>(`/google-ads${query}`, [query]) + + // Per-channel 30-day growth (daily + cumulative conversions), fetched only for the active channel tab. + const sourceGrowthKey = activeTab === 'linkedin-raw' ? 'linkedin' + : activeTab === 'youtube-raw' ? 'youtube' + : activeTab === 'google' ? 'google' + : activeTab === 'google-ads' ? 'google_ads' + : '' + const { data: sourceGrowthData, loading: sourceGrowthLoading } = useFetchData(sourceGrowthKey ? 
`/source-growth?source=${sourceGrowthKey}` : '', [sourceGrowthKey]) const { data: otherData, loading: otherLoading, error: otherError } = useFetchData<{rows: OtherRow[], prevRows: OtherRow[]}>(`/other${query}`, [query]) const googleRows = googleData?.rows || [] const googlePrevRows = googleData?.prevRows || [] + const googleAdsRows = googleAdsData?.rows || [] + const googleAdsPrevRows = googleAdsData?.prevRows || [] const otherRows = otherData?.rows || [] const otherPrevRows = otherData?.prevRows || [] const { data: searchQueryRows, loading: searchQueriesLoading, error: searchQueriesError } = useFetchData(`/search-queries${query}`, [query]) @@ -126,6 +139,7 @@ function App() { linkedinConversions: totals.prevLinkedinConversions ?? 0, youtubeConversions: totals.prevYoutubeConversions ?? 0, googleConversions: totals.prevGoogleConversions ?? 0, + googleAdsConversions: totals.prevGoogleAdsConversions ?? 0, otherConversions: totals.prevOtherConversions ?? 0, } : null} /> @@ -139,6 +153,8 @@ function App() { prevRows={liRawPrevRows} loading={liRawLoading} error={liRawError} + growthData={sourceGrowthData ?? undefined} + growthLoading={sourceGrowthLoading} /> )} @@ -151,6 +167,8 @@ function App() { paidRows={ytRawPaidRows} loading={ytRawLoading} error={ytRawError} + growthData={sourceGrowthData ?? undefined} + growthLoading={sourceGrowthLoading} /> )} @@ -162,6 +180,21 @@ function App() { prevRows={googlePrevRows} loading={googleLoading} error={googleError} + growthData={sourceGrowthData ?? 
undefined} + growthLoading={sourceGrowthLoading} + /> + )} + + {activeTab === 'google-ads' && ( + )} diff --git a/src/components/GoogleAdsTab.tsx b/src/components/GoogleAdsTab.tsx new file mode 100644 index 0000000..5e7d502 --- /dev/null +++ b/src/components/GoogleAdsTab.tsx @@ -0,0 +1,133 @@ +import type { Row } from '../shared/types' +import { StatsCard } from '../shared/components/StatsCard' +import { ErrorCard } from '../shared/components/ErrorCard' +import { usePerformanceMetrics } from '../shared/hooks/usePerformanceMetrics' +import { formatPercentage } from '../shared/utils/formatters' +import { ExpandableText } from '../shared/components/ExpandableText' +import { UserGrowthChart } from './UserGrowthChart' + +type GoogleAdsTabProps = { + start: Date + end: Date + rows: Row[] + prevRows?: Row[] + loading: boolean + error: string | null + growthData?: any[] + growthLoading?: boolean +} + +export function GoogleAdsTab({ rows, prevRows = [], loading, error, growthData, growthLoading }: GoogleAdsTabProps) { + const totals = usePerformanceMetrics(rows) + const prevTotals = usePerformanceMetrics(prevRows) + + const getTrend = (current: number, prev: number) => { + if (prev === 0) { + return { + direction: (current > 0 ? 'up' : 'neutral') as 'up' | 'neutral', + value: undefined + }; + } + const diff = current - prev; + const pct = (diff / prev) * 100; + return { + direction: (diff > 0 ? 'up' : diff < 0 ? 'down' : 'neutral') as 'up' | 'down' | 'neutral', + value: `${Math.abs(pct).toFixed(1)}%` + }; + }; + + const pagesTrend = getTrend(rows.length, prevRows.length); + const redirectsTrend = getTrend(totals.redirects, prevTotals.redirects); + const conversionsTrend = getTrend(totals.conversions, prevTotals.conversions); + const conversionRate = totals.redirects > 0 ? (totals.conversions / totals.redirects) * 100 : 0; + const prevConversionRate = prevTotals.redirects > 0 ? 
(prevTotals.conversions / prevTotals.redirects) * 100 : 0; + const conversionRateTrend = getTrend(conversionRate, prevConversionRate); + + return ( + <> + {error && } + +
+
+ Google Ads: paid Google signups (utm_source=google, cpc / demand_gen), kept separate from organic Google search. +
+ +
+ + + + 0 ? formatPercentage(conversionRate) : '0%'} + loading={loading} + prevValue={prevTotals.redirects > 0 ? formatPercentage(prevConversionRate) : '0%'} + trend={conversionRateTrend.direction} + trendValue={conversionRateTrend.value} + /> +
+ + + + {loading ? ( +
Loading...
+ ) : ( +
+ + + + + + + + + + + {rows.map((r) => ( + + + + + + + ))} + + + + + + + +
Landing PageRedirectsConversionsRate
{r.redirect_count}{r.user_converted}{r.redirect_count > 0 ? formatPercentage((r.user_converted / r.redirect_count) * 100) : '0%'}
Total{totals.redirects}{totals.conversions}{totals.redirects > 0 ? formatPercentage((totals.conversions / totals.redirects) * 100) : '0%'}
+
+ )} +
+ + ) +} diff --git a/src/components/GoogleTab.tsx b/src/components/GoogleTab.tsx index 095b080..cb8c447 100644 --- a/src/components/GoogleTab.tsx +++ b/src/components/GoogleTab.tsx @@ -4,6 +4,7 @@ import { GooglePerformanceTable } from '../shared/components/GooglePerformanceTa import { ErrorCard } from '../shared/components/ErrorCard' import { usePerformanceMetrics } from '../shared/hooks/usePerformanceMetrics' import { formatPercentage } from '../shared/utils/formatters' +import { UserGrowthChart } from './UserGrowthChart' type GoogleTabProps = { start: Date @@ -12,9 +13,11 @@ type GoogleTabProps = { prevRows?: Row[] loading: boolean error: string | null + growthData?: any[] + growthLoading?: boolean } -export function GoogleTab({ rows, prevRows = [], loading, error }: GoogleTabProps) { +export function GoogleTab({ rows, prevRows = [], loading, error, growthData, growthLoading }: GoogleTabProps) { const totals = usePerformanceMetrics(rows) const prevTotals = usePerformanceMetrics(prevRows) @@ -80,6 +83,14 @@ export function GoogleTab({ rows, prevRows = [], loading, error }: GoogleTabProp /> + + diff --git a/src/components/LinkedInRawTab.tsx b/src/components/LinkedInRawTab.tsx index 3c98040..50704db 100644 --- a/src/components/LinkedInRawTab.tsx +++ b/src/components/LinkedInRawTab.tsx @@ -4,6 +4,7 @@ import { ErrorCard } from '../shared/components/ErrorCard' import { usePerformanceMetrics } from '../shared/hooks/usePerformanceMetrics' import { formatPercentage } from '../shared/utils/formatters' import { ExpandableText } from '../shared/components/ExpandableText' +import { UserGrowthChart } from './UserGrowthChart' type LinkedInRawTabProps = { start: Date @@ -12,9 +13,11 @@ type LinkedInRawTabProps = { prevRows?: LinkedInRawRow[] loading: boolean error: string | null + growthData?: any[] + growthLoading?: boolean } -export function LinkedInRawTab({ rows, prevRows = [], loading, error }: LinkedInRawTabProps) { +export function LinkedInRawTab({ rows, prevRows = [], 
loading, error, growthData, growthLoading }: LinkedInRawTabProps) { const totals = usePerformanceMetrics(rows) const prevTotals = usePerformanceMetrics(prevRows) @@ -84,6 +87,14 @@ export function LinkedInRawTab({ rows, prevRows = [], loading, error }: LinkedIn /> + + {loading ? (
Loading...
) : ( diff --git a/src/components/OverviewTab.tsx b/src/components/OverviewTab.tsx index 21c2909..96e6b50 100644 --- a/src/components/OverviewTab.tsx +++ b/src/components/OverviewTab.tsx @@ -44,6 +44,7 @@ export function OverviewTab({ const liTrend = getTrend(totals?.linkedinConversions, prevTotals?.linkedinConversions); const ytTrend = getTrend(totals?.youtubeConversions, prevTotals?.youtubeConversions); const googleTrend = getTrend(totals?.googleConversions, prevTotals?.googleConversions); + const googleAdsTrend = getTrend(totals?.googleAdsConversions, prevTotals?.googleAdsConversions); const otherTrend = getTrend(totals?.otherConversions, prevTotals?.otherConversions); return ( @@ -95,6 +96,14 @@ export function OverviewTab({ trendValue={googleTrend?.value} prevValue={prevTotals?.googleConversions} /> + ('both'); const chartData = useMemo(() => { @@ -58,7 +67,7 @@ export function UserGrowthChart({ data, loading }: UserGrowthChartProps) { return (
-

User Growth (Last 30 Days)

+

{title}

+ + {/* Custom table with YouTube info */}
{loading ? ( diff --git a/src/shared/hooks/useFetchData.ts b/src/shared/hooks/useFetchData.ts index 5a5ec31..f9631bf 100644 --- a/src/shared/hooks/useFetchData.ts +++ b/src/shared/hooks/useFetchData.ts @@ -29,7 +29,8 @@ export function useFetchData(path: string, dependencies: any[] = []): UseFetc useEffect(() => { let mounted = true - + if (!path) { setLoading(false); return () => { mounted = false } } + const run = async () => { setLoading(true) setError(null) diff --git a/src/shared/types.ts b/src/shared/types.ts index fe385f8..ea65a14 100644 --- a/src/shared/types.ts +++ b/src/shared/types.ts @@ -3,11 +3,13 @@ export interface Totals { linkedinConversions: number youtubeConversions: number googleConversions: number + googleAdsConversions: number otherConversions: number prevTotalUsers?: number prevLinkedinConversions?: number prevYoutubeConversions?: number prevGoogleConversions?: number + prevGoogleAdsConversions?: number prevOtherConversions?: number } @@ -135,7 +137,7 @@ export interface UmamiRawData { topEvents: UmamiEventRow[] } -export type TabType = 'overview' | 'google' | 'other' | 'brevo' | 'search-queries' | 'yt-search-ranking' | 'raw-umami' | 'linkedin-raw' | 'youtube-raw' | 'contact-us' +export type TabType = 'overview' | 'google' | 'google-ads' | 'other' | 'brevo' | 'search-queries' | 'yt-search-ranking' | 'raw-umami' | 'linkedin-raw' | 'youtube-raw' | 'contact-us' // Row in the Other tab: signups not attributed to LinkedIn/YouTube/Google. // sub_source labels the kind (Direct / Brevo / OAuth callback / Bing / etc.). 
From ae322c7ce143b8f04fc1d836d6a0c50f210e9dab Mon Sep 17 00:00:00 2001 From: Sushank Sinha Date: Wed, 1 Jul 2026 01:30:22 +0530 Subject: [PATCH 2/3] docs: update README for first-touch attribution, Google Ads, growth charts; analytics terminology --- AnalyticsTerminology.md | 2543 +++++++++++++++++++++++++++++++++++++++ README.md | 179 ++- 2 files changed, 2656 insertions(+), 66 deletions(-) create mode 100644 AnalyticsTerminology.md diff --git a/AnalyticsTerminology.md b/AnalyticsTerminology.md new file mode 100644 index 0000000..a8360c3 --- /dev/null +++ b/AnalyticsTerminology.md @@ -0,0 +1,2543 @@ +# Analytics Reference Handbook + +> The canonical analytics knowledge base for engineers, product managers, QA engineers, designers, and data analysts. +> +> **Version:** 1.0 +> **Status:** Living document +> **Audience:** Junior developers through senior architects + +This document is intended to be detailed enough that any developer can understand analytics concepts, implement tracking correctly, debug issues, and make future analytics decisions without needing external documentation. + +--- + +## Table of Contents + +1. [Introduction](#1-introduction) +2. [Core Analytics Terminology](#2-core-analytics-terminology) +3. [How Analytics Works](#3-how-analytics-works) +4. [Event Lifecycle](#4-event-lifecycle) +5. [Attribution Models](#5-attribution-models) +6. [Industry Standards](#6-industry-standards) +7. [Analytics Naming Conventions](#7-analytics-naming-conventions) +8. [Event Design Best Practices](#8-event-design-best-practices) +9. [User Identification](#9-user-identification) +10. [Sessions](#10-sessions) +11. [Funnels](#11-funnels) +12. [Cohorts](#12-cohorts) +13. [Retention](#13-retention) +14. [Common Metrics](#14-common-metrics) +15. [Event Taxonomy](#15-event-taxonomy) +16. [Data Quality](#16-data-quality) +17. [Privacy and Compliance](#17-privacy-and-compliance) +18. [Implementation Best Practices](#18-implementation-best-practices) +19. 
[Debugging Analytics](#19-debugging-analytics)
+20. [Real Project Examples](#20-real-project-examples)
+21. [Our Analytics Standards](#21-our-analytics-standards)
+22. [Developer Checklist](#22-developer-checklist)
+23. [FAQ](#23-faq)
+24. [Appendix](#24-appendix)
+
+---
+
+### How to read this document
+
+| If you are a... | Start here |
+|---|---|
+| Junior developer | [Sections 1-4](#1-introduction), then [7-10](#7-analytics-naming-conventions) |
+| Senior engineer / architect | [Sections 3-6](#3-how-analytics-works), [15-18](#15-event-taxonomy) |
+| Product manager | [Sections 1, 5, 11-14](#1-introduction) |
+| QA engineer | [Sections 16, 19, 22](#16-data-quality) |
+| Data analyst | [Sections 5, 12-14, 16](#5-attribution-models) |
+| Designer | [Sections 1, 2, 11](#1-introduction) |
+
+> **Callout legend used throughout this document**
+>
+> - ✅ **Best Practice** - the recommended way to do something
+> - ⚠️ **Common Pitfall** - a mistake people repeatedly make
+> - 💡 **Pro Tip** - an expert shortcut or nuance
+> - 📌 **Important Note** - something you must not miss
+> - 🧪 **Example** - a concrete, practical illustration
+
+---
+
+## 1. Introduction
+
+### 1.1 What is analytics?
+
+**Analytics** is the systematic discipline of collecting, measuring, and interpreting data about how people interact with a product or business, in order to make better decisions.
+
+At its simplest, analytics answers four families of questions:
+
+| Question family | Example | What it tells you |
+|---|---|---|
+| **What happened?** | "How many users signed up last week?" | Descriptive analytics |
+| **Why did it happen?** | "Why did signups drop on Tuesday?" | Diagnostic analytics |
+| **What will happen?** | "How many users will renew next month?" | Predictive analytics |
+| **What should we do?** | "Which feature should we build next?" | Prescriptive analytics |
+
+Analytics is not a single tool or dashboard.
It is a pipeline that begins with a real human action and ends with a human decision.
+
+### 1.2 Why product analytics matters
+
+Without analytics, product teams operate on opinion, memory, and the loudest voice in the room. With analytics, teams operate on evidence.
+
+Concretely, product analytics lets you:
+
+- Discover where users get stuck (drop-off in funnels).
+- Quantify the impact of a feature or experiment.
+- Prioritize the roadmap by data instead of guesswork.
+- Detect regressions (a sudden drop in conversions after a release).
+- Understand which acquisition channels actually produce valuable users.
+- Forecast revenue and retention.
+
+> 📌 **Important Note:** Analytics is a decision-support system, not a vanity scoreboard. Every metric you track should connect to a decision someone could make. If no decision depends on a number, you probably should not be spending engineering effort to track it.
+
+### 1.3 Business analytics vs product analytics
+
+These two disciplines overlap but answer different questions for different audiences.
+
+| Dimension | Business Analytics | Product Analytics |
+|---|---|---|
+| **Primary question** | "Is the company healthy and growing?" | "How do users behave inside the product?" |
+| **Typical metrics** | Revenue, MRR, ARR, CAC, LTV, margin | Feature adoption, funnel conversion, retention, session depth |
+| **Granularity** | Aggregate, financial | Event-level, behavioral |
+| **Primary audience** | Executives, finance, sales | Product managers, engineers, designers |
+| **Time horizon** | Quarterly, annual | Daily, weekly, per-release |
+| **Typical tools** | Looker, Tableau, Power BI, spreadsheets | Mixpanel, Amplitude, PostHog, GA4 |
+| **Source data** | Billing, CRM, ERP, finance systems | Event streams from apps and websites |
+
+> 🧪 **Example:** Business analytics tells you "revenue grew 8% this quarter."
Product analytics tells you "the onboarding checklist feature increased week-1 retention from 34% to 41%, which is what drove the revenue growth." The first describes the outcome; the second explains the mechanism. + +### 1.4 Why event tracking is important + +An **event** is a record that "something happened" at a point in time, performed by someone, with context attached. Event tracking is the foundation that everything else in this document depends on. + +Why events (rather than just page counts) matter: + +- **Events are composable.** From a stream of granular events you can later compute funnels, retention, cohorts, and revenue. You cannot go the other way: aggregate counters cannot be decomposed into the behavior that produced them. +- **Events are future-proof.** You may not know today what question you will ask in six months. A well-designed event stream lets you answer questions you have not thought of yet. +- **Events carry context.** Each event can carry properties (which button, which plan, how much) that make later analysis rich. + +> ⚠️ **Common Pitfall:** Teams often start by tracking only top-line counters ("total signups") and discover months later that they cannot answer "which channel did those signups come from?" because the context was never captured at event time. Capture context when the event happens; you cannot reconstruct it afterward. + +### 1.5 How analytics drives product decisions + +The decision loop looks like this: + +1. **Hypothesis:** "We believe shortening signup to one step will increase conversion." +2. **Instrumentation:** Add events for each signup step. +3. **Measurement:** Observe the funnel before and after the change. +4. **Decision:** Keep, revert, or iterate based on the measured effect. + +This is a continuous loop, not a one-time project. 
+ +### 1.6 The analytics lifecycle + +Every piece of analytics data travels through the same lifecycle, from a human action to a human decision: + +``` +User Action + → Event Generated + → SDK + → Data Collection + → Processing + → Attribution + → Warehouse + → Dashboard + → Decision Making +``` + +```mermaid +flowchart TD + A[User Action
click, view, purchase] --> B[Event Generated
name + properties] + B --> C[SDK
captures + enriches] + C --> D[Data Collection
network request to server] + D --> E[Processing
validate, dedupe, enrich] + E --> F[Attribution
credit assigned to channels] + F --> G[(Data Warehouse
durable storage)] + G --> H[Dashboard
charts, funnels, reports] + H --> I[Decision Making
roadmap, marketing, fixes] + I -.feedback loop.-> A + + style A fill:#e1f5ff,stroke:#0288d1 + style I fill:#e8f5e9,stroke:#388e3c + style G fill:#fff3e0,stroke:#f57c00 +``` + +| Stage | What happens | Who/what owns it | +|---|---|---| +| User Action | A human does something | The user | +| Event Generated | Code decides "this is worth recording" | Application developer | +| SDK | Library captures the event and attaches context | Analytics SDK | +| Data Collection | Event is sent over the network | SDK + ingestion API | +| Processing | Validation, deduplication, enrichment | Analytics platform | +| Attribution | Conversions are credited to marketing touches | Attribution engine | +| Warehouse | Data is stored durably for querying | Data warehouse | +| Dashboard | Humans view aggregated results | BI / analytics tool | +| Decision Making | A human acts on what they see | Product, marketing, eng | + +> 💡 **Pro Tip:** When debugging "the number looks wrong," walk this lifecycle from left to right. The bug is almost always at a specific stage (event never generated, SDK dropped it, network failed, deduplication removed it, attribution credited the wrong channel). Knowing the stages turns a vague "analytics is broken" into a precise diagnosis. See [Section 19](#19-debugging-analytics). + +--- + +## 2. Core Analytics Terminology + +This is the shared vocabulary for the rest of the document. For each term you get a definition, why it matters, an example, and common mistakes. A condensed A-Z version lives in the [Appendix](#241-complete-glossary-a-z). + +### 2.1 User concepts + +#### Anonymous user + +- **Definition:** A user whose real identity is unknown to your system. Tracked only by a device or browser identifier, not a user ID. +- **Why it matters:** Most visitors are anonymous before they sign up. If you cannot track anonymous users, you lose the entire pre-signup journey (which channel brought them, what they browsed). 
+- **Example:** A first-time visitor browsing your pricing page. You know `device_id: d-9f2a...` but not who they are. +- **Common mistakes:** Treating each anonymous visit as a brand-new user, inflating your user counts. Anonymous identity must persist across visits via a stable device/cookie ID. + +#### Identified user + +- **Definition:** A user you have linked to a known identity (a `user_id`) via an explicit `identify` call. +- **Why it matters:** Identification is what lets you connect pre-signup anonymous behavior to post-signup behavior, and stitch the same person across devices. +- **Example:** After login, you call `identify("user_123")`, linking the anonymous device to that account. +- **Common mistakes:** Calling `identify` with a value that is not stable (e.g. an email that can change) instead of an immutable internal ID. + +#### Logged-in user + +- **Definition:** A user with an active authenticated session right now. +- **Why it matters:** Distinguishes current authenticated state from "we know who this is in general." A user can be identified (we know them) but logged out (no active session). +- **Example:** A user who entered credentials and currently holds a valid session token. +- **Common mistakes:** Conflating "logged in" with "identified." See the comparison below. + +> 📌 **Important Note - these are not the same thing:** +> +> | Term | Meaning | Can be true while logged out? | +> |---|---|---| +> | Anonymous | Identity unknown | Yes | +> | Identified | We have linked a `user_id` to this device | Yes (you can still know who they are after logout) | +> | Logged-in | Active authenticated session now | No | + +#### User ID + +- **Definition:** A stable, unique identifier assigned by *your* system to a person/account. +- **Why it matters:** It is the primary key that ties together all of a person's behavior across devices and time. +- **Example:** `user_id: "usr_8c1e0b"`. +- **Common mistakes:** Using email, phone, or username as the user ID. 
These are personally identifiable, can change, and may be shared. Use an opaque internal ID. + +#### Device ID + +- **Definition:** A unique identifier for a specific device or browser, usually generated by the SDK and stored in a cookie, local storage, or device storage. +- **Why it matters:** It is how anonymous users are tracked before they identify, and how a single device's history is grouped. +- **Example:** `device_id: "d-7af3c9e1"` stored in a first-party cookie. +- **Common mistakes:** Regenerating the device ID on every page load (caused by storing it somewhere that does not persist), which fragments one user into many. + +#### Session ID + +- **Definition:** An identifier for a single continuous period of activity. See [Sessions](#10-sessions). +- **Why it matters:** Sessions group events into meaningful "visits" for duration, bounce, and engagement analysis. +- **Example:** `session_id: "s-20260629-0033"`. +- **Common mistakes:** Reusing one session ID forever, or starting a new one on every event. A session must reflect a real bounded visit. + +#### Visitor + +- **Definition:** A loose term, usually a synonym for a unique device/browser that has visited (often anonymous). +- **Why it matters:** "Visitors" is the top of most marketing funnels. +- **Example:** "12,000 unique visitors this week." +- **Common mistakes:** Equating visitors with people. One person on three devices is three visitors until identity stitching merges them. + +#### Returning user + +- **Definition:** A user/visitor who has been seen before (not their first ever visit). +- **Why it matters:** Returning vs new is a basic health signal: a healthy product grows its returning base. +- **Example:** A visitor whose `device_id` was first seen 10 days ago. +- **Common mistakes:** Counting someone as "returning" within the same session. + +#### Active user + +- **Definition:** A user who performed a meaningful action within a time window (the definition of "meaningful" is yours to set). 
+- **Why it matters:** "Active users" (DAU/WAU/MAU) is the headline engagement metric for most products. See [Retention](#13-retention). +- **Example:** A user who logged in and sent at least one message today counts as a daily active user. +- **Common mistakes:** Defining "active" as merely opening the app. Choose an action that reflects real value (see "north-star" thinking in [Section 14](#14-common-metrics)). + +#### New user + +- **Definition:** A user on their first ever interaction with the product. +- **Why it matters:** New-user volume is your acquisition signal; new vs returning splits drive most early-funnel analysis. +- **Example:** A user whose first event ever was 3 minutes ago. +- **Common mistakes:** Re-counting an existing user as "new" because their device ID reset or they appeared on a new device. + +### 2.2 Session concepts + +#### Session + +- **Definition:** A group of events from one user that occur together in time, representing a single visit. +- **Why it matters:** Sessions are the unit for "visit" analysis: how long, how deep, did they bounce. +- **Example:** A 12-minute visit containing 1 page view, 3 clicks, and 1 purchase. +- **Common mistakes:** Comparing session counts across tools without checking that they define sessions the same way (they often do not, see [Section 6](#6-industry-standards)). + +#### Session start + +- **Definition:** The first event of a session, or an explicit `session_start` event. +- **Why it matters:** Marks the beginning of a visit and is where landing page / referrer / UTM are captured. +- **Example:** User opens the app after 40 minutes of inactivity, triggering a new session start. + +#### Session end + +- **Definition:** The point a session is considered over, usually inferred after a timeout of inactivity rather than an explicit action. +- **Why it matters:** Determines session duration and which page is the exit page. +- **Example:** No activity for 30 minutes, so the previous session is closed. 
+- **Common mistakes:** Expecting a clean "session end" event on the web. Browsers often close without warning, so ends are usually inferred. + +#### Session timeout + +- **Definition:** The period of inactivity after which the next event starts a new session. The industry default is **30 minutes**. +- **Why it matters:** It is the single knob that most affects session counts and durations. +- **Example:** With a 30-minute timeout, returning after 31 minutes starts session #2. +- **Common mistakes:** Different timeouts in web vs mobile producing inconsistent session counts. + +#### Session duration + +- **Definition:** Time between the first and last event of a session. +- **Why it matters:** A rough engagement proxy. +- **Example:** First event 10:00:00, last event 10:12:30 → duration 12m30s. +- **Common mistakes:** Single-event sessions have a duration of zero (there is no "last minus first"), which can silently drag down averages. + +#### Bounce + +- **Definition:** A session with no meaningful engagement, classically a single-page session with no interaction. +- **Why it matters:** High bounce on a landing page suggests a mismatch between expectation and content. +- **Example:** User lands, reads nothing, leaves within 2 seconds → bounce. +- **Common mistakes:** Treating all single-page visits as bounces even when the user read a full article for 5 minutes (that is engaged, not a bounce). Modern tools (GA4) define bounce as the inverse of "engaged session," not just single-page. + +#### Engagement + +- **Definition:** Evidence that the user actually interacted meaningfully (time spent, scroll, clicks, conversions). +- **Why it matters:** A better quality signal than raw page views. +- **Example:** GA4 counts a session as "engaged" if it lasts 10+ seconds, has a conversion, or has 2+ page views. + +### 2.3 Page concepts + +#### Page view + +- **Definition:** A record that a web page was loaded/viewed. 
+- **Why it matters:** The fundamental web metric; the basis for traffic and content reports. +- **Example:** `page_view` with `path: "/pricing"`. +- **Common mistakes:** In single-page apps (SPAs), the page does not reload on navigation, so you must fire `page_view` manually on route changes. + +#### Screen view + +- **Definition:** The mobile equivalent of a page view: a screen was shown. +- **Why it matters:** Mobile navigation analytics depend on it. +- **Example:** `screen_view` with `screen_name: "Checkout"`. +- **Common mistakes:** Forgetting to fire screen views on modal/overlay screens. + +#### Landing page + +- **Definition:** The first page a user sees in a session. +- **Why it matters:** It is where acquisition channels deliver users; key for campaign and SEO analysis. +- **Example:** A Google Ad sends users to `/promo/summer`, the landing page. + +#### Exit page + +- **Definition:** The last page a user sees before the session ends. +- **Why it matters:** High exit rates on non-final pages can indicate friction. +- **Example:** Many sessions ending on `/checkout/payment` is a red flag. +- **Common mistakes:** Confusing exit page (last page of any session) with bounce (a single-page session). + +#### Referrer + +- **Definition:** The URL/source that sent the user to your page (from the HTTP `Referer` header or `document.referrer`). +- **Why it matters:** Tells you where traffic came from when UTMs are absent. +- **Example:** `referrer: "https://news.ycombinator.com/"`. +- **Common mistakes:** Relying on referrer alone. It is often stripped (HTTPS→HTTP, privacy settings, in-app browsers). Pair it with UTMs. + +#### Navigation flow + +- **Definition:** The ordered sequence of pages/screens a user moves through. +- **Why it matters:** Reveals real paths users take, which are rarely the ones designers imagined. +- **Example:** `/home → /search → /product/42 → /cart → /checkout`. 
+ +### 2.4 Timing concepts + +#### First touch + +- **Definition:** The very first marketing interaction that brought a user into your orbit, ever. +- **Why it matters:** Core to first-touch attribution (which channel *discovered* this user). See [Section 5](#5-attribution-models). +- **Example:** A user's first touch was a Google Ad 30 days before they bought. + +#### Last touch + +- **Definition:** The most recent marketing interaction before a conversion. +- **Why it matters:** Core to last-touch attribution (which channel *closed* this user). +- **Example:** The user's last touch before buying was an email link. + +#### First visit + +- **Definition:** The first session a user ever has on your product. +- **Why it matters:** Anchors "new user" and cohort definitions. +- **Example:** First visit on 2026-06-01 places the user in the June cohort. + +> 💡 **Pro Tip - first touch vs first visit:** First *touch* is about the marketing channel that earned the attention (an ad impression/click). First *visit* is the first actual session on your property. They often coincide but not always: a user might be "touched" by a billboard or a podcast ad (no click) and *visit* later by typing your URL directly. See [FAQ](#23-faq). + +#### First session + +- **Definition:** A synonym for first visit in most tools; the inaugural session. +- **Why it matters:** Many onboarding metrics ("activated within first session") depend on it. + +#### Event timestamp + +- **Definition:** The time an event is considered to have occurred. +- **Why it matters:** Determines ordering, sessionization, and which day a metric falls on. +- **Example:** `timestamp: "2026-06-29T10:12:30.221Z"`. + +#### Server timestamp + +- **Definition:** The time the analytics server received the event. +- **Why it matters:** Reliable and monotonic; immune to wrong device clocks. +- **Example:** Event created on a phone at 10:00 but received at 10:45 (was offline). Server timestamp = 10:45. 
+ +#### Client timestamp + +- **Definition:** The time the client/device claims the event happened. +- **Why it matters:** More accurate to the user's real action, but only as trustworthy as the device clock. +- **Example:** Client says 10:00; correct if the clock is right. + +> ⚠️ **Common Pitfall - clock skew:** Device clocks are frequently wrong (manually set, wrong timezone, drift). Relying solely on client timestamps produces events "in the future" or out of order. A common fix: record both, and compute `clock_skew = server_received - client_sent` to correct client times. See [Data Quality](#16-data-quality). + +#### Time on page + +- **Definition:** How long a user spent on a particular page/screen. +- **Why it matters:** Engagement and content-quality signal. +- **Example:** 45 seconds on an article page. +- **Common mistakes:** Classically computed as "next page view time minus this page view time," which means the **last** page in a session has no measurable time on page (there is no next view). Modern SDKs use visibility/heartbeat events instead. + +### 2.5 Traffic concepts (UTM parameters) + +**UTM parameters** ("Urchin Tracking Module," named after the company Google acquired to build Google Analytics) are tags you append to a URL to record where a click came from. They are the de-facto standard for campaign attribution. + +A tagged URL looks like: + +``` +https://example.com/signup?utm_source=google&utm_medium=cpc&utm_campaign=summer_sale&utm_term=running+shoes&utm_content=hero_banner +``` + +| Parameter | Question it answers | Example values | Required? | +|---|---|---|---| +| `utm_source` | **Where** did the traffic come from? | `google`, `facebook`, `newsletter`, `youtube` | Strongly recommended | +| `utm_medium` | **What type** of channel? | `cpc`, `paid`, `organic`, `email`, `video`, `social` | Strongly recommended | +| `utm_campaign` | **Which** campaign/initiative? 
| `summer_sale`, `q3_launch` | Recommended | +| `utm_term` | Which **keyword** (paid search)? | `running+shoes` | Optional | +| `utm_content` | Which **creative/variant**? | `hero_banner`, `text_link_a` | Optional (great for A/B) | + +> 🧪 **Example - source / medium pairs you will see constantly:** +> +> | source / medium | Meaning | +> |---|---| +> | `google / cpc` | Paid Google Search click (cost-per-click) | +> | `facebook / paid` | Paid Facebook ad | +> | `linkedin / organic` | Unpaid LinkedIn post/share | +> | `newsletter / email` | A link inside your email newsletter | +> | `youtube / video` | A link from a YouTube video/description | + +#### How UTM values flow through the system + +```mermaid +flowchart LR + A[Marketer builds
tagged URL] --> B[User clicks link
?utm_source=google...] + B --> C[Landing page loads
UTMs in URL query string] + C --> D[SDK reads UTMs
from window.location] + D --> E[SDK stores UTMs
cookie / storage] + E --> F[UTMs attached to
session + events] + F --> G[On conversion,
attribution reads stored UTMs] + G --> H[Dashboard credits
the channel] + + style A fill:#e1f5ff,stroke:#0288d1 + style H fill:#e8f5e9,stroke:#388e3c +``` + +The critical insight: the UTM values exist in the URL **only at the moment of the click**. The SDK must **read and persist** them immediately, because the user will navigate away and the UTMs will vanish from the URL. The stored UTMs are then attached to the session and recalled at conversion time to assign credit. + +> ✅ **Best Practice:** Persist first-touch UTMs (the very first campaign) *and* last-touch UTMs (the most recent campaign) separately. This lets you run different attribution models later without re-instrumenting. See [Attribution Models](#5-attribution-models). + +> ⚠️ **Common Pitfall:** Inconsistent casing and spelling (`Google` vs `google`, `e-mail` vs `email`) fragments your reports into near-duplicate rows. Enforce a controlled vocabulary and lowercase everything. See [Naming Conventions](#7-analytics-naming-conventions). + +### 2.6 Marketing concepts: Pirate Metrics (AARRR) + +**AARRR**, coined by Dave McClure and nicknamed "Pirate Metrics" (because it sounds like "arrr"), is a framework that breaks the customer journey into five stages. Each stage has its own metrics and its own owner. + +```mermaid +flowchart TD + A[Acquisition
How do users find you?] --> B[Activation
Do they have a great first experience?] + B --> C[Retention
Do they come back?] + C --> D[Revenue
Do they pay?] + D --> E[Referral
Do they tell others?] + E -.referrals become
new acquisition.-> A + + style A fill:#e3f2fd,stroke:#1976d2 + style B fill:#e8f5e9,stroke:#388e3c + style C fill:#fff3e0,stroke:#f57c00 + style D fill:#fce4ec,stroke:#c2185b + style E fill:#f3e5f5,stroke:#7b1fa2 +``` + +| Stage | Question | Example metric | Example event(s) | +|---|---|---|---| +| **Acquisition** | How do users find you? | Traffic by channel, signups | `page_view`, `signup_started` | +| **Activation** | Do they reach first value ("aha moment")? | Activation rate | `onboarding_completed`, `first_project_created` | +| **Retention** | Do they come back? | D1/D7/D30 retention | repeat `app_opened`, `session_start` | +| **Revenue** | Do they pay, and how much? | Conversion rate, ARPU, MRR | `subscription_purchased` | +| **Referral** | Do they bring others? | Viral coefficient, referrals sent | `invite_sent`, `referral_signup` | + +> 💡 **Pro Tip:** Many teams add a leading "Awareness" stage (making AAARRR) or swap the order of the last two stages (McClure's original sequence puts Referral before Revenue). The exact letters matter less than the discipline of measuring each stage of the journey rather than only the final sale. + +Individually, the five stages map to deeper concepts: + +- **Acquisition** = getting users to your product (channels, UTMs, [attribution](#5-attribution-models)). +- **Activation** = the first valuable experience; often gated by [onboarding funnels](#11-funnels). +- **Retention** = bringing users back; measured with [cohorts](#12-cohorts) and [retention curves](#13-retention). +- **Revenue** = monetization; measured with [LTV, ARPU, MRR](#14-common-metrics). +- **Referral** = users acquiring users; the only "free" acquisition channel. + +### 2.7 Event concepts + +These are the building blocks of all behavioral analytics. + +#### Event + +- **Definition:** A record that something happened, at a time, by an actor, with context. +- **Why it matters:** The atomic unit of product analytics. +- **Example:** A user clicked "Checkout." 
+ +#### Event name + +- **Definition:** The string identifying the *type* of event. +- **Why it matters:** It is how you group and count the same action across users. Naming is so important it has [its own section](#7-analytics-naming-conventions). +- **Example:** `"checkout_started"`. + +#### Event properties + +- **Definition:** Key-value context about *this specific occurrence* of the event. +- **Why it matters:** Properties let you slice and filter ("checkouts where `cart_value > 100`"). +- **Example:** `{ "cart_value": 129.99, "item_count": 3, "currency": "USD" }`. + +#### User properties + +- **Definition:** Attributes of the *person* (not the event), stored on the user profile and updated over time. +- **Why it matters:** Let you segment users ("show retention for users where `plan = pro`"). +- **Example:** `{ "plan": "pro", "company_size": "50-100", "signup_date": "2026-01-10" }`. + +> 📌 **Important Note - event properties vs user properties:** +> +> | | Event property | User property | +> |---|---|---| +> | Describes | A single event occurrence | The person | +> | Lifespan | Frozen at event time | Mutable; reflects latest known value | +> | Example | `cart_value` on *this* purchase | `lifetime_value` of the user | +> | Set via | `track(event, properties)` | `identify(userId, traits)` | +> +> Mixing these up is one of the most common modeling errors. Ask: "is this about the click, or about the clicker?" + +#### Custom dimensions + +- **Definition:** (GA terminology) user- or event-scoped attributes you define beyond the defaults. Effectively GA's name for custom properties. +- **Example:** A `membership_tier` custom dimension. + +#### Custom metrics + +- **Definition:** (GA terminology) numeric values you define to aggregate (sum/average). +- **Example:** A `video_seconds_watched` custom metric. + +#### Event parameters + +- **Definition:** (GA4 terminology) the key-value pairs sent with an event. GA4's word for event properties. 
+- **Example:** `value`, `currency`, `items` on a `purchase` event. + +#### JSON examples + +A minimal event: + +```json +{ + "event": "button_clicked", + "properties": { + "button_name": "Checkout", + "page": "Cart" + } +} +``` + +A fully enriched event as it might look after the SDK adds context (see [Event Lifecycle](#4-event-lifecycle)): + +```json +{ + "event": "checkout_started", + "event_id": "evt_01HZX8K3M2QF", + "timestamp_client": "2026-06-29T10:12:30.221Z", + "timestamp_server": "2026-06-29T10:12:31.004Z", + "anonymous_id": "d-7af3c9e1", + "user_id": "usr_8c1e0b", + "session_id": "s-20260629-0033", + "properties": { + "cart_value": 129.99, + "item_count": 3, + "currency": "USD", + "coupon_applied": false + }, + "user_properties": { + "plan": "free", + "signup_date": "2026-06-01" + }, + "context": { + "page": { "path": "/cart", "referrer": "https://google.com/" }, + "campaign": { + "source": "google", + "medium": "cpc", + "name": "summer_sale" + }, + "device": { "type": "mobile", "os": "iOS 18.2" }, + "library": { "name": "analytics-js", "version": "5.4.0" } + } +} +``` + +--- + +## 3. How Analytics Works + +This section explains the full pipeline from first principles: how a click in a browser becomes a row in a warehouse. 
+ +### 3.1 The pipeline at a glance + +``` +Browser + ↓ +Analytics SDK + ↓ +Network Request + ↓ +Analytics Server (ingestion API) + ↓ +Storage (raw event store / queue) + ↓ +Data Warehouse + ↓ +Dashboard +``` + +```mermaid +flowchart TD + subgraph Client["Client (browser / app)"] + A[User interaction] --> B[Analytics SDK] + B --> C[Local event buffer/queue] + end + C -->|batched HTTP POST| D[Ingestion API / Collector] + subgraph Backend["Analytics backend"] + D --> E[Validation + deduplication] + E --> F[(Raw event store / stream)] + F --> G[Enrichment + sessionization + attribution] + G --> H[(Data Warehouse)] + end + H --> I[Query engine] + I --> J[Dashboards / funnels / cohorts] + + style A fill:#e1f5ff,stroke:#0288d1 + style J fill:#e8f5e9,stroke:#388e3c + style H fill:#fff3e0,stroke:#f57c00 +``` + +### 3.2 Event collection + +The SDK exposes a small API, typically: + +- `track(eventName, properties)` - record a behavioral event. +- `identify(userId, traits)` - associate the current device with a known user. +- `page()` / `screen()` - record a page or screen view. +- `group()` - associate a user with an account/organization (B2B). +- `alias()` - merge two identities. + +When you call `track`, the SDK does **not** usually send the event immediately. It enriches it (adds IDs, timestamps, context, see [Section 4](#4-event-lifecycle)) and places it into a local buffer. + +```javascript +// Conceptual SDK usage +analytics.identify("usr_8c1e0b", { plan: "free" }); + +analytics.track("checkout_started", { + cart_value: 129.99, + item_count: 3, + currency: "USD" +}); +``` + +### 3.3 Buffering and batch uploads + +Sending one network request per event is wasteful (HTTP overhead, battery drain on mobile, rate limits). Instead, SDKs **buffer** events and **flush** them as a batch. 
+ +Typical flush triggers: + +| Trigger | Example default | +|---|---| +| Batch size reached | every 20 events | +| Time interval elapsed | every 10-30 seconds | +| App backgrounded / page unload | flush immediately | +| Manual `flush()` call | on demand | + +```mermaid +sequenceDiagram + participant App + participant SDK + participant Buffer + participant Server + App->>SDK: track(event A) + SDK->>Buffer: enqueue A + App->>SDK: track(event B) + SDK->>Buffer: enqueue B + Note over Buffer: batch size / timer reached + Buffer->>Server: POST [A, B] + Server-->>Buffer: 200 OK + Note over Buffer: clear flushed events +``` + +> ⚠️ **Common Pitfall:** Events buffered but not yet flushed are lost if the page closes or the app is killed. SDKs mitigate this by flushing on `visibilitychange`/`pagehide` (web) or on background (mobile), and by persisting the buffer to storage. When debugging "my last event before navigation is missing," suspect an unflushed buffer. + +### 3.4 Retry and offline mode + +Networks fail. A robust SDK: + +- **Retries** failed uploads with **exponential backoff** (wait 1s, 2s, 4s, 8s...) plus jitter, so a recovering server is not stampeded. +- **Persists** the queue to durable storage (IndexedDB, local storage, disk), so events survive app restarts. +- Supports **offline mode**: events captured with no connectivity are stored and uploaded when the network returns. + +```javascript +// Pseudo-code: retry with exponential backoff +async function flush(batch, attempt = 0) { + try { + await post("/v1/batch", batch); + } catch (err) { + if (attempt >= MAX_RETRIES) return persistToDeadLetter(batch); + const delay = Math.min(BASE * 2 ** attempt, MAX_DELAY) + + Math.random() * JITTER; + await sleep(delay); + return flush(batch, attempt + 1); + } +} +``` + +> 📌 **Important Note:** Offline mode is exactly why **client timestamps** matter. 
An event created offline at 9:00 and uploaded at 14:00 must keep its 9:00 client timestamp, or your data is wrong by 5 hours. See [clock skew](#24-timing-concepts). + +### 3.5 Event ordering + +Because of batching, retries, and offline mode, events can **arrive out of order**. The server must be able to reconstruct the true sequence. + +How ordering is preserved: + +- **Client timestamps** give the intended order. +- **Monotonic sequence numbers** per device/session (event 1, 2, 3...) break timestamp ties and detect gaps. +- The server reorders by `(session_id, sequence_number)` or `(timestamp_client, received_order)`. + +> 🧪 **Example:** A user adds to cart, then checks out, all offline. Both upload at once. If you sorted by *server* time they could appear simultaneous or reversed. A per-session sequence number (`seq: 1` for add-to-cart, `seq: 2` for checkout) preserves the truth. + +### 3.6 Deduplication and event IDs + +The same event can arrive **more than once**: a retry succeeded on the server but the acknowledgement was lost, so the client retries again. Without protection, you double-count. + +The fix is an **idempotency key**: every event carries a unique `event_id` (often a UUID or ULID) generated **on the client at creation time**. The server keeps a short-term record of seen IDs and discards duplicates. + +```mermaid +flowchart LR + A[Event created
event_id: evt_123] --> B[Sent] + B --> C{Server saw
evt_123 before?} + C -- No --> D[Store + mark seen] + C -- Yes --> E[Discard duplicate] + + style D fill:#e8f5e9,stroke:#388e3c + style E fill:#ffebee,stroke:#c62828 +``` + +| Term | Role | +|---|---| +| **Event ID** | Globally unique key per event, set on the client. Enables deduplication and exactly-once semantics. | +| **Deduplication** | Server-side process of dropping events whose ID was already ingested. | +| **Idempotency** | The property that processing the same event twice has the same effect as once. | + +> ✅ **Best Practice:** Generate the `event_id` on the client at the moment of creation, **not** on the server at receipt. A server-generated ID cannot detect a retried duplicate, because the retry would get a fresh ID. See [FAQ: "Why are events duplicated?"](#23-faq). + +### 3.7 From storage to warehouse to dashboard + +After ingestion and enrichment, events land in durable storage and then a **data warehouse** (BigQuery, Snowflake, Redshift, ClickHouse). The warehouse is optimized for analytical queries over billions of rows. + +- **Storage / raw event store:** append-only, immutable record of every event. The source of truth. +- **Warehouse:** modeled, queryable tables (often transformed via an [ETL/ELT](#21-our-analytics-standards) process). +- **Dashboard:** the visualization layer (funnels, retention curves, charts) that humans read. + +> 💡 **Pro Tip:** Keep raw events immutable. When (not if) you discover a bug in your transformation logic, you want to reprocess from raw rather than having destroyed the original data. "Raw is sacred" is a core data-engineering principle. + +--- + +## 4. Event Lifecycle + +This section traces a single event from a click to a report, naming every field that gets attached and when. 
+ +### 4.1 The stages + +``` +User clicks button + ↓ +SDK captures + ↓ +Properties attached + ↓ +User properties added + ↓ +Session info attached + ↓ +UTM attached + ↓ +Timestamp added + ↓ +Upload + ↓ +Processing + ↓ +Storage + ↓ +Reporting +``` + +```mermaid +flowchart TD + A[User clicks button] --> B[SDK captures event
name = checkout_started] + B --> C[Event properties attached
cart_value, item_count] + C --> D[User properties added
plan, signup_date] + D --> E[Session info attached
session_id, is_first_session] + E --> F[UTM / campaign attached
source, medium, campaign] + F --> G[Identity attached
anonymous_id, user_id] + G --> H[Timestamps + event_id added
client+server time, UUID] + H --> I[Device/context attached
os, app version, locale] + I --> J[Buffer + Upload] + J --> K[Processing
validate, dedupe, sessionize, attribute] + K --> L[(Storage / Warehouse)] + L --> M[Reporting / Dashboard] + + style A fill:#e1f5ff,stroke:#0288d1 + style M fill:#e8f5e9,stroke:#388e3c +``` + +### 4.2 What each stage adds + +| Stage | Field(s) attached | Source | Notes | +|---|---|---|---| +| SDK captures | `event` (name) | Your `track()` call | The only required argument besides properties | +| Properties | `properties.*` | Your `track()` call | Context of this occurrence | +| User properties | `user_properties.*` | Last `identify()` traits | Snapshot of who the user is | +| Session info | `session_id`, `is_first_session` | SDK session manager | See [Sessions](#10-sessions) | +| UTM / campaign | `context.campaign.*` | Persisted from landing URL | First-touch and/or last-touch | +| Identity | `anonymous_id`, `user_id` | SDK identity store | `user_id` null if anonymous | +| Timestamps + ID | `timestamp_client`, `event_id`, `seq` | SDK at creation | `event_id` enables dedup | +| Device / context | `context.device`, `context.os`, `context.library` | SDK auto-collected | Locale, screen size, app version | +| Upload | (none) | network | Batched | +| Processing | `timestamp_server`, enriched attributes | Server | Validation, geo-IP, attribution | +| Storage | (row in warehouse) | Pipeline | Immutable | +| Reporting | (aggregations) | Query layer | Funnels, retention, etc. | + +> 📌 **Important Note:** The order matters. Identity and UTMs must be resolved **before** upload, because the server may not have the context the client had. The client is the only place that knows the current session, the persisted first-touch UTM, and the device clock. 
+ +> 🧪 **Worked example - the same click, fully enriched:** +> +> A user on an iPhone, who arrived two days ago from a Google Ad and signed up yesterday, clicks "Checkout": +> +> ```json +> { +> "event": "checkout_started", +> "event_id": "evt_01HZX8K3M2QF7Y", +> "seq": 7, +> "timestamp_client": "2026-06-29T10:12:30.221Z", +> "timestamp_server": "2026-06-29T10:12:30.998Z", +> "anonymous_id": "d-7af3c9e1", +> "user_id": "usr_8c1e0b", +> "session_id": "s-20260629-0033", +> "is_first_session": false, +> "properties": { "cart_value": 129.99, "item_count": 3, "currency": "USD" }, +> "user_properties": { "plan": "free", "signup_date": "2026-06-28" }, +> "context": { +> "campaign_first_touch": { "source": "google", "medium": "cpc", "name": "summer_sale" }, +> "campaign_last_touch": { "source": "newsletter", "medium": "email", "name": "weekly_digest" }, +> "device": { "type": "mobile", "os": "iOS 18.2", "model": "iPhone15,3" }, +> "page": { "path": "/cart" }, +> "library": { "name": "analytics-js", "version": "5.4.0" } +> } +> } +> ``` +> +> Notice both first-touch (Google) and last-touch (newsletter) campaigns travel with the event, so any attribution model can be applied later. See [Section 5](#5-attribution-models). + +--- + +## 5. Attribution Models + +**Attribution** is the art and science of deciding *which marketing touchpoint(s) deserve credit* for a conversion. It is one of the most consequential and most misunderstood areas of analytics, because it directly drives how marketing budgets are spent. + +### 5.1 The core problem + +Users rarely convert on first contact. A realistic journey: + +``` +Google Ad → Blog post → Newsletter → LinkedIn → Purchase + (Day 1) (Day 3) (Day 8) (Day 12) (Day 14) +``` + +```mermaid +flowchart LR + A[Google Ad
Day 1
first touch] --> B[Blog post
Day 3] + B --> C[Newsletter
Day 8] + C --> D[LinkedIn
Day 12
last touch] + D --> E[💰 Purchase
Day 14] + + style A fill:#e3f2fd,stroke:#1976d2 + style D fill:#fff3e0,stroke:#f57c00 + style E fill:#e8f5e9,stroke:#388e3c +``` + +Four marketing touchpoints preceded the purchase. Who gets credit for the sale? The answer depends on the **attribution model** you choose. Different models will tell you to spend your budget very differently, even on the exact same data. + +> 📌 **Important Note:** Attribution is a *modeling choice*, not a fact. There is no single "correct" answer. The goal is to pick a model whose assumptions match how your buyers actually decide, and to be consistent so trends are comparable over time. + +We will use the journey above (100 USD conversion value) to show how each model assigns credit. + +### 5.2 First-touch attribution + +**100% of credit to the first interaction.** + +| | | +|---|---| +| **Credit split** | Google Ad: 100. All others: 0. | +| **Question it answers** | "Which channel *discovers* new customers?" | +| **Advantages** | Simple; rewards top-of-funnel/awareness channels; good for demand-generation analysis. | +| **Disadvantages** | Ignores everything that nurtured and closed the deal; over-credits the first channel. | +| **When to use** | When your priority is finding new audiences and measuring awareness. | + +### 5.3 Last-touch attribution + +**100% of credit to the final interaction before conversion.** This is the historical default of most tools. + +| | | +|---|---| +| **Credit split** | LinkedIn: 100. All others: 0. | +| **Question it answers** | "Which channel *closes* the sale?" | +| **Advantages** | Dead simple; matches "what happened right before they bought"; easy to explain. | +| **Disadvantages** | Ignores the entire journey that built intent; over-credits bottom-funnel/branded channels. | +| **When to use** | Short sales cycles, or when you only care about the closing channel. 
| + +> ⚠️ **Common Pitfall - "last non-direct" nuance:** Pure last-touch often credits "direct" traffic (user typed your URL), which is not really a marketing channel. Many tools default to **last non-direct touch** to avoid crediting the conversion to "the user already knew us." Know which variant your tool uses, see [FAQ](#23-faq). + +### 5.4 Linear attribution + +**Credit split equally across all touchpoints.** + +| | | +|---|---| +| **Credit split** | Each of 4 touches: 25. | +| **Question it answers** | "What is the full set of channels involved, weighted evenly?" | +| **Advantages** | Acknowledges the whole journey; no single channel is ignored. | +| **Disadvantages** | Pretends all touches are equally important, which is rarely true. | +| **When to use** | Long, multi-touch journeys where every interaction plausibly matters. | + +### 5.5 Position-based (U-shaped) attribution + +**Most credit to the first and last touch, the rest shared among middle touches.** A common split is 40% first, 40% last, 20% shared by the middle. + +| | | +|---|---| +| **Credit split** | Google Ad: 40, LinkedIn: 40, the two middle touches (Blog, Newsletter) share 20 → 10 each. | +| **Question it answers** | "Who discovered and who closed, while still acknowledging the middle?" | +| **Advantages** | Rewards the two arguably most important moments (first interest, final push). | +| **Disadvantages** | The 40/20/40 weighting is a convention, not a derived truth. | +| **When to use** | When both demand generation and closing matter and you want a balanced default. | + +### 5.6 Time-decay attribution + +**More credit to touches closer in time to the conversion.** Credit decays exponentially as you go back (e.g. a 7-day half-life). + +| | | +|---|---| +| **Credit split** | LinkedIn (Day 12) > Newsletter (Day 8) > Blog (Day 3) > Google Ad (Day 1). | +| **Question it answers** | "Which recent touches drove the final decision?" 
| +| **Advantages** | Reflects that recent interactions often weigh more in a decision. | +| **Disadvantages** | Systematically under-credits awareness channels that plant the first seed. | +| **When to use** | Longer sales cycles where momentum builds toward the close. | + +### 5.7 Data-driven attribution + +**Credit assigned by a statistical/ML model that learns each touchpoint's actual incremental contribution** from your historical converting and non-converting paths. + +```mermaid +flowchart TD + A[Thousands of journeys
converting + non-converting] --> B[Model learns:
how much does each channel
change conversion probability?] + B --> C[Credit = measured
incremental lift per touch] + C --> D[Weights differ per
journey and channel mix] + + style A fill:#e1f5ff,stroke:#0288d1 + style D fill:#e8f5e9,stroke:#388e3c +``` + +| | | +|---|---| +| **Credit split** | Determined by the model; e.g. it might learn the newsletter is the real driver and weight it highest, even though it is neither first nor last. | +| **Question it answers** | "What is each channel's *incremental* contribution, empirically?" | +| **Advantages** | Most accurate when data volume is sufficient; reduces human bias. | +| **Disadvantages** | Requires lots of data; is a "black box"; harder to explain; can shift as the model retrains (see [FAQ: "Why is attribution changing?"](#23-faq)). | +| **When to use** | High traffic, mature analytics, where precision justifies complexity. GA4 and large ad platforms default to this. | + +### 5.8 Side-by-side comparison + +Using the 4-touch, 100 USD journey: + +| Touchpoint | First-touch | Last-touch | Linear | Position (40/20/40) | Time-decay | Data-driven | +|---|---|---|---|---|---|---| +| Google Ad (Day 1) | **100** | 0 | 25 | 40 | ~7 | learned | +| Blog (Day 3) | 0 | 0 | 25 | 10 | ~12 | learned | +| Newsletter (Day 8) | 0 | 0 | 25 | 10 | ~24 | learned | +| LinkedIn (Day 12) | 0 | **100** | 25 | 40 | ~57 | learned | +| Purchase (the conversion, not a touch) | n/a | n/a | n/a | n/a | n/a | n/a | +| **Total** | 100 | 100 | 100 | 100 | 100 | 100 | + +(Time-decay figures are illustrative; exact values depend on the half-life.) + +> 💡 **Pro Tip:** Notice every model totals 100. Attribution **redistributes** a fixed amount of credit; it never creates more. The debate is purely about *how to split*, which is why two honest analysts can disagree without either being wrong. 
+ +### 5.9 Choosing a model + +| Your situation | Suggested starting model | +|---|---| +| Short, impulse purchases | Last-touch | +| Awareness/brand-building focus | First-touch | +| Long B2B sales cycle | Position-based or time-decay | +| Lots of data, mature team | Data-driven | +| You are not sure | Position-based (a reasonable balanced default) | + +> ✅ **Best Practice:** Persist enough raw touch data (first-touch and last-touch UTMs, plus the full touch timeline if possible) so you can recompute *any* model later. The worst position is having committed to one model at collection time and being unable to answer "what would last-touch say?" without re-instrumenting. + +--- + +## 6. Industry Standards + +Different platforms make different default choices. Knowing them prevents the classic "the same metric is different in two tools" confusion (it is usually a definition difference, not a bug). + +### 6.1 Platform comparison + +| Platform | Session model | User identification | Default attribution | Event naming style | Best at | +|---|---|---|---|---|---| +| **Google Analytics 4** | Event-based; 30-min timeout, resets on new campaign | `user_id` + Google Signals + device | Data-driven (default) | `snake_case`, recommended event list | Web + app, free, ads integration | +| **Mixpanel** | 30-min inactivity sessions (configurable) | `distinct_id`; `identify`/`alias` merge | Configurable; report-time | `Title Case` or `snake_case` (Object Action) | Product analytics, funnels, retention | +| **Amplitude** | 30-min inactivity (configurable) | `user_id` + `device_id` resolution | Configurable; report-time | `Object Action` (`Song Played`) | Behavioral cohorts, experimentation | +| **Segment** | Does not define sessions; it is a router | `userId` + `anonymousId`; `identify`/`alias` | n/a (forwards to destinations) | `Object Action`, Title Case (spec) | CDP / piping data to many tools | +| **Adobe Analytics** | Visit-based; 30-min timeout | Visitor ID / ECID; CDP via AEP 
| Rule-based + Adobe data-driven | eVars/props/events (numbered) | Enterprise, deep customization | +| **PostHog** | 30-min inactivity sessions | `distinct_id`; `identify`/`alias`/`merge` | Configurable | `snake_case` recommended | Open-source, self-host, product+session replay | + +### 6.2 Notable details and trade-offs + +**Google Analytics 4** + +- Strengths: free, tight Google Ads integration, BigQuery export, cross web+app. +- Weaknesses: sampling on large/complex queries, steeper learning curve, data thresholds. +- Enterprise usage: marketing attribution, web traffic, ad ROAS. +- Quirk: a **new session starts whenever a new campaign/UTM is detected**, even within the timeout window, which can inflate session counts versus other tools. + +**Mixpanel** + +- Strengths: fast funnels and retention, friendly UI, generous identity merging. +- Weaknesses: sessions are a later add-on (it is event-first), cost at scale. +- Enterprise usage: product teams analyzing feature adoption. +- Naming: official guidance favors `Object Action` in Title Case (`Signup Completed`). + +**Amplitude** + +- Strengths: behavioral cohorts, pathfinder, experiment analysis. +- Weaknesses: can be expensive; modeling discipline required to stay clean. +- Naming: `Object Action` (`Song Played`, `Page Viewed`). + +**Segment (a CDP, not an analytics tool)** + +- Strengths: collect once, route to many destinations; consistent schema. +- Weaknesses: it does not analyze; you still need a destination tool. Cost scales with volume. +- Spec: the canonical `track`/`identify`/`page`/`group`/`alias` API that many others mimic. + +**Adobe Analytics** + +- Strengths: extreme enterprise customization, governance, integration with Adobe Experience Cloud. +- Weaknesses: complexity, cost, specialist skills (eVars, props, processing rules). +- Enterprise usage: large enterprises with dedicated analytics teams. 
+ +**PostHog** + +- Strengths: open-source, self-hostable (data sovereignty), bundles analytics + session replay + feature flags + experiments. +- Weaknesses: younger ecosystem; self-hosting has ops cost. +- Enterprise usage: privacy-sensitive orgs, engineering-led product teams. + +> 💡 **Pro Tip - the unifying pattern:** Almost everyone uses a **30-minute inactivity session** and an **`identify`/`alias`/`merge`** identity model derived from the Segment spec. If you learn that core model, every tool feels familiar. The biggest gotcha is GA4 restarting sessions on new campaigns. + +> ⚠️ **Common Pitfall:** Comparing absolute numbers across tools and assuming a discrepancy is a bug. A 5-10% gap between, say, GA4 sessions and Mixpanel sessions is *expected* due to differing session definitions, bot filtering, and consent handling. Compare *trends within one tool*, not absolute numbers across tools. + +--- + +## 7. Analytics Naming Conventions + +Naming is where most analytics implementations quietly rot. Good names are the difference between a self-explanatory dataset and an unusable mess of `btn2`, `ButtonClickFinal`, and `checkout_v3_REAL`. + +### 7.1 The golden rule: pick one convention and enforce it + +| | Good | Bad | +|---|---|---| +| Event name | `checkout_started` | `CheckoutButtonClickedAgain2` | +| Casing | `snake_case` (lowercase, underscores) | `MixedCase Button-Click` | +| Structure | `object_action` (`order_completed`) | `clicked_the_big_green_button` | +| Tense | Past tense (`signup_completed`) | Future/imperative (`do_signup`) | + +### 7.2 snake_case + +Use **lowercase letters and underscores**: `video_played`, `subscription_purchased`. It is unambiguous, URL-safe, SQL-friendly (no quoting needed), and avoids the casing-fragmentation problem where `videoPlayed`, `VideoPlayed`, and `video played` become three different events. + +> 📌 **Important Note:** Some tools (Mixpanel, Amplitude) conventionally use `Object Action` Title Case. 
That is fine; the rule is *consistency within your project*, not which style you pick. This handbook uses `snake_case` as its default. Decide once, document it in [Section 21](#21-our-analytics-standards), and enforce it. + +### 7.3 The object-action pattern + +Name events as `<object>_<action>`, with the object stated as a noun and the action as a past-tense verb: + +| Object | Action | Event name | +|---|---|---| +| order | completed | `order_completed` | +| video | played | `video_played` | +| signup | started | `signup_started` | +| invite | sent | `invite_sent` | + +This groups naturally: all `order_*` events sort together, all `*_completed` events can be found with one filter. It scales far better than free-form names. + +### 7.4 Naming consistency + +- Use the **same word for the same concept** everywhere. Do not mix `signup`, `sign_up`, and `register` for the same action. +- Keep a controlled vocabulary of verbs: `started`, `completed`, `viewed`, `clicked`, `created`, `deleted`, `updated`, `failed`. +- Apply the same discipline to **property names** (`cart_value`, not `cartVal` in one place and `cart_total` in another). + +### 7.5 Versioning + +Events evolve. Strategies, in rough order of preference: + +1. **Add, do not mutate.** Add a new property rather than redefining an old one. Old data stays valid. +2. **Version the property, not the name,** when semantics change: include `schema_version: 2` so consumers know how to interpret it. +3. **Only as a last resort, version the event name** (`checkout_started_v2`) when the meaning fundamentally changed and the old and new must be distinguished. This is ugly; avoid if possible. + +> ⚠️ **Common Pitfall:** Silently changing what an existing event means (e.g. `checkout_started` used to fire on cart open, now fires on the payment page). Historical trends break and nobody knows why. If meaning changes, treat it as a new event or version it explicitly. 
+ +### 7.6 Reserved names + +Most platforms reserve certain event and property names (`$identify`, `page_view`, `session_start`, names beginning with `$` or `ga_` or `firebase_`). Using them collides with built-in behavior. + +> ✅ **Best Practice:** Check your platform's reserved-name list before designing your taxonomy. Avoid prefixes like `$`, `ga_`, `firebase_`, and `mp_` for your custom events. + +### 7.7 Prefixes + +Prefixes help namespace events by domain or surface: + +- By feature area: `billing_invoice_paid`, `auth_login_succeeded`. +- By platform when needed: keep the same event name across platforms and use a `platform` property instead of `web_login` / `ios_login`. One event, one property, is far easier to analyze. + +> 💡 **Pro Tip:** Prefer **one event with a property** over **many near-identical events**. `button_clicked` with `{ button_name: "checkout" }` beats `checkout_button_clicked`, `cancel_button_clicked`, `save_button_clicked`. It keeps your event count low and your analysis flexible. (Caveat: do not over-collapse; truly distinct actions deserve distinct events.) + +--- + +## 8. Event Design Best Practices + +Designing an event well, before you write the code, saves months of cleanup. + +### 8.1 How to design an event + +Ask, in order: + +1. **What decision will this event inform?** If none, do not track it. +2. **What is the object and action?** Gives you the name ([Section 7](#7-analytics-naming-conventions)). +3. **What context will an analyst need to slice this?** Gives you the properties. +4. **Who owns this event** and where is it documented? ([Section 15](#15-event-taxonomy)). + +### 8.2 How many properties? + +Enough to answer your questions, not so many that the event becomes a junk drawer. A practical range is **5-15 properties** for a rich event. If you find yourself adding 40 properties, the event is probably doing too much. 
+ +| Property tier | Definition | Example for `order_completed` | +|---|---|---| +| **Required** | Always present; analysis breaks without them | `order_id`, `value`, `currency` | +| **Recommended** | Present whenever known; enable key slices | `item_count`, `payment_method`, `coupon_code` | +| **Optional** | Nice to have; situational | `gift_wrap`, `delivery_notes` | + +### 8.3 Required vs optional properties + +> ✅ **Best Practice:** Define a small set of **globally required properties** that ride on *every* event (e.g. `app_version`, `platform`, `session_id`). Then per-event required properties on top. Enforce them in a thin tracking wrapper so a missing required property is caught at development time, not discovered in the warehouse months later. + +```javascript +// A thin wrapper that enforces required global properties +function track(event, properties = {}) { + const globals = { + app_version: APP_VERSION, + platform: getPlatform(), + session_id: getSessionId(), + }; + if (!event || !/^[a-z]+(_[a-z]+)+$/.test(event)) { + throw new Error(`Invalid event name: ${event}`); // enforce snake_case + } + analytics.track(event, { ...globals, ...properties }); +} +``` + +### 8.4 Avoid high-cardinality properties + +**Cardinality** is the number of distinct values a property can take. Very high cardinality (unbounded unique values) makes a poor *dimension* to group by, bloats indexes, and is rarely useful as a breakdown. + +| Property | Cardinality | Good as a group-by dimension? | +|---|---|---| +| `plan` (free/pro/enterprise) | Low (3) | ✅ Yes | +| `country` | Medium (~200) | ✅ Yes | +| `user_id` | Very high | ⚠️ As an identifier, yes; as a chart breakdown, no | +| `raw_search_query` | Unbounded | ❌ No (store it, but do not group reports by it) | +| `full_url_with_querystring` | Unbounded | ❌ No (parse out the useful parts instead) | + +> 💡 **Pro Tip:** High-cardinality values are fine to *store* (you may need a raw search query for debugging). 
The pitfall is using them as the *axis* of a chart, which produces a million one-row buckets. Capture raw, but also capture a normalized low-cardinality version for grouping (e.g. `search_query_length_bucket: "1-3 words"`). + +### 8.5 Avoid PII in events + +**PII (Personally Identifiable Information)** includes names, emails, phone numbers, addresses, government IDs, precise location, and payment details. Do not put it in event names or properties unless you have a deliberate, compliant reason and the right controls. + +> 📌 **Important Note:** PII in analytics is a legal and security liability ([GDPR/CCPA](#17-privacy-and-compliance)). Analytics tools are widely shared internally and often replicate data to third parties. Keep PII in your secure systems of record, and reference users in analytics by an opaque `user_id` only. + +| Instead of tracking... | Track... | +|---|---| +| `email: "jane@acme.com"` | `user_id: "usr_8c1e0b"`, `email_domain: "acme.com"` | +| `full_address` | `country`, `postal_code_prefix` | +| `credit_card_number` | `payment_method: "visa"`, `card_last4` only if truly needed | +| `phone_number` | nothing, or a hashed token if matching is required | + +### 8.6 A complete worked example + +> 🧪 **Example - designing `subscription_purchased`:** +> +> Decision it informs: "Which plans and channels drive revenue, and what is ARPU by cohort?" 
+> +> ```json +> { +> "event": "subscription_purchased", +> "properties": { +> "plan": "pro", // low cardinality, required +> "billing_period": "annual", // low cardinality, required +> "value": 199.00, // required for revenue +> "currency": "USD", // required to interpret value +> "is_trial_conversion": true, // recommended, enables slice +> "coupon_code": "LAUNCH20", // recommended (low cardinality if controlled) +> "payment_method": "card" // recommended +> } +> } +> ``` +> +> Notice: no PII, no unbounded properties, every field maps to a question an analyst will ask, and `value` + `currency` always travel together (a revenue number without a currency is meaningless). + +--- + +## 9. User Identification + +Identity is how you connect a stream of anonymous events to a real person, across sessions and devices. Done well, it is invisible. Done badly, it splits one user into many or merges different users into one. + +### 9.1 The identity journey + +``` +Anonymous user + ↓ (browses with device_id only) +Sign up + ↓ (identify call links device_id ↔ user_id) +Merge identities + ↓ (pre-signup anonymous history attributed to the user) +Known user +``` + +```mermaid +flowchart TD + A[Anonymous user
device_id: d-7af3] --> B[Browses pricing, blog
events tagged with device_id] + B --> C[Signs up
user_id: usr_8c1e created] + C --> D[identify usr_8c1e, device d-7af3] + D --> E[Identity stitching
pre-signup events linked to user] + E --> F[Known user
full history, any device] + + style A fill:#e1f5ff,stroke:#0288d1 + style F fill:#e8f5e9,stroke:#388e3c +``` + +### 9.2 Identity stitching + +**Identity stitching** is connecting the anonymous identifier (`device_id` / `anonymous_id`) to the known `user_id` so that the pre-identification behavior is credited to the right person. When you call `identify`, the platform records the mapping and (in most tools) retroactively links prior anonymous events. + +> 🧪 **Example:** A visitor reads three blog posts and the pricing page over two days, all anonymous (`device_id: d-7af3`). On day 3 they sign up; you call `identify("usr_8c1e", ...)`. Stitching ties those earlier blog and pricing views to `usr_8c1e`, so you can now see that "users who read the pricing page convert better," a fact that would be invisible without stitching. + +### 9.3 Aliasing + +**Aliasing** explicitly tells the platform that two identifiers are the *same* person, typically merging a pre-signup anonymous ID with a new `user_id`. Some platforms require an explicit `alias()` call; others merge automatically on `identify`. + +```javascript +// At signup, before the user_id is associated: +analytics.alias("usr_8c1e0b"); // tie anonymous_id -> new user_id +analytics.identify("usr_8c1e0b", { // then set who they are + plan: "free", + signup_date: "2026-06-29" +}); +``` + +> ⚠️ **Common Pitfall:** Calling `alias` more than once for the same user, or aliasing two *real* users together (e.g. on a shared device). Aliasing is often irreversible and can permanently merge two people's data. Alias exactly once, at the identity-creation moment. + +### 9.4 Cross-device tracking + +A single person uses a phone, a laptop, and a tablet. Each has its own `device_id`. **Cross-device tracking** unifies them under one `user_id`. + +```mermaid +flowchart TD + P[Phone
device d-aaa] --> U[user_id: usr_8c1e] + L[Laptop
device d-bbb] --> U + T[Tablet
device d-ccc] --> U + U --> V[Unified user view
all devices, one person] + + style V fill:#e8f5e9,stroke:#388e3c +``` + +The mechanism: the user logs in (or otherwise identifies) on each device, and each device's `device_id` is mapped to the same `user_id`. Before login, the devices remain separate anonymous identities. + +> 📌 **Important Note:** You can only stitch devices that have *all* been identified. A user who logs in on their laptop but only browses anonymously on their phone leaves the phone history unlinked. This is a fundamental limit, not a bug. + +### 9.5 Identity model summary + +| Identifier | Scope | Stable across logout? | Stable across devices? | +|---|---|---|---| +| `anonymous_id` / `device_id` | One browser/device | Yes (until storage cleared) | No | +| `user_id` | One person/account | Yes | Yes (once identified on each device) | +| `session_id` | One visit | No | No | + +> ✅ **Best Practice:** Use an **opaque, immutable internal `user_id`**. Never use email/username as the identity key (it can change, it is PII, and people share accounts). Set the `user_id` as early as you safely can after authentication, and let the platform stitch the prior anonymous activity. + +--- + +## 10. Sessions + +A **session** groups a user's events into a single "visit." Sessions are how you measure visit duration, depth, bounce, and engagement. + +### 10.1 How sessions are created + +Most platforms create sessions implicitly: the **first event after a period of inactivity** starts a new session. There is usually no explicit "start a session" call; the SDK manages a session timer. + +```mermaid +flowchart TD + A[Event arrives] --> B{Active session
exists?} + B -- No --> C[Start new session
new session_id] + B -- Yes --> D{Last event >
timeout ago?} + D -- Yes --> C + D -- No --> E[Continue session
refresh last-activity time] + C --> F[Attach session_id to event] + E --> F + + style C fill:#e8f5e9,stroke:#388e3c +``` + +### 10.2 When sessions expire: the 30-minute timeout + +The industry-standard **session timeout is 30 minutes of inactivity**. If a user does nothing for 30 minutes and then acts, that action begins a new session. + +> 🧪 **Example:** +> - 10:00 user opens app (session A starts) +> - 10:10 user clicks (session A continues; timer resets to 10:10) +> - 10:50 user clicks again. Gap from 10:10 is 40 minutes > 30 → **session B starts.** +> +> Result: two sessions, even though it is "one sitting" loosely. The 30-minute rule is a convention that approximates "the user stepped away." + +### 10.3 Custom timeout + +The 30-minute default does not fit every product. A meditation app with 60-minute sessions, or a trading app where 5 minutes idle means "gone," may configure a different timeout. + +| Product type | Reasonable timeout | +|---|---| +| General web/app | 30 min (default) | +| Long-form content / video | 60+ min | +| High-frequency utility (trading, chat) | shorter, 5-15 min | + +> ⚠️ **Common Pitfall:** Changing the session timeout retroactively changes historical session counts and durations, breaking trend comparisons. Pick a timeout deliberately and document it ([Section 21](#21-our-analytics-standards)); change it rarely and annotate the date on charts. + +### 10.4 Background app behavior (mobile) + +On mobile, "inactivity" is complicated by backgrounding: + +- When the app goes to the **background**, the SDK usually pauses the session timer. +- If the user returns **within the timeout**, the same session resumes. +- If they return **after** the timeout, a new session starts. +- A short background (e.g. answering a notification) should *not* fragment one session into two. + +> 💡 **Pro Tip:** Test the "background for 31 minutes then resume" case explicitly. It is the boundary that most often reveals a misconfigured mobile session manager. 
+ +### 10.5 Web behavior + +On the web: + +- Sessions are tracked via cookies/storage and the same 30-minute inactivity rule. +- A new browser tab to your site is usually the *same* session if within the window. +- **Closing the browser does not reliably end the session** (no guaranteed event fires); the session simply times out. +- GA4 also **starts a new session when a new UTM/campaign is detected**, even mid-window (see [Section 6](#6-industry-standards)). + +### 10.6 Mobile vs web sessions + +| Aspect | Web | Mobile | +|---|---|---| +| Trigger to start | First event / page load | App open / first event | +| Inactivity timer | 30 min default | 30 min default, paused on background | +| Clean end? | No (inferred by timeout) | Sometimes (on background, configurable) | +| Backgrounding | n/a (tab visibility) | Pauses/handles session | +| Campaign restart (GA4) | Yes | Yes | + +> 📌 **Important Note:** Because web and mobile compute sessions slightly differently, a cross-platform "total sessions" number is an approximation. When precision matters, segment by platform. + +--- + +## 11. Funnels + +A **funnel** measures how many users progress through an ordered sequence of steps toward a goal, and where they drop off. It is the single most useful product-analytics tool for finding friction. + +### 11.1 What a funnel shows + +```mermaid +flowchart TD + A["Step 1: Visited pricing
10,000 users (100%)"] --> B["Step 2: Started signup
4,000 users (40%)"] + B --> C["Step 3: Verified email
2,800 users (28%)"] + C --> D["Step 4: Completed setup
1,500 users (15%)"] + D --> E["Step 5: Purchased
600 users (6%)"] + + style A fill:#e3f2fd,stroke:#1976d2 + style E fill:#e8f5e9,stroke:#388e3c +``` + +The big drop from step 1 to step 2 (60% lost) is where you focus. Each step's **conversion rate** is `users_at_step / users_at_previous_step`; the **overall conversion** is `users_at_last_step / users_at_first_step` (here 6%). Note that the percentages in the diagram are cumulative (relative to step 1), not step-to-step: for example, the step 2 → step 3 conversion rate is 2,800 / 4,000 = 70%, even though step 3 is labeled 28%. + +### 11.2 Types of funnels + +| Funnel type | Question | Example steps | +|---|---|---| +| **Marketing funnel** | How does an audience become a lead? | Impression → Click → Landing → Lead | +| **Product funnel** | How do users complete an in-product task? | Open editor → Add item → Save → Share | +| **Conversion funnel** | How do users reach a revenue goal? | Cart → Checkout → Payment → Purchase | + +These overlap; a full journey often chains a marketing funnel into a product funnel into a conversion funnel. + +### 11.3 Funnel analysis nuances + +- **Order matters or not?** A *strict* funnel requires steps in exact order; a *relaxed* funnel allows other events in between. Choose deliberately. +- **Conversion window:** funnels usually require completion within a time window (e.g. "signup within 7 days of first visit"). A 1-hour window and a 30-day window give very different numbers. +- **Unique users vs events:** count each user once per step; otherwise you over-count someone who retried. + +> 🧪 **Example - e-commerce checkout funnel:** +> +> | Step | Event | Users | Step conv. | Drop-off | +> |---|---|---|---|---| +> | 1 | `cart_viewed` | 8,000 | - | - | +> | 2 | `checkout_started` | 5,200 | 65% | 2,800 lost | +> | 3 | `shipping_submitted` | 4,600 | 88% | 600 lost | +> | 4 | `payment_submitted` | 3,900 | 85% | 700 lost | +> | 5 | `order_completed` | 3,600 | 92% | 300 lost | +> +> Overall conversion: 3,600 / 8,000 = **45%**. The worst step is cart → checkout (65%), so that is where a "guest checkout" experiment would be aimed. 
+ +> ✅ **Best Practice:** Instrument *every* funnel step as its own event with consistent properties (`cart_value`, `step_number`). You cannot build a funnel for a step you never tracked, and you cannot diagnose drop-off without properties to segment by (device, plan, country). + +> ⚠️ **Common Pitfall:** Defining a too-tight conversion window so legitimate conversions fall outside it, making the funnel look worse than reality. Match the window to your real buying cycle. + +--- + +## 12. Cohorts + +A **cohort** is a group of users who share a defining characteristic, usually *when they started* or *what they did*. Cohort analysis compares groups over time and is the backbone of retention and behavioral analysis. + +### 12.1 Static cohorts + +- **Definition:** A fixed list of users, frozen at creation. Membership never changes. +- **Use:** "Users who signed up during the March launch." You want to track *those exact people* forever. +- **Example:** Export the 1,200 users from the launch week and follow their retention for a year. + +### 12.2 Dynamic cohorts + +- **Definition:** A rule-based group whose membership is recomputed continuously. Users enter and leave as they meet or stop meeting the criteria. +- **Use:** "Users active in the last 7 days." The set changes daily. +- **Example:** A "power users" cohort defined as "sent 10+ messages in the last week" automatically gains and loses members. + +### 12.3 Behavioral cohorts + +- **Definition:** A group defined by *actions taken* (a special, powerful kind of dynamic cohort). +- **Use:** Compare outcomes of users who did X versus those who did not. +- **Example:** "Users who created a project in their first session" vs "users who did not," compared on Day-30 retention. If the first cohort retains far better, "create a project in session one" becomes your activation goal. + +| Cohort type | Membership | Changes over time? 
| Typical use | +|---|---|---|---| +| Static | Fixed list | No | Track a specific group long-term | +| Dynamic | Rule-based | Yes | Live segments (active, at-risk) | +| Behavioral | Action-based | Yes | Find what behaviors drive retention | + +```mermaid +flowchart LR + subgraph Behavioral cohort comparison + A[Users who created a
project in session 1] --> A2[Day-30 retention: 58%] + B[Users who did NOT] --> B2[Day-30 retention: 19%] + end + A2 --> C[Insight: drive project
creation in onboarding] + B2 --> C + + style C fill:#e8f5e9,stroke:#388e3c +``` + +> 💡 **Pro Tip:** Behavioral cohorts are how you discover your **activation event**, the early action most correlated with long-term retention (Facebook's famous "7 friends in 10 days," Slack's "2,000 messages sent"). Find yours, then redesign onboarding to drive users toward it. See [Retention](#13-retention) and [Funnels](#11-funnels). + +--- + +## 13. Retention + +**Retention** measures whether users come back over time. It is arguably the single most important indicator of product-market fit: acquisition without retention is a leaky bucket. + +### 13.1 The retention curve + +Retention is usually shown as the percentage of a cohort still active N days after they started. + +```mermaid +flowchart LR + D0["Day 0
100%"] --> D1["Day 1
40%"] + D1 --> D7["Day 7
22%"] + D7 --> D30["Day 30
15%"] + D30 --> D90["Day 90
12% (flattens)"] + + style D0 fill:#e3f2fd,stroke:#1976d2 + style D90 fill:#e8f5e9,stroke:#388e3c +``` + +A healthy curve **flattens** into a stable plateau (a retained core), rather than decaying to zero. The plateau height is your long-term retained fraction. + +### 13.2 Day 1, Day 7, Day 30 + +These are checkpoints on the curve: + +- **Day 1 (D1):** Did they come back the next day? An early signal of first-experience quality. +- **Day 7 (D7):** Weekly habit forming? +- **Day 30 (D30):** Durable, monthly-relevant? + +> 🧪 **Example:** D1 = 40%, D7 = 22%, D30 = 15% means of 100 new users, 40 returned on day 1, 22 in the day-7 window, 15 in the day-30 window. + +### 13.3 Classic vs rolling retention + +This distinction trips up many people: + +| | Classic (N-day / bracket) retention | Rolling (unbounded) retention | +|---|---|---| +| **Counts a user as retained on Day N if...** | they were active *on exactly Day N* (or in that day's bracket) | they were active *on Day N or any day after* | +| **Strictness** | Stricter; misses users who skip day N but return later | More forgiving; counts anyone who ever came back at/after N | +| **Best for** | Daily-habit products (where daily return matters) | Infrequent-use products (where "still a user" matters more than daily) | + +> 🧪 **Example:** A user active on Day 0, absent Day 7, but active Day 9. +> - **Classic D7 retention:** not retained (was not active on day 7). +> - **Rolling D7 retention:** retained (active on day 9, which is ≥ day 7). +> +> Choose the definition that matches how often your product is *meant* to be used. Reporting the wrong one makes a healthy infrequent-use product look like it is dying. 
+ +### 13.4 Stickiness: DAU, WAU, MAU, and the ratio + +| Metric | Meaning | +|---|---| +| **DAU** | Daily Active Users: unique users active in a day | +| **WAU** | Weekly Active Users: unique users active in a 7-day window | +| **MAU** | Monthly Active Users: unique users active in a 30-day window | +| **DAU/MAU** | **Stickiness ratio**: fraction of monthly users who use it on an average day | + +**Stickiness = DAU / MAU.** It approximates "how many days per month does an average active user show up." + +> 🧪 **Example:** DAU = 20,000, MAU = 100,000 → DAU/MAU = 0.20 (20%), meaning the average monthly user is active about 6 days a month (0.20 × 30). For a daily-habit product, 20%+ is decent and 50%+ is excellent; for an inherently weekly product, a lower ratio is normal and not alarming. + +> 📌 **Important Note:** "Active" must be defined by a *meaningful* action (see [Active user](#21-user-concepts)). If "active" just means "opened the app," your DAU/MAU is measuring notifications, not value. + +> ✅ **Best Practice:** Pair retention with a behavioral cohort analysis ([Section 12](#12-cohorts)) to find *why* the retained core retains, then drive new users toward that behavior. Retention tells you *whether*; cohorts tell you *why*. + +--- + +## 14. Common Metrics + +Definitions, formulas, and worked examples for the metrics you will be asked about constantly. Currency examples use USD. + +### 14.1 Engagement and conversion metrics + +| Metric | Formula | What it measures | +|---|---|---| +| **CTR** (Click-Through Rate) | `clicks / impressions` | How compelling a link/ad/element is | +| **Conversion Rate** | `conversions / total_users (or sessions)` | How well you turn visitors into goal-completers | +| **Bounce Rate** | `bounced_sessions / total_sessions` | Share of sessions with no meaningful engagement | +| **Engagement Rate** | `engaged_sessions / total_sessions` | Inverse of bounce (GA4 style) | +| **Avg. 
Session Duration** | `total_session_time / sessions` | Rough depth of visits | +| **Retention Rate** | `users_active_in_period / cohort_size` | Share of a cohort that returns | + +> 🧪 **Worked examples:** +> - **CTR:** 500 clicks on 50,000 impressions → 500 / 50,000 = **1.0%**. +> - **Conversion Rate:** 600 purchases from 10,000 visitors → 600 / 10,000 = **6.0%**. +> - **Bounce Rate:** 3,000 bounced of 8,000 sessions → 3,000 / 8,000 = **37.5%**. +> - **Retention Rate (D7):** 220 of a 1,000-user cohort active in the day-7 window → **22%**. + +### 14.2 Business and revenue metrics + +| Metric | Formula | What it measures | +|---|---|---| +| **CAC** (Customer Acquisition Cost) | `total_acquisition_spend / new_customers` | What it costs to win a customer | +| **LTV** (Lifetime Value) | `ARPU × average_customer_lifespan` (simple) | Total value a customer brings | +| **ROAS** (Return on Ad Spend) | `revenue_from_ads / ad_spend` | Revenue per advertising dollar | +| **ROI** (Return on Investment) | `(gain − cost) / cost` | General return on any spend | +| **ARPU** (Avg Revenue Per User) | `total_revenue / total_users` | Revenue intensity per user | +| **MRR** (Monthly Recurring Revenue) | `sum of monthly subscription value` | Predictable monthly subscription revenue | +| **ARR** (Annual Recurring Revenue) | `MRR × 12` | Annualized recurring revenue | + +> 🧪 **Worked examples:** +> - **CAC:** spent 50,000 USD on ads, gained 500 customers → 50,000 / 500 = **100 USD per customer**. +> - **LTV (simple):** ARPU 20 USD/month, average lifespan 18 months → 20 × 18 = **360 USD**. +> - **LTV:CAC ratio:** 360 / 100 = **3.6:1**. A common rule of thumb is that healthy SaaS wants **LTV:CAC ≥ 3:1**. +> - **ROAS:** 40,000 USD revenue from 10,000 USD ad spend → 40,000 / 10,000 = **4.0** (often written 4:1 or 400%). +> - **ROI:** gain 40,000, cost 10,000 → (40,000 − 10,000) / 10,000 = **3.0 (300%)**. +> - **ARPU:** 200,000 USD revenue across 100,000 users → **2.00 USD per user**. 
+> - **MRR:** 1,000 subscribers averaging 25 USD/month → **25,000 USD MRR**. **ARR** = 25,000 × 12 = **300,000 USD**. + +> 📌 **Important Note - LTV:CAC is the unit-economics heartbeat.** If CAC > LTV, you lose money on every customer; growth makes it worse, not better. Watch this ratio before scaling acquisition spend. + +> 💡 **Pro Tip:** ROAS and ROI look similar but differ: ROAS is *gross* revenue per ad dollar (ignores costs and margin), while ROI is *net* return after costs. A 4:1 ROAS can still be unprofitable if your product margin is below 25%. Always ask "ROAS on revenue or on margin?" + +--- + +## 15. Event Taxonomy + +A **taxonomy** is the organized, governed catalog of all your events and properties. Without one, every team invents its own names and the dataset becomes unusable within a year. The taxonomy is what makes analytics *scale across teams*. + +### 15.1 How to organize events + +Group events by **domain/feature area**, then by object, then by action: + +``` +analytics/ + auth/ + auth_login_succeeded + auth_login_failed + auth_signup_started + auth_signup_completed + billing/ + billing_checkout_started + billing_subscription_purchased + billing_subscription_canceled + content/ + content_post_created + content_post_published + content_post_shared + onboarding/ + onboarding_step_viewed + onboarding_completed +``` + +This is a *logical* grouping (in a spec/registry), not necessarily a folder on disk, though many teams do keep a tracking-plan file or directory mirroring it. + +### 15.2 Ownership + +Every event should have an **owner** (a team or person) responsible for its correctness and meaning. Orphaned events are the ones that silently break. 
+ +| Event domain | Owner | +|---|---| +| `auth_*` | Identity team | +| `billing_*` | Payments team | +| `content_*` | Content/editor team | +| `onboarding_*` | Growth team | + +### 15.3 Governance + +**Governance** is the lightweight process that keeps the taxonomy clean: + +- A **tracking plan**: the single source of truth listing every event, its properties, types, and owner. (Tools: a spreadsheet, Avo, Segment Protocols, or a YAML/JSON schema in your repo.) +- A **review step**: new events are reviewed before shipping (naming, properties, no PII, no duplication of an existing event). +- **Validation**: schemas enforced in CI or at ingestion so bad events are caught early ([Section 16](#16-data-quality)). + +### 15.4 Versioning and deprecation + +- **Versioning:** when an event's schema must change, prefer adding properties or a `schema_version` over redefining meaning ([Section 7.5](#75-versioning)). +- **Deprecation:** when retiring an event, mark it **deprecated** in the tracking plan with a date and a replacement, keep accepting it for a grace period, then stop. Never silently delete an event; downstream dashboards depend on it. + +```mermaid +flowchart LR + A[Proposed] --> B[Reviewed
naming, props, PII] + B --> C[Active] + C --> D[Deprecated
dated + replacement noted] + D --> E[Removed
after grace period] + + style C fill:#e8f5e9,stroke:#388e3c + style E fill:#ffebee,stroke:#c62828 +``` + +> ✅ **Best Practice:** Treat the tracking plan as code: version it, review changes via pull request, and validate events against it automatically. Analytics that is "documented in someone's head" does not survive team growth. + +> ⚠️ **Common Pitfall:** Letting any engineer add any event with no review. Within months you have `signup`, `sign_up`, `SignUp`, `user_registered`, and `registration_complete`, all meaning the same thing, and no report can be trusted. Governance is cheap; cleanup is not. + +--- + +## 16. Data Quality + +Analytics is only as trustworthy as its data. This section catalogs the failure modes and how to defend against them. + +### 16.1 The common failure modes + +| Problem | Symptom | Typical cause | Defense | +|---|---|---|---| +| **Missing events** | Counts lower than reality | Buffer not flushed, ad-blockers, code path not instrumented | Flush on unload; server-side tracking; coverage tests | +| **Duplicate events** | Counts inflated | Retries without idempotency, double-firing handlers | Client-side `event_id` + server dedup ([Section 3.6](#36-deduplication-and-event-ids)) | +| **Late events** | Yesterday's numbers change today | Offline/queued events arriving late | Use event (client) timestamp; allow a "lookback" reprocessing window | +| **Timezone issues** | Daily counts shift by hours | Mixing UTC and local time | Store everything in UTC; convert only at display | +| **Clock skew** | Events "in the future" or misordered | Wrong device clocks | Record server time too; correct via skew offset | +| **Null / missing values** | Broken segments, "(not set)" rows | Optional props omitted, races | Required-property enforcement; sensible defaults | +| **Schema drift** | New property types break queries | Unreviewed event changes | Schema validation in CI / at ingestion | + +### 16.2 Timezones and clock skew in depth + +> 📌 **Important Note - 
store in UTC, display in local.** Every timestamp in your pipeline should be UTC. Convert to a user's or analyst's local timezone *only* at the presentation layer. Mixing zones at storage time is a permanent, hard-to-unwind corruption. + +**Clock skew** correction pattern: + +```javascript +// On the client, send when the event was created (client clock). +// On the server, also record receipt time (trusted clock). +// skew = server_received - client_sent (per device, smoothed) +// corrected_event_time = client_event_time + skew +``` + +This keeps the *true ordering and relative timing* from the client while anchoring to a trusted absolute clock. + +### 16.3 Late events and reprocessing + +Because of offline mode, an event from "yesterday" can arrive "today." Two implications: + +- **Yesterday's metrics are not final** until your lookback window closes (commonly 24-72 hours). +- Your warehouse jobs should **reprocess** recent days, not only the current one, so late arrivals land in the correct (event-time) day. + +### 16.4 Schema validation and monitoring + +```javascript +// Example: validate an event against a schema before sending +const checkoutSchema = { + required: ["cart_value", "currency"], + types: { cart_value: "number", currency: "string", item_count: "number" } +}; + +function validate(event, props, schema) { + for (const key of schema.required) { + if (props[key] == null) throw new Error(`Missing required: ${key}`); + } + for (const [key, type] of Object.entries(schema.types)) { + if (props[key] != null && typeof props[key] !== type) { + throw new Error(`Wrong type for ${key}: expected ${type}`); + } + } +} +``` + +**Monitoring** to put in place: + +- **Volume alerts:** event volume drops >X% versus the same weekday last week → likely a broken release. +- **Schema alerts:** a new/unexpected property or type appears → drift. +- **Null-rate alerts:** a required property's null rate spikes → instrumentation bug. 
+- **Freshness alerts:** the pipeline has not received events in N minutes → ingestion outage. + +> 💡 **Pro Tip:** The most valuable single alert is "event volume dropped sharply." It catches the most damaging and most common failure (a release that broke tracking) before stakeholders notice their dashboards went flat. + +> ⚠️ **Common Pitfall:** Discovering a tracking bug *weeks* later, by which point the data is permanently incomplete (you cannot retroactively capture events that were never sent). Monitoring exists precisely to shrink that detection gap to hours. + +--- + +## 17. Privacy and Compliance + +Analytics collects data about people, which makes it subject to privacy law and ethical obligation. Non-compliance carries real fines (GDPR penalties reach into the tens of millions of euros) and real reputational cost. + +> 📌 **Important Note:** This section is general guidance for engineers, **not legal advice.** Always confirm specifics with your organization's legal/privacy team and your [internal standards](#21-our-analytics-standards). + +### 17.1 Key regulations + +| Regulation | Region | Core requirement (simplified) | +|---|---|---| +| **GDPR** | EU / EEA | Lawful basis (often consent) before processing personal data; rights to access, delete, port; data minimization | +| **CCPA / CPRA** | California | Right to know, delete, and opt out of "sale/sharing" of personal info | +| Others (LGPD, PIPEDA, etc.) | Brazil, Canada, ... | Broadly similar consent + rights frameworks | + +### 17.2 Cookie consent and consent management + +Many analytics identifiers rely on cookies/storage, which in the EU generally require **prior consent**. + +- A **Consent Management Platform (CMP)** presents the consent banner and records the user's choices. +- Your SDK must **respect** that choice: do not set tracking cookies or send identified events until consent is granted (for the categories that require it). 
+- **Consent mode**: some platforms support a degraded, cookieless "no-consent" mode that collects only aggregate, non-identifying signals. + +```mermaid +flowchart TD + A[User arrives] --> B[CMP shows consent banner] + B --> C{Consent given?} + C -- Yes --> D[Full tracking
cookies + identified events] + C -- No --> E[No tracking, or
cookieless aggregate mode] + C -- Not yet --> F[Hold events
until decision] + + style D fill:#e8f5e9,stroke:#388e3c + style E fill:#fff3e0,stroke:#f57c00 +``` + +### 17.3 PII and data minimization + +- **PII** (see [Section 8.5](#85-avoid-pii-in-events)) should be kept out of analytics by default. +- **Data minimization** (a GDPR principle): collect only what you actually need for a defined purpose. "We might want it someday" is not a lawful basis. + +### 17.4 Retention policies + +You should not keep raw personal data forever. A **retention policy** defines how long each data class is kept before deletion or anonymization. + +| Data class | Example retention | +|---|---| +| Raw identified events | 14-26 months (GA4 caps at 14 months for some data) | +| Aggregated/anonymized metrics | Often indefinite (no longer personal) | +| Consent records | Kept as proof, per legal guidance | + +### 17.5 Honoring user rights + +Build the operational ability to: + +- **Delete** a user's data on request (right to erasure). Your `user_id` mapping makes this feasible; PII scattered through event properties makes it a nightmare, another reason to keep PII out. +- **Export** a user's data (right to access/portability). +- **Opt out** of tracking and have that choice persist. + +> ✅ **Best Practice:** Design for deletion from day one. If every personal record is keyed by an opaque `user_id`, a deletion request is "delete rows where user_id = X." If PII is smeared across raw event text, it is nearly impossible. Privacy-by-design is cheaper than privacy-by-cleanup. + +> ⚠️ **Common Pitfall:** Putting an email or name directly into an event property "just for convenience." It becomes PII you must now find, secure, delete on request, and disclose in audits, across every downstream tool the event was forwarded to. + +--- + +## 18. Implementation Best Practices + +Where and how you fire events matters as much as what you fire. The big architectural choice is **client-side vs server-side** tracking. 
+ +### 18.1 Client-side vs server-side tracking + +| | Client-side (frontend/mobile) | Server-side (backend/API) | +|---|---|---| +| **Fires from** | Browser/app | Your server | +| **Good for** | UI interactions, page/screen views, clicks | Transactions, payments, anything that must be reliable | +| **Reliability** | Lower (ad-blockers, network, closed tabs) | Higher (you control it) | +| **Context richness** | Rich UI context, device, UTMs | Authoritative business data | +| **Privacy** | Exposes keys to client; blockable | Keys hidden; harder to block | +| **Latency** | Immediate | Slight (server round-trip) | + +> ✅ **Best Practice - hybrid tracking.** Use **client-side** for UI/engagement (clicks, views, scrolls) and **server-side** for the events you cannot afford to lose (`order_completed`, `subscription_purchased`, `payment_failed`). Critical money events fired from the server are immune to ad-blockers and closed tabs. + +### 18.2 Frontend + +- Initialize the SDK once, early. +- Fire `page`/`screen` on every route change (manually in SPAs, see [Section 2.3](#23-page-concepts)). +- Wrap `track` in a thin helper that enforces required properties and naming ([Section 8.3](#83-required-vs-optional-properties)). +- Flush the buffer on `pagehide`/`visibilitychange`. + +### 18.3 Backend / API / server-side tracking + +- Fire revenue and state-change events from the server where the source of truth lives. +- Pass through the `anonymous_id`/`user_id` from the client so server events stitch to the same user. +- Use the server clock for `timestamp_server`, but still respect client `event_id` for dedup if relaying client events. 
+ +```javascript +// Server-side critical event (Node-style pseudocode) +app.post("/api/checkout/complete", async (req, res) => { + const order = await completeOrder(req.body); + analytics.track({ + userId: req.user.id, // authoritative identity + event: "order_completed", + properties: { + order_id: order.id, + value: order.total, // trusted, from your DB + currency: order.currency, + item_count: order.items.length + } + }); + res.json({ ok: true }); +}); +``` + +### 18.4 Mobile + +- Handle background/foreground for sessions ([Section 10.4](#104-background-app-behavior-mobile)). +- Persist the queue to disk for offline support. +- Respect OS-level tracking permissions (e.g. App Tracking Transparency). + +### 18.5 Offline tracking + +- Persist unsent events durably; upload on reconnect. +- Keep **client timestamps** so late uploads land on the correct day ([Section 16.3](#163-late-events-and-reprocessing)). + +### 18.6 Feature flags and experiments + +- When running an A/B test, attach the **variant** the user saw as a property (or user property): `experiment_id`, `variant`. +- This lets you slice every downstream metric by variant, which is the entire point of experimentation. + +```json +{ + "event": "checkout_completed", + "properties": { "value": 49.0, "currency": "USD" }, + "user_properties": { + "experiment_one_step_checkout": "variant_b" + } +} +``` + +> 💡 **Pro Tip:** Record the experiment exposure (`experiment_viewed`) as its own event, not just as a property on downstream events. You need to know who was *exposed* to a variant, including those who did not convert, to compute the experiment correctly. + +> ⚠️ **Common Pitfall:** Tracking only client-side and then being surprised that ~10-30% of money events are missing (ad-blockers, privacy browsers, network drops). For revenue, track server-side. The finance team will not accept "ad-blockers ate our revenue numbers." + +--- + +## 19. 
Debugging Analytics + +When a number looks wrong, debug systematically along the [lifecycle](#16-the-analytics-lifecycle): is the event generated, sent, received, processed, stored, and shown correctly? + +### 19.1 Browser DevTools: the Network tab + +The fastest first check. Filter the Network tab for your analytics endpoint (e.g. requests to `/v1/track`, `/v1/batch`, `collect`, or your tool's domain) and inspect: + +- Is a request fired at all when you perform the action? (No request → event not generated or SDK not loaded.) +- What is the **payload**? Confirm the event name, properties, `user_id`, `session_id` are correct. +- What is the **response status**? `200` is good; `4xx` means a malformed/unauthorized request; nothing means it was blocked or never sent. + +```mermaid +flowchart TD + A[Perform the action] --> B{Network request
to analytics endpoint?} + B -- No --> C[Event not generated
or SDK not initialized
or ad-blocker] + B -- Yes --> D{Payload correct?
name, props, ids} + D -- No --> E[Instrumentation bug
fix the track call] + D -- Yes --> F{Response 2xx?} + F -- No --> G[Server rejected:
auth/schema/rate limit] + F -- Yes --> H{Visible in tool
after processing delay?} + H -- No --> I[Filtering, dedup,
identity, or timezone issue] + H -- Yes --> J[✅ Working] + + style J fill:#e8f5e9,stroke:#388e3c + style C fill:#ffebee,stroke:#c62828 +``` + +### 19.2 SDK debug mode + +Most SDKs have a debug/verbose mode that logs every captured event to the console before sending: + +```javascript +analytics.init(WRITE_KEY, { debug: true }); +// Console will print each enriched event payload as it is tracked. +``` + +This shows you the *enriched* event (with IDs, session, UTMs) so you can confirm the SDK attached everything correctly ([Section 4](#4-event-lifecycle)). + +### 19.3 Server logs + +For server-side events, log the outbound analytics call (event name, user, key properties) at a debug level. This confirms the backend actually fired the event and with what payload, independent of any UI. + +### 19.4 Common implementation mistakes + +| Symptom | Likely cause | +|---|---| +| Event never appears | SDK not initialized; ad-blocker; wrong write key; event name typo | +| Event appears twice | Handler bound twice; retry without idempotency; React strict-mode double-invoke in dev | +| Right count, wrong user | `identify` not called, or called with the wrong/unstable ID | +| UTMs missing | Not persisted at landing; lost on redirect; stripped by SPA routing | +| Numbers off by hours | Timezone mixing (UTC vs local) | +| Last action before navigation missing | Buffer not flushed on unload | +| Properties are `null`/"(not set)" | Property omitted or set after the event fired (race condition) | + +### 19.5 Validation checklist (per event) + +- [ ] A network request fires exactly once per user action. +- [ ] Event name matches the tracking plan (correct casing, snake_case). +- [ ] All required properties are present and correctly typed. +- [ ] `user_id` (if logged in) and `session_id` are attached. +- [ ] UTMs/campaign are present when arriving from a tagged link. +- [ ] Response is `2xx`. +- [ ] The event shows up in the analytics tool after the processing delay. 
+- [ ] No duplicate appears for a single action. + +> 💡 **Pro Tip:** Reproduce in an **incognito window with extensions disabled** to rule out ad-blockers and stale cookies. A surprising share of "analytics is broken" reports are a blocker on the developer's own machine. + +> ⚠️ **Common Pitfall:** Concluding "the event is broken" because it is not in the dashboard *yet*. Most tools have a processing delay (seconds to hours). Confirm the **request succeeded in the Network tab** first; that proves the client side works regardless of dashboard lag. + +--- + +## 20. Real Project Examples + +End-to-end walkthroughs showing every event, its properties, user properties, session, and attribution, from first touch to dashboard. Use these as templates. + +### 20.1 Example A: SaaS signup from Google Ads to paid subscription + +**Journey:** User clicks a Google Ad → lands → signs up → verifies email → buys a subscription. + +``` +Google Ads click → Landing → Signup → Email verified → Subscription purchased +``` + +```mermaid +flowchart LR + A[Google Ad click
utm_source=google] --> B[Landing page] + B --> C[signup_completed] + C --> D[email_verified] + D --> E[subscription_purchased
💰] + + style A fill:#e3f2fd,stroke:#1976d2 + style E fill:#e8f5e9,stroke:#388e3c +``` + +**Events fired, in order:** + +```json +// 1. Landing (anonymous). UTMs captured and persisted here. +{ + "event": "page_viewed", + "anonymous_id": "d-7af3", + "session_id": "s-001", + "properties": { "path": "/promo/summer" }, + "context": { "campaign": { "source": "google", "medium": "cpc", "name": "summer_sale", "term": "project_tool" } } +} + +// 2. Signup completed -> identify + alias here +{ + "event": "signup_completed", + "anonymous_id": "d-7af3", + "user_id": "usr_5501", + "session_id": "s-001", + "properties": { "method": "email" } +} + +// 3. Email verified +{ + "event": "email_verified", + "user_id": "usr_5501", + "session_id": "s-001" +} + +// 4. Subscription purchased (fired SERVER-SIDE for reliability) +{ + "event": "subscription_purchased", + "user_id": "usr_5501", + "properties": { "plan": "pro", "billing_period": "monthly", "value": 29.0, "currency": "USD", "is_trial_conversion": false } +} +``` + +**User properties after this journey:** + +```json +{ + "user_id": "usr_5501", + "plan": "pro", + "signup_date": "2026-06-29", + "acquisition_source": "google", + "acquisition_medium": "cpc", + "first_touch_campaign": "summer_sale" +} +``` + +| Aspect | Value | +|---|---| +| Session | One session `s-001` spans landing → purchase (within 30 min) | +| First touch | `google / cpc / summer_sale` | +| Last touch | same (single-session journey) | +| Attribution (last-touch) | 100% to Google CPC | +| Dashboard view | Acquisition funnel: Landing → Signup → Verified → Purchased, with conversion % per step and revenue by source | + +### 20.2 Example B: E-commerce checkout + +**Journey:** Browse → add to cart → checkout → pay → order complete. 
+ +```json +// Events (key ones) +{ "event": "product_viewed", "properties": { "product_id": "sku_42", "price": 49.0, "currency": "USD" } } +{ "event": "product_added_to_cart", "properties": { "product_id": "sku_42", "quantity": 2, "cart_value": 98.0, "currency": "USD" } } +{ "event": "checkout_started", "properties": { "cart_value": 98.0, "item_count": 2, "currency": "USD" } } +{ "event": "payment_submitted","properties": { "payment_method": "card", "cart_value": 98.0, "currency": "USD" } } +{ "event": "order_completed", "properties": { "order_id": "ord_771", "value": 98.0, "currency": "USD", "item_count": 2 } } // SERVER-SIDE +``` + +| Aspect | Value | +|---|---| +| Funnel | `product_viewed → product_added_to_cart → checkout_started → payment_submitted → order_completed` | +| Critical event | `order_completed` fired server-side (revenue must not be lost) | +| Key properties | `value` + `currency` together; `item_count`; `payment_method` | +| Dashboard | Checkout funnel with drop-off per step; revenue and AOV (average order value = revenue / orders) | + +> 🧪 **Average Order Value:** if 3,600 orders produced 352,800 USD, AOV = 352,800 / 3,600 = **98 USD**. + +### 20.3 Example C: Mobile app onboarding + +**Journey:** Install → first open → onboarding steps → activation. 
+ +```json +{ "event": "app_installed" } // attributed to install source +{ "event": "app_opened", "properties": { "is_first_open": true } } +{ "event": "onboarding_step_viewed", "properties": { "step": 1, "step_name": "welcome" } } +{ "event": "onboarding_step_viewed", "properties": { "step": 2, "step_name": "connect_account" } } +{ "event": "onboarding_completed", "properties": { "steps_total": 3, "duration_seconds": 74 } } +{ "event": "first_project_created" } // the activation event +``` + +| Aspect | Value | +|---|---| +| Session | First session begins at `app_opened`; background handling matters ([Section 10.4](#104-background-app-behavior-mobile)) | +| Activation | `first_project_created` is the behavioral activation event ([Section 12.3](#123-behavioral-cohorts)) | +| Funnel | Onboarding step funnel reveals which step loses users | +| Dashboard | Onboarding completion rate; activation rate; D1/D7 retention split by "completed onboarding" cohort | + +### 20.4 Example D: Subscription renewal (and churn) + +**Journey:** Active subscription → renewal attempt → success or failure → possible churn. 
+ +```json +{ "event": "subscription_renewal_attempted", "user_id": "usr_5501", "properties": { "plan": "pro", "value": 29.0, "currency": "USD" } } // SERVER-SIDE +{ "event": "subscription_renewed", "user_id": "usr_5501", "properties": { "plan": "pro", "value": 29.0, "currency": "USD", "renewal_count": 6 } } +// OR, on failure: +{ "event": "subscription_payment_failed", "user_id": "usr_5501", "properties": { "plan": "pro", "reason": "card_declined" } } +{ "event": "subscription_canceled", "user_id": "usr_5501", "properties": { "plan": "pro", "reason": "voluntary", "tenure_months": 6 } } +``` + +| Aspect | Value | +|---|---| +| Source of truth | All renewal/billing events fired **server-side** | +| Key metrics | MRR, churn rate (`canceled / active_at_start`), renewal success rate | +| User property updates | `plan` → `canceled`, `churn_date` set, `lifetime_value` updated | +| Dashboard | MRR movement (new, expansion, churned), churn cohort analysis, failed-payment recovery funnel | + +> 💡 **Pro Tip:** Distinguish **voluntary churn** (user chose to cancel) from **involuntary churn** (payment failed). They have completely different fixes: voluntary churn is a product/value problem; involuntary churn is a payments/dunning problem. The `reason` property is what lets you tell them apart. + +--- + +## 21. Our Analytics Standards + +> 📌 **This section is intentionally a template.** Everything below is a placeholder for *your organization's* concrete decisions. Replace each `TODO` with your real standard. Keep this section self-contained so teams can update it without touching the rest of the handbook. + +> **TODO:** Replace this entire section with your organization's analytics implementation details. + +### 21.1 Platforms + +> **TODO:** List the analytics platform(s) in use. + +| Purpose | Tool | Owner | +|---|---|---| +| Product analytics | `TODO (e.g. Amplitude / Mixpanel / PostHog)` | `TODO` | +| Web/marketing analytics | `TODO (e.g. 
GA4)` | `TODO` | +| CDP / data routing | `TODO (e.g. Segment)` | `TODO` | +| Data warehouse | `TODO (e.g. BigQuery / Snowflake)` | `TODO` | + +### 21.2 Conventions and core decisions + +| Decision | Our standard | +|---|---| +| **Event naming convention** | `TODO (e.g. snake_case, object_action, past tense)` | +| **Attribution model** | `TODO (e.g. position-based; data-driven in GA4)` | +| **Session timeout** | `TODO (e.g. 30 minutes)` | +| **Identity strategy** | `TODO (identify on login; alias once at signup)` | +| **User ID strategy** | `TODO (opaque internal id, never email)` | +| **Required event properties (global)** | `TODO (e.g. app_version, platform, session_id)` | +| **Client vs server split** | `TODO (UI client-side; revenue server-side)` | + +### 21.3 Ownership and governance + +> **TODO:** Define who owns events and how changes are reviewed. + +- Event ownership model: `TODO` +- Tracking plan location: `TODO (link)` +- Review/approval process for new events: `TODO` + +### 21.4 Links and resources + +> **TODO:** Fill in internal links. + +- Dashboards: `TODO (links)` +- Tracking plan / event catalog: `TODO (link)` +- Internal analytics docs: `TODO (link)` +- Data warehouse access: `TODO (link)` +- ETL/ELT process docs: `TODO (link)` + +### 21.5 QA checklist (ours) + +> **TODO:** Replace with your organization's required QA gates. + +- [ ] `TODO` +- [ ] `TODO` +- [ ] `TODO` + +--- + +## 22. Developer Checklist + +Use this before shipping any analytics change. (Your team's version lives in [Section 21.5](#215-qa-checklist-ours).) + +### 22.1 Before shipping a new event + +- [ ] **Event exists** in the tracking plan (or has been added and reviewed). +- [ ] **Naming reviewed** against convention (snake_case, object_action, past tense, no reserved names). +- [ ] **Required properties added** and correctly typed. +- [ ] **No PII** in the event name or properties. +- [ ] **No unbounded high-cardinality property** used as a reporting dimension. 
+- [ ] **UTM/attribution tested** if this event is part of an acquisition flow. +- [ ] **Identity verified** (`user_id` attached when logged in; anonymous handled). +- [ ] **Session attached** and behaves correctly across navigation/background. +- [ ] **Debug verified** (SDK debug mode shows the enriched payload; Network tab shows a 2xx). +- [ ] **Dashboard verified** (the event appears and populates the intended chart/funnel). +- [ ] **Critical/revenue events fire server-side**, not only client-side. +- [ ] **Owner assigned** and documented. +- [ ] **QA approved.** + +### 22.2 Before shipping an experiment + +- [ ] Exposure event (`experiment_viewed`) fires for everyone bucketed. +- [ ] `experiment_id` and `variant` attached to downstream metrics. +- [ ] Both variants verified end-to-end. + +### 22.3 Before deprecating an event + +- [ ] Marked deprecated in the tracking plan with date + replacement. +- [ ] Downstream dashboards/queries migrated. +- [ ] Grace period observed before removal. + +--- + +## 23. FAQ + +Practical answers to the questions developers ask most. + +**1. What is the difference between First Touch and First Visit?** +First *touch* is the first marketing interaction that earned the user's attention (often an ad click/impression). First *visit* is the first actual session on your property. They usually coincide but not always: a podcast ad (touch) can precede a later direct visit. See [Section 2.4](#24-timing-concepts). + +**2. What is the difference between a Session and a User?** +A user is a person (identified by `user_id`). A session is a single visit by that person. One user has many sessions over their lifetime. See [Sections 2.1](#21-user-concepts) and [10](#10-sessions). + +**3. Why are events duplicated?** +Usually a retry that succeeded but whose acknowledgement was lost, so the client retried again, or a handler bound twice (including React strict-mode double-invocation in dev). 
Fix with a client-generated `event_id` and server-side deduplication. See [Section 3.6](#36-deduplication-and-event-ids). + +**4. Why is my attribution changing for past periods?** +If you use data-driven attribution, the model retrains and redistributes credit. Late-arriving touches and conversion windows also shift historical credit. This is expected behavior, not a bug. See [Section 5.7](#57-data-driven-attribution). + +**5. Why isn't my event visible in the dashboard?** +Check, in order: did a network request fire (DevTools)? Was the response 2xx? Has the processing delay elapsed? Is a filter/segment hiding it? Is identity/timezone misrouting it? See [Section 19](#19-debugging-analytics). + +**6. When should I use a user property vs an event property?** +User property = a fact about the *person* (plan, signup date), mutable, reflects latest value. Event property = context of *this occurrence* (cart value on this purchase), frozen. Ask "is this about the clicker or the click?" See [Section 2.7](#27-event-concepts). + +**7. What should never be tracked?** +PII (emails, names, phone, full address, payment numbers, precise location), unless you have a deliberate, compliant reason and controls in place; passwords and secrets, under any circumstances; and unbounded values used as reporting dimensions. See [Sections 8.5](#85-avoid-pii-in-events) and [17](#17-privacy-and-compliance). + +**8. Should I track on the client or the server?** +Client for UI/engagement, server for anything you cannot lose (revenue, state changes). Hybrid is the norm. See [Section 18.1](#181-client-side-vs-server-side-tracking). + +**9. Why is my user counted as "new" again?** +Their `device_id` likely reset (cleared storage, private mode, new device) and they were not identified, so stitching could not link them. See [Section 9](#9-user-identification). + +**10. Why do two tools report different session counts?** +They define sessions differently (timeout, campaign-restart, bot filtering, consent). A 5-10% gap is normal. 
Compare trends within one tool. See [Section 6](#6-industry-standards). + +**11. What session timeout should I use?** +30 minutes unless you have a specific reason. Shorter for high-frequency utilities, longer for long-form content. See [Section 10.3](#103-custom-timeout). + +**12. How do I track UTMs in a single-page app?** +Read UTMs from the landing URL on first load and persist them immediately; SPA route changes drop the query string, so you must capture before navigation. Fire `page` manually on route changes. See [Sections 2.5](#25-traffic-concepts-utm-parameters) and [2.3](#23-page-concepts). + +**13. Why are my UTMs missing after a redirect?** +Redirects (auth, link shorteners) often strip query parameters. Capture UTMs before the redirect, or carry them through it. See [Section 2.5](#25-traffic-concepts-utm-parameters). + +**14. What is a bounce, exactly?** +Classically a single-page session with no interaction. Modern tools (GA4) define it as a non-engaged session (under 10s, no conversion, fewer than 2 views). See [Section 2.2](#22-session-concepts). + +**15. What is the difference between classic and rolling retention?** +Classic = active on exactly day N; rolling = active on day N or later. Rolling is more forgiving and suits infrequent-use products. See [Section 13.3](#133-classic-vs-rolling-retention). + +**16. What does DAU/MAU tell me?** +Stickiness: roughly the share of days in a month on which the average user shows up (a 20% ratio means about 6 days out of 30). 20%+ is solid for daily-habit products. See [Section 13.4](#134-stickiness-dau-wau-mau-and-the-ratio). + +**17. What is an activation event?** +The earliest user action most correlated with long-term retention (e.g. "create first project"). Find it with behavioral cohorts, then drive onboarding toward it. See [Section 12.3](#123-behavioral-cohorts). + +**18. How many properties should an event have?** +Enough to answer your questions, typically 5-15 for a rich event. If it has 40, it is doing too much. See [Section 8.2](#82-how-many-properties). 
+ +**19. What is high cardinality and why avoid it?** +Cardinality = number of distinct values. Unbounded values (raw queries, full URLs) make terrible chart dimensions. Store raw if needed, but group by a normalized low-cardinality field. See [Section 8.4](#84-avoid-high-cardinality-properties). + +**20. Why use snake_case?** +It avoids casing fragmentation (`videoPlayed` vs `VideoPlayed`), is SQL/URL friendly, and reads consistently. The real rule is consistency; pick one and enforce it. See [Section 7.2](#72-snake_case). + +**21. Should I version event names?** +Prefer adding properties or a `schema_version` over renaming. Only version the name when meaning fundamentally changed. See [Section 7.5](#75-versioning). + +**22. Client timestamp or server timestamp, which is correct?** +Client is closer to the real action but trusts the device clock; server is reliable but delayed by offline/queueing. Record both and correct for skew. See [Section 16.2](#162-timezones-and-clock-skew-in-depth). + +**23. Why did yesterday's numbers change today?** +Late-arriving (offline) events landed on yesterday's event-time day, and reprocessing updated it. Daily numbers are final only after the lookback window. See [Section 16.3](#163-late-events-and-reprocessing). + +**24. What timezone should I store timestamps in?** +Always UTC; convert to local only at display. See [Section 16.2](#162-timezones-and-clock-skew-in-depth). + +**25. What is identity stitching?** +Linking anonymous pre-signup behavior to the `user_id` after `identify`, so the whole journey is one person. See [Section 9.2](#92-identity-stitching). + +**26. What is aliasing and when do I call it?** +Aliasing merges an anonymous ID with a new `user_id`, typically once at signup. Calling it repeatedly or on shared devices can wrongly merge people. See [Section 9.3](#93-aliasing). + +**27. 
How does cross-device tracking work?** +Each device must be identified with the same `user_id`; anonymous-only devices cannot be stitched. See [Section 9.4](#94-cross-device-tracking). + +**28. Should I use email as the user ID?** +No. Use an opaque, immutable internal ID. Emails change, are PII, and are shared. See [Section 9.5](#95-identity-model-summary). + +**29. What is the difference between an exit page and a bounce?** +Exit page = the last page of *any* session. Bounce = a single-page session with no engagement. See [Section 2.3](#23-page-concepts). + +**30. Why is my last event before navigation missing?** +The buffer was not flushed before the page unloaded. Flush on `pagehide`/`visibilitychange`. See [Section 3.3](#33-buffering-and-batch-uploads). + +**31. Why are ad-blockers eating my events?** +Client-side analytics requests are commonly blocked. Track critical events server-side to bypass this. See [Section 18.1](#181-client-side-vs-server-side-tracking). + +**32. How do I track an A/B test correctly?** +Fire an exposure event for everyone bucketed, and attach `experiment_id` + `variant` to downstream metrics. See [Section 18.6](#186-feature-flags-and-experiments). + +**33. What is the difference between ROAS and ROI?** +ROAS is gross revenue per ad dollar (ignores costs); ROI is net return after costs. A high ROAS can still be unprofitable. See [Section 14.2](#142-business-and-revenue-metrics). + +**34. What is a good LTV:CAC ratio?** +A common SaaS rule of thumb is 3:1 or higher. Below 1:1 you lose money per customer. See [Section 14.2](#142-business-and-revenue-metrics). + +**35. What is the difference between MRR and ARR?** +MRR is monthly recurring revenue; ARR is MRR × 12. See [Section 14.2](#142-business-and-revenue-metrics). + +**36. What is the difference between a static and a dynamic cohort?** +Static = fixed membership frozen at creation; dynamic = rule-based, recomputed continuously. See [Section 12](#12-cohorts). + +**37. 
What is a conversion window and why does it matter?** +The time allowed to complete a funnel/goal. Too short undercounts real conversions; too long over-attributes. Match it to your buying cycle. See [Section 11.3](#113-funnel-analysis-nuances). + +**38. Strict vs relaxed funnel?** +Strict requires steps in exact order with nothing between; relaxed allows other events between steps. Choose deliberately. See [Section 11.3](#113-funnel-analysis-nuances). + +**39. Why is "active user" so important to define carefully?** +Because DAU/MAU and retention all depend on it. "Opened the app" measures notifications, not value; pick a meaningful action. See [Sections 2.1](#21-user-concepts) and [13.4](#134-stickiness-dau-wau-mau-and-the-ratio). + +**40. What is the difference between voluntary and involuntary churn?** +Voluntary = user chose to cancel (a value problem); involuntary = payment failed (a dunning problem). Different fixes. See [Section 20.4](#204-example-d-subscription-renewal-and-churn). + +**41. Do I need consent before tracking?** +In the EU, generally yes for cookie/identifier-based tracking. Respect the CMP's recorded choice. See [Section 17.2](#172-cookie-consent-and-consent-management). + +**42. How long should I keep analytics data?** +Per your retention policy and law; raw identified data is often capped at 14-26 months, aggregates can be kept longer. See [Section 17.4](#174-retention-policies). + +**43. How do I handle a user's deletion request?** +Delete by `user_id` across all stores and downstream tools. This is feasible only if you kept PII out and keyed everything by `user_id`. See [Section 17.5](#175-honoring-user-rights). + +**44. Why should raw events be immutable?** +So you can reprocess after fixing a transformation bug. "Raw is sacred." See [Section 3.7](#37-from-storage-to-warehouse-to-dashboard). + +**45. How do I prevent naming chaos as the team grows?** +Governance: a reviewed tracking plan treated as code, with owners per domain. 
See [Section 15](#15-event-taxonomy). + +**46. One event with a property, or many events?** +Prefer one event with a discriminating property (`button_clicked` + `button_name`) over many near-identical events, without over-collapsing genuinely distinct actions. See [Section 7.7](#77-prefixes). + +**47. What is the single most valuable monitoring alert?** +"Event volume dropped sharply." It catches broken releases before stakeholders notice. See [Section 16.4](#164-schema-validation-and-monitoring). + +**48. Why do I see "(not set)" / null in reports?** +A property was either omitted or set only after the event had already fired (a race condition). Enforce required properties and set context before tracking. See [Section 16](#16-data-quality). + +**49. How do I debug "analytics is broken" fastest?** +Open DevTools Network in incognito (extensions off), perform the action, and inspect the request and response. See [Section 19](#19-debugging-analytics). + +**50. Where do I document our team's specific choices?** +In [Section 21, "Our Analytics Standards,"](#21-our-analytics-standards) which is the customizable template for exactly this. + +--- + +## 24. 
Appendix + +### 24.1 Complete glossary (A-Z) + +| Term | Short definition | +|---|---| +| **AARRR** | Pirate Metrics: Acquisition, Activation, Retention, Revenue, Referral | +| **Active user** | User who took a meaningful action in a time window | +| **Activation** | A user's first valuable experience ("aha moment") | +| **Acquisition** | Getting users to your product | +| **Alias** | Explicitly merge two identities into one | +| **Anonymous ID** | Identifier for an unidentified user/device | +| **ARPU** | Average Revenue Per User | +| **ARR** | Annual Recurring Revenue (MRR × 12) | +| **Attribution** | Assigning conversion credit to touchpoints | +| **Bounce** | Single-page, non-engaged session | +| **CAC** | Customer Acquisition Cost | +| **Cardinality** | Number of distinct values a property can take | +| **Client timestamp** | Time the device claims an event occurred | +| **Cohort** | Group of users sharing a characteristic | +| **Conversion rate** | Conversions ÷ users (or sessions) | +| **CTR** | Click-Through Rate | +| **DAU / WAU / MAU** | Daily / Weekly / Monthly Active Users | +| **Data-driven attribution** | ML-assigned credit based on incremental contribution | +| **Deduplication** | Dropping duplicate events by ID | +| **Device ID** | Per-device/browser identifier | +| **Engaged session** | Session with meaningful interaction | +| **Event** | A record that something happened | +| **Event ID** | Unique idempotency key per event | +| **Event property** | Context of a single event occurrence | +| **Exit page** | Last page of a session | +| **First touch** | First marketing interaction ever | +| **First visit** | First session ever | +| **Funnel** | Ordered steps toward a goal, with drop-off | +| **Identify** | Link a device to a known `user_id` | +| **Identity stitching** | Connecting anonymous and known identities | +| **Landing page** | First page of a session | +| **Last touch** | Most recent interaction before conversion | +| **Linear attribution** | 
Equal credit to all touchpoints | +| **LTV** | Lifetime Value | +| **MRR** | Monthly Recurring Revenue | +| **Page view** | A web page was viewed | +| **PII** | Personally Identifiable Information | +| **Position-based attribution** | 40/20/40 first/middle/last credit | +| **Referrer** | Source URL that sent the user | +| **Retention** | Whether users return over time | +| **ROAS** | Return on Ad Spend | +| **ROI** | Return on Investment | +| **Rolling retention** | Active on day N or later | +| **Screen view** | Mobile equivalent of a page view | +| **Server timestamp** | Time the server received the event | +| **Session** | A single visit (events grouped in time) | +| **Session timeout** | Inactivity period that ends a session (default 30 min) | +| **Stickiness** | DAU ÷ MAU | +| **Time-decay attribution** | More credit to recent touches | +| **Time on page** | Duration spent on a page | +| **User ID** | Stable internal identifier for a person | +| **User property** | An attribute of the person | +| **UTM** | URL tags identifying traffic source/medium/campaign | +| **Visitor** | A unique device/browser that visited | + +### 24.2 Reference tables + +**Attribution models at a glance:** + +| Model | Credit rule | Best for | +|---|---|---| +| First-touch | 100% first | Awareness/discovery | +| Last-touch | 100% last | Short cycles | +| Linear | Equal | Long multi-touch journeys | +| Position-based | 40/20/40 | Balanced default | +| Time-decay | Recent-weighted | Building-momentum cycles | +| Data-driven | ML-learned | High-volume mature teams | + +**Active-user windows:** + +| Metric | Window | +|---|---| +| DAU | 1 day | +| WAU | 7 days | +| MAU | 28-30 days | + +### 24.3 Event property examples + +| Event | Required properties | Recommended properties | +|---|---|---| +| `signup_completed` | `method` | `referral_source` | +| `checkout_started` | `cart_value`, `currency` | `item_count`, `coupon_code` | +| `order_completed` | `order_id`, `value`, `currency` | 
`item_count`, `payment_method` | +| `subscription_purchased` | `plan`, `value`, `currency` | `billing_period`, `is_trial_conversion` | +| `video_played` | `video_id` | `position_seconds`, `quality` | + +### 24.4 Sample JSON payloads + +Minimal: + +```json +{ "event": "button_clicked", "properties": { "button_name": "Checkout", "page": "Cart" } } +``` + +Identify call: + +```json +{ + "type": "identify", + "user_id": "usr_8c1e0b", + "traits": { "plan": "pro", "signup_date": "2026-06-01", "company_size": "50-100" } +} +``` + +Batched upload: + +```json +{ + "batch": [ + { "event": "page_viewed", "event_id": "evt_1", "properties": { "path": "/home" } }, + { "event": "search_performed", "event_id": "evt_2", "properties": { "results_count": 12 } } + ], + "sent_at": "2026-06-29T10:13:00Z" +} +``` + +### 24.5 Common SQL patterns + +```sql +-- Daily Active Users for each of the last 30 days (meaningful action = any tracked event that day) +SELECT DATE(event_time) AS day, COUNT(DISTINCT user_id) AS dau +FROM events +WHERE event_time >= CURRENT_DATE - INTERVAL '30 days' +GROUP BY day +ORDER BY day; + +-- Simple classic D7 retention by first-seen cohort (join_day = date of each user's first event) +WITH cohort AS ( + SELECT user_id, DATE(MIN(event_time)) AS join_day + FROM events GROUP BY user_id +) +SELECT c.join_day, + COUNT(DISTINCT c.user_id) AS cohort_size, + COUNT(DISTINCT CASE + WHEN DATE(e.event_time) = c.join_day + INTERVAL '7 days' + THEN e.user_id END) AS retained_d7 +FROM cohort c +LEFT JOIN events e ON e.user_id = c.user_id +GROUP BY c.join_day +ORDER BY c.join_day; +``` + +### 24.6 Recommended reading + +- *Lean Analytics* - Croll & Yoskovitz (which metrics matter at each stage). +- *Trustworthy Online Controlled Experiments* - Kohavi, Tang & Xu (experimentation done right). +- Segment Spec documentation (the canonical `track`/`identify`/`page`/`group`/`alias` model). +- Amplitude and Mixpanel guides on event taxonomy and behavioral cohorts. +- GA4 documentation on sessions, events, and attribution. 
+- Dave McClure's original "Startup Metrics for Pirates" (AARRR). + +### 24.7 Implementation checklist (condensed) + +- [ ] Tracking plan exists and is the source of truth. +- [ ] Naming convention chosen and enforced in a wrapper. +- [ ] Global required properties on every event. +- [ ] Identity strategy (`identify`/`alias`, opaque `user_id`) implemented. +- [ ] Sessions configured (timeout, mobile background handling). +- [ ] UTMs captured and persisted at landing. +- [ ] Critical/revenue events fire server-side. +- [ ] Dedup via client `event_id`. +- [ ] Timestamps in UTC; both client and server recorded. +- [ ] Consent respected before tracking where required. +- [ ] PII kept out of analytics. +- [ ] Monitoring/alerts on volume, schema, freshness. +- [ ] Dashboards verified against real events. + +### 24.8 Common naming conventions (quick reference) + +| Rule | Example | +|---|---| +| snake_case | `checkout_started` | +| object_action, past tense | `order_completed`, `video_played` | +| One event + property over many events | `button_clicked` + `button_name` | +| Properties also snake_case | `cart_value`, `item_count` | +| No reserved prefixes | avoid `$`, `ga_`, `firebase_`, `mp_` | +| Value always with currency | `value` + `currency` | + +--- + +*End of the Analytics Reference Handbook. Keep [Section 21](#21-our-analytics-standards) up to date with your organization's specific choices; everything else is intended to be stable, general guidance.* diff --git a/README.md b/README.md index 7959ce4..761c92b 100644 --- a/README.md +++ b/README.md @@ -1,69 +1,116 @@ -# React + TypeScript + Vite - -This template provides a minimal setup to get React working in Vite with HMR and some ESLint rules. 
- -Currently, two official plugins are available: - -- [@vitejs/plugin-react](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react) uses [Babel](https://babeljs.io/) for Fast Refresh -- [@vitejs/plugin-react-swc](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react-swc) uses [SWC](https://swc.rs/) for Fast Refresh - -## Expanding the ESLint configuration - -If you are developing a production application, we recommend updating the configuration to enable type-aware lint rules: - -```js -export default tseslint.config([ - globalIgnores(['dist']), - { - files: ['**/*.{ts,tsx}'], - extends: [ - // Other configs... - - // Remove tseslint.configs.recommended and replace with this - ...tseslint.configs.recommendedTypeChecked, - // Alternatively, use this for stricter rules - ...tseslint.configs.strictTypeChecked, - // Optionally, add this for stylistic rules - ...tseslint.configs.stylisticTypeChecked, - - // Other configs... - ], - languageOptions: { - parserOptions: { - project: ['./tsconfig.node.json', './tsconfig.app.json'], - tsconfigRootDir: import.meta.dirname, - }, - // other options... - }, - }, -]) +# analytics_dashboard — Analytics Read Layer & UI + +The **read half** of the Medblocks analytics stack: an Express API over Postgres plus a +React/Vite dashboard that visualizes the funnels. It reads the data and SQL functions produced +by [`analytics_script_2`](../analytics_script_2). + +> **Big picture:** see [`../ANALYTICS_MASTER_FLOW.md`](../ANALYTICS_MASTER_FLOW.md) for how this +> repo and the scraper fit together. + +This repo **only reads** from Postgres; the scraper **only writes**. Postgres is the contract. 
+ +--- + +## Architecture + +``` +Postgres ──► server/index.js (Express API) ──► src/components/*Tab.tsx (React + Recharts) + (funnels) ~16 /api/* endpoints one tab per channel ``` -You can also install [eslint-plugin-react-x](https://github.com/Rel1cx/eslint-react/tree/main/packages/plugins/eslint-plugin-react-x) and [eslint-plugin-react-dom](https://github.com/Rel1cx/eslint-react/tree/main/packages/plugins/eslint-plugin-react-dom) for React-specific lint rules: - -```js -// eslint.config.js -import reactX from 'eslint-plugin-react-x' -import reactDom from 'eslint-plugin-react-dom' - -export default tseslint.config([ - globalIgnores(['dist']), - { - files: ['**/*.{ts,tsx}'], - extends: [ - // Other configs... - // Enable lint rules for React - reactX.configs['recommended-typescript'], - // Enable lint rules for React DOM - reactDom.configs.recommended, - ], - languageOptions: { - parserOptions: { - project: ['./tsconfig.node.json', './tsconfig.app.json'], - tsconfigRootDir: import.meta.dirname, - }, - // other options... - }, - }, -]) +- **API:** [`server/index.js`](server/index.js), a single Express 5 server. The channel + endpoints build on the inline `ATTRIBUTED_SIGNUPS_CTES` (signup attribution computed in SQL); + `/api/brevo` still uses the scraper's `f_brevo_funnel`; the raw and exploratory endpoints query + `umami_*` tables directly. +- **UI:** [`src/`](src) — React 18 + Vite. One `*Tab.tsx` per channel, all driven by the shared + [`useDateRange`](src/shared/hooks/useDateRange.ts) and + [`useFetchData`](src/shared/hooks/useFetchData.ts) hooks. 
+ +### Key API endpoints (`server/index.js`) + +| Endpoint | Purpose | +|----------|---------| +| `/api/totals`, `/api/total-users`, `/api/user-growth` | Top-level KPIs (LinkedIn / YouTube / Google / Google Ads / Other) and the all-time growth chart | +| `/api/google` | Organic Google search landing-page funnel | +| `/api/google-ads` | Paid Google funnel (`utm_source=google`, cpc / demand_gen), kept separate from organic | +| `/api/source-growth?source=` | Per-channel 30-day growth (daily bars + all-time cumulative), for the channel tabs | +| `/api/search-queries`, `/api/keywords` | Google Search Console queries and keyword editor | +| `/api/brevo` | Email campaign funnel (`f_brevo_funnel`) | +| `/api/linkedin-raw`, `/api/youtube-raw`, `/api/youtube-rankings` | LinkedIn / YouTube funnels and rankings | +| `/api/umami-raw` | Raw traffic bucketed by `utm_source` / `referrer_domain` | +| `/api/other`, `/api/contact-us` | Other-source drill-down and contact-form data | + +### UI tabs (`src/components`) + +`OverviewTab`, `GoogleTab`, `GoogleAdsTab`, `BrevoTab`, `LinkedInRawTab`, `YouTubeRawTab`, +`YTSearchRankingTab`, `SearchQueriesTab`, `RawUmamiTab`, `ContactUsTab`, `OtherTab`. + +--- + +## Source attribution (read carefully) + +Signups are attributed with a **first-touch** model. A signup's source is resolved in priority +order, then mapped to a channel: + +1. **First-touch cookie** carried on the SignUp event (`first_utm_*` / `first_referrer_url`, + written by the website from its `mb_utm_data` cookie). +2. **`session_any_source`**: the earliest source-bearing event of the converting session (any + event type), skipping sourceless events and auth-callback / internal referrers. This recovers + the real source when it sits on a later pageview or a custom event, not the first pageview. +3. **First pageview** of the converting session (last-resort fallback). + +A signup is **"Direct"** only when none of the above carries a usable source. 
+ +**Channel mapping (`SOURCE_CASE_SQL` in `server/index.js`).** The resolved `utm_source` / +`utm_medium` / `referrer_domain` map to a channel: + +- **LinkedIn / YouTube**: `utm_source`, `utm_medium`, or the platform's referrer domains. +- **Google Ads**: `utm_source=google` (paid: cpc / demand_gen), checked **before** organic so a + paid click stays paid even when its referrer is `google.com`. +- **Google**: organic search referrers (`google.com` and country variants, `search.google.com`, + Android quick-search), with no `utm_source=google`. +- **Other**: everything else. The `/api/other` drill-down sub-labels it (Direct, Brevo / Email, + Google OAuth callback, AI chat, Other search engine, No entry pageview, and so on). + +**Channel tabs reconcile with the Overview.** Each channel tab classifies sessions with the same +resolution (`classified_sessions`) and FULL OUTER joins redirects to conversions, so a tab's +conversion total equals the Overview's count for that channel. Page-less signups fold into a +single `(no entry page)` row. + +> The `/api/brevo` email funnel still matches the landing URL to the campaign `full_link` +> byte-for-byte (in the scraper's `f_brevo_funnel`), so it can drop clicks when query params +> differ. The raw `/api/umami-raw` endpoint never loses events. See +> [`../ANALYTICS_MASTER_FLOW.md`](../ANALYTICS_MASTER_FLOW.md). + +--- + +## Setup + +```bash +npm install +cp .env.example .env # point at the SAME Postgres the scraper writes to +npm start # runs Vite UI + Express API concurrently ``` + +### Environment variables (`.env`) + +> `DB_*` must point at the **`analytics` warehouse** (`db=analytics`, port `5432`) — the *same* +> database the scraper writes to. This repo never connects to the upstream `umami` source DB; +> that's the scraper's job. See +> [`../ANALYTICS_MASTER_FLOW.md`](../ANALYTICS_MASTER_FLOW.md#0-two-databases-one-warehouse-read-this-first). 
+ +| Var | Purpose | +|-----|---------| +| `DB_HOST`, `DB_USER`, `DB_PASSWORD`, `DB_NAME`, `DB_PORT` | **`analytics` warehouse** connection (required) | +| `PORT` | API port (default `4000`) | +| `NODE_ENV` | `dev` / `production` (serves built UI in production) | + +### Commands + +| Command | Purpose | +|---------|---------| +| `npm start` | Vite dev server + Express API together (`concurrently`) | +| `npm run dev` | Vite UI only | +| `npm run server` | Express API only | +| `npm run build` | Type-check + production build | +| `npm run lint` | ESLint | From f1c69dae67ef5dc85b5876776a5d33bf56947898 Mon Sep 17 00:00:00 2001 From: Sushank Sinha Date: Wed, 1 Jul 2026 02:06:06 +0530 Subject: [PATCH 3/3] docs: in-repo ANALYTICS_MASTER_FLOW link + move doc into repo --- ANALYTICS_MASTER_FLOW.md | 196 +++++++++++++++++++++++++++++++++++++++ README.md | 6 +- 2 files changed, 199 insertions(+), 3 deletions(-) create mode 100644 ANALYTICS_MASTER_FLOW.md diff --git a/ANALYTICS_MASTER_FLOW.md b/ANALYTICS_MASTER_FLOW.md new file mode 100644 index 0000000..5769952 --- /dev/null +++ b/ANALYTICS_MASTER_FLOW.md @@ -0,0 +1,196 @@ +# Medblocks Analytics — Master Flow + +This document describes how the two repos in this folder work **together** to turn raw +marketing/traffic data into the funnels shown on the analytics dashboard. + +| Repo | Role | Stack | +|------|------|-------| +| [`analytics_script_2`](./analytics_script_2) | **ETL / scraper** — pulls data from every source and loads it into Postgres; defines the SQL views & funnel functions | Node + TypeScript (`tsx`), Playwright, `pg`, Postgres | +| [`analytics_dashboard`](./analytics_dashboard) | **Read layer** — Express API over the same Postgres + a React/Vite UI | React 18 + Vite, Express 5, `pg`, Recharts | + +The scraper only ever *writes* to the warehouse; the dashboard only ever *reads* from it. There +is no direct connection between the two repos — the **`analytics` database is the contract**. + +--- + +## 0. 
Two databases, one warehouse (read this first) + +There are **three** DB configs across the two repos, but they point at only **two** databases on +the same server. Don't let the shared IP fool you: + +| Config | Repo | Host : Port | DB name | User | Role | +|--------|------|-------------|---------|------|------| +| `UMAMI_DB_*` | scraper | `35.227.22.159:`**`6432`** | **`umami`** | `postgres` | **Source** — Umami's own DB (read-only input). Port `6432` is a PgBouncer pooler. | +| `DATABASE_URL` | scraper | `35.227.22.159:`**`5432`** | **`analytics`** | `analytics` | **Warehouse** — scraper *writes* here | +| `DB_*` | dashboard | `35.227.22.159:`**`5432`** | **`analytics`** | `analytics` | **Warehouse** — dashboard *reads* here | + +- **`umami`** is a *separate, upstream* Postgres owned by the Umami web-analytics product. The + scraper connects to it **only to copy data out** ([src/umami.ts](./analytics_script_2/src/umami.ts)). +- **`analytics`** is the **single source of truth** — the warehouse. Both the scraper's + `DATABASE_URL` and the dashboard's `DB_*` point at this same database. + +``` + umami DB (35.227.22.159:6432, db=umami) ← external SOURCE, owned by Umami + │ read-only copy via src/umami.ts (verbatim, no filtering) + ▼ + analytics DB (35.227.22.159:5432, db=analytics) ← the warehouse / source of truth + ├── scraper writes (DATABASE_URL) + └── dashboard reads (DB_*) +``` + +Why a separate warehouse instead of querying `umami` directly? Because the funnels join Umami +events with data Umami never sees — Brevo email, LinkedIn/YouTube metrics, Search Console, and +Directus CMS content/registrations. The fix for untracked clicks lives **entirely inside the +`analytics` DB** (the funnel SQL); the `umami` source DB is never touched by it. + +--- + +## 1. 
End-to-end picture + +``` + EXTERNAL SOURCES analytics_script_2 (ETL) POSTGRES (shared) analytics_dashboard + ──────────────── ──────────────────────── ────────────────── ─────────────────── + + Umami DB (remote) ───copy────► src/umami.ts ┌───► umami_website_event ──┐ + website_event (streamed via pg-cursor) │ umami_session │ + session │ umami_session_data │ + session_data │ umami_event_data │ + event_data │ │ + │ │ SQL views + funnel fns + Brevo API ─────────────────► src/brevo.ts ──────────────►├───► brevo ├──► f_brevo_funnel() ──┐ + (email campaign stats) scrape.ts (delta logic) │ brevo_cumulative │ f_youtube_funnel() │ + │ │ f_linkedin_funnel() │ + LinkedIn (Playwright) ──────► src/linkedin.ts ───────────►├───► linkedin │ f_google_search_funnel()│ + (post analytics, login) scrape.ts (delta logic) │ linkedin_cumulative │ v_*_url_map views │ + │ │ │ │ + YouTube (Playwright + API) ─► src/youtube*.ts ───────────►├───► youtube │ ▼ ▼ + (video stats, keywords) processYouTube.ts │ yt_keywords │ server/index.js ──► React UI + │ yt_search_ranking │ (Express API) (src/components/*Tab.tsx) + Google Search Console ──────► src/searchConsole.ts ──────►├───► search_console │ /api/brevo Overview / Google / + (queries, clicks, impr.) scrape.ts │ search_console_fresh │ /api/google Brevo / LinkedIn / + │ │ /api/linkedin-raw YouTube / Raw Umami / + Directus CMS API ───────────► src/directus/*.ts ─────────►└───► directus_content │ /api/youtube-raw Contact Us / … + (content, users, contacts, syncToDb.ts (paged upsert) directus_user │ /api/umami-raw + enrollments, registrations) directus_contact │ /api/total-users + directus_enrollment │ /api/search-queries + directus_fhir_*_reg │ /api/contact-us + directus_webinar_enroll ─┘ /api/user-growth +``` + +--- + +## 2. The ETL run (`analytics_script_2`) + +Entry point: [`src/scrape.ts`](./analytics_script_2/src/scrape.ts) (`npm run scrape`). 
It runs +once per day (scheduled via `dokploy_schedule_script.sh` / Docker) and processes **yesterday** +(most sources lag ~1 day; Search Console lags ~3 days and is fetched separately). + +Order of operations inside `scrape()`: + +1. **Search Console** — historical (`scDate`, 3 days back) + fresh (`date`, 1 day back) → + `search_console` / `search_console_fresh`. +2. **Brevo** — fetch campaign stats; compute **deltas** vs `brevo_cumulative`, insert only + non-zero-change days into `brevo`, then refresh the cumulative snapshot + ([scrape.ts:161-249](./analytics_script_2/src/scrape.ts#L161-L249)). +3. **Umami** — [`fetchUmamiEvents`](./analytics_script_2/src/umami.ts) streams 4 tables + (`website_event`, `session`, `session_data`, `event_data`) from the **remote Umami + Postgres** in 10k-row batches and bulk-upserts them locally with + `ON CONFLICT … DO NOTHING`. **No filtering** — every raw event is copied verbatim. +4. **Directus** — each collection streamed in pages of 500 and upserted; full sync if the + target table is empty, otherwise date-filtered since yesterday. +5. **LinkedIn / YouTube** — Playwright scrapers (currently commented out in `scrape.ts`; + run via their own scripts / logins). Same delta-vs-cumulative pattern as Brevo. + +Other entry points: `npm run scrape:date` (backfill a specific day), +`npm run backfill:search-console`, `npm run directus-export` +([src/directus/export.ts](./analytics_script_2/src/directus/export.ts) — full Directus dump, +no date filter). + +--- + +## 3. From raw events to funnels (the SQL layer) + +The funnels live in [`analytics_script_2/views/`](./analytics_script_2/views) and are installed +into Postgres by `setup-views-functions.sh`. The dashboard calls these functions; it does **not** +re-implement the funnel logic (with the exception of the raw/exploratory endpoints). 
+ +Each channel has two pieces: + +- **`v_<channel>_url_map`** — joins `directus_content` (the campaign links you authored) to the + channel's stats table, producing `content_id → full_link`. +- **`f_<channel>_funnel(start, end)`** — the funnel: impressions → **clicks** → registrations. + +**How a "click" is attributed:** the funnel reconstructs the landing URL from each Umami event +and matches it against the campaign's `full_link`: + +```sql +JOIN umami_website_event u + ON ('https://medblocks.com' || u.url_path + || CASE WHEN u.url_query IS NOT NULL THEN '?' ELSE '' END + || COALESCE(u.url_query, '')) = m.full_link +``` + +YouTube/LinkedIn additionally filter on `referrer_domain`; Brevo (email, no referrer) relies on +the URL match alone. Registrations are tied back via `umami_session.distinct_id` → +`SignIn`/`SignUp` events → `umami_event_data.user_id`. + +> ### ⚠️ Known issue — under-counted / "untraceable" clicks +> The click join above requires the landing URL to equal the stored `full_link` +> **character-for-character**. Real clicks are silently dropped whenever the actual URL differs: +> extra params appended by the source (Brevo's redirector, `fbclid`/`gclid`), reordered query +> params, or trailing-slash/case/encoding differences. The clicks exist in `umami_website_event` +> but never join to a campaign → they appear "untracked." Fix direction: match on a **normalized** +> URL (path + whitelisted UTM params, order-insensitive) or join on `utm_campaign` instead of the +> full URL string. See `views/Brevo/f_brevo_funnel.sql:37`, `views/YouTube/f_youtube_funnel.sql:41`, +> `views/LinkedIn/f_linkedin_funnel.sql:37`. 
+ +The dashboard's `/api/umami-raw` endpoint buckets traffic by `utm_source`/`referrer_domain` +([server/index.js:1222-1244](./analytics_dashboard/server/index.js#L1222-L1244)) and a shared +`SOURCE_CASE_SQL` ([server/index.js:207-219](./analytics_dashboard/server/index.js#L207-L219)) +classifies sources for attribution — note its `referrer_domain IN (…)` lists are case-sensitive. + +--- + +## 4. The read layer (`analytics_dashboard`) + +- **API:** [`server/index.js`](./analytics_dashboard/server/index.js) — one Express server, + ~16 endpoints, each running SQL (mostly calling the `f_*_funnel` functions) against Postgres. +- **UI:** [`src/`](./analytics_dashboard/src) — React + Vite. One `*Tab.tsx` component per + channel (`OverviewTab`, `GoogleTab`, `BrevoTab`, `LinkedInRawTab`, `YouTubeRawTab`, + `RawUmamiTab`, `ContactUsTab`, …), all driven by a shared `useDateRange` hook and + `useFetchData`. +- **Run:** `npm start` (concurrently runs Vite dev server + API). + +--- + +## 5. Shared database tables + +| Group | Tables | +|-------|--------| +| Umami (raw, verbatim copy) | `umami_website_event`, `umami_session`, `umami_session_data`, `umami_event_data` | +| Brevo (email) | `brevo` (daily deltas), `brevo_cumulative` (snapshot) | +| LinkedIn | `linkedin` (daily), `linkedin_cumulative` | +| YouTube | `youtube`, `yt_keywords`, `yt_search_ranking` | +| Search Console | `search_console`, `search_console_fresh` | +| Directus (CMS mirror) | `directus_content`, `directus_user`, `directus_contact`, `directus_enrollment`, `directus_fhir_builders_reg`, `directus_fhir_challenge_reg`, `directus_webinar_enrollment` | + +Schema source of truth: [`analytics_script_2/db/init.sql`](./analytics_script_2/db/init.sql). + +--- + +## 6. Quick start + +```bash +# 1. 
ETL — populate Postgres +cd analytics_script_2 +npm install && npx playwright install chromium +cp env.example .env # fill DB + source credentials +bash setup-views-functions.sh # install v_* views and f_*_funnel functions +npm run scrape # or: npm run scrape:date 2026-06-14 + +# 2. Dashboard — read & visualize +cd ../analytics_dashboard +npm install +cp .env.example .env # point at the same Postgres +npm start # Vite UI + Express API +``` diff --git a/README.md b/README.md index 761c92b..95e03ab 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ The **read half** of the Medblocks analytics stack: an Express API over Postgres React/Vite dashboard that visualizes the funnels. It reads the data and SQL functions produced by [`analytics_script_2`](../analytics_script_2). -> **Big picture:** see [`../ANALYTICS_MASTER_FLOW.md`](../ANALYTICS_MASTER_FLOW.md) for how this +> **Big picture:** see [`ANALYTICS_MASTER_FLOW.md`](ANALYTICS_MASTER_FLOW.md) for how this > repo and the scraper fit together. This repo **only reads** from Postgres; the scraper **only writes**. Postgres is the contract. @@ -80,7 +80,7 @@ single `(no entry page)` row. > The `/api/brevo` email funnel still matches the landing URL to the campaign `full_link` > byte-for-byte (in the scraper's `f_brevo_funnel`), so it can drop clicks when query params > differ. The raw `/api/umami-raw` endpoint never loses events. See -> [`../ANALYTICS_MASTER_FLOW.md`](../ANALYTICS_MASTER_FLOW.md). +> [`ANALYTICS_MASTER_FLOW.md`](ANALYTICS_MASTER_FLOW.md). --- @@ -97,7 +97,7 @@ npm start # runs Vite UI + Express API concurrently > `DB_*` must point at the **`analytics` warehouse** (`db=analytics`, port `5432`) — the *same* > database the scraper writes to. This repo never connects to the upstream `umami` source DB; > that's the scraper's job. See -> [`../ANALYTICS_MASTER_FLOW.md`](../ANALYTICS_MASTER_FLOW.md#0-two-databases-one-warehouse-read-this-first). 
+> [`ANALYTICS_MASTER_FLOW.md`](ANALYTICS_MASTER_FLOW.md#0-two-databases-one-warehouse-read-this-first). | Var | Purpose | |-----|---------|