diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c index 025215fcc9065..b5000bc14b78e 100644 --- a/contrib/pg_stat_statements/pg_stat_statements.c +++ b/contrib/pg_stat_statements/pg_stat_statements.c @@ -50,7 +50,6 @@ #include "access/htup_details.h" #include "access/parallel.h" #include "catalog/pg_authid.h" -#include "common/int.h" #include "executor/instrument.h" #include "funcapi.h" #include "jit/jit.h" @@ -59,7 +58,6 @@ #include "nodes/queryjumble.h" #include "optimizer/planner.h" #include "parser/analyze.h" -#include "parser/scanner.h" #include "pgstat.h" #include "storage/fd.h" #include "storage/ipc.h" @@ -339,7 +337,7 @@ PG_FUNCTION_INFO_V1(pg_stat_statements_info); static void pgss_shmem_shutdown(int code, Datum arg); static void pgss_post_parse_analyze(ParseState *pstate, Query *query, - JumbleState *jstate); + const JumbleState *jstate); static PlannedStmt *pgss_planner(Query *parse, const char *query_string, int cursorOptions, @@ -363,7 +361,7 @@ static void pgss_store(const char *query, int64 queryId, const BufferUsage *bufusage, const WalUsage *walusage, const struct JitInstrumentation *jitusage, - JumbleState *jstate, + const JumbleState *jstate, int parallel_workers_to_launch, int parallel_workers_launched, PlannedStmtOrigin planOrigin); @@ -381,12 +379,9 @@ static char *qtext_fetch(Size query_offset, int query_len, static bool need_gc_qtexts(void); static void gc_qtexts(void); static TimestampTz entry_reset(Oid userid, Oid dbid, int64 queryid, bool minmax_only); -static char *generate_normalized_query(JumbleState *jstate, const char *query, +static char *generate_normalized_query(const JumbleState *jstate, + const char *query, int query_loc, int *query_len_p); -static void fill_in_constant_lengths(JumbleState *jstate, const char *query, - int query_loc); -static int comp_location(const void *a, const void *b); - /* * Module load callback @@ -836,7 +831,7 @@ pgss_shmem_shutdown(int code, 
Datum arg) * Post-parse-analysis hook: mark query with a queryId */ static void -pgss_post_parse_analyze(ParseState *pstate, Query *query, JumbleState *jstate) +pgss_post_parse_analyze(ParseState *pstate, Query *query, const JumbleState *jstate) { if (prev_post_parse_analyze_hook) prev_post_parse_analyze_hook(pstate, query, jstate); @@ -1287,7 +1282,7 @@ pgss_store(const char *query, int64 queryId, const BufferUsage *bufusage, const WalUsage *walusage, const struct JitInstrumentation *jitusage, - JumbleState *jstate, + const JumbleState *jstate, int parallel_workers_to_launch, int parallel_workers_launched, PlannedStmtOrigin planOrigin) @@ -2824,7 +2819,7 @@ entry_reset(Oid userid, Oid dbid, int64 queryid, bool minmax_only) * Returns a palloc'd string. */ static char * -generate_normalized_query(JumbleState *jstate, const char *query, +generate_normalized_query(const JumbleState *jstate, const char *query, int query_loc, int *query_len_p) { char *norm_query; @@ -2836,12 +2831,14 @@ generate_normalized_query(JumbleState *jstate, const char *query, last_off = 0, /* Offset from start for previous tok */ last_tok_len = 0; /* Length (in bytes) of that tok */ int num_constants_replaced = 0; + LocationLen *locs = NULL; /* - * Get constants' lengths (core system only gives us locations). Note - * this also ensures the items are sorted by location. + * Determine constants' lengths (core system only gives us locations), and + * return a sorted copy of jstate's LocationLen data with lengths filled + * in. */ - fill_in_constant_lengths(jstate, query, query_loc); + locs = ComputeConstantLengths(jstate, query, query_loc); /* * Allow for $n symbols to be longer than the constants they replace. @@ -2867,15 +2864,15 @@ generate_normalized_query(JumbleState *jstate, const char *query, * the parameter in the next iteration (or after the loop is done), * which is a bit odd but seems to work okay in most cases. 
*/ - if (jstate->clocations[i].extern_param && !jstate->has_squashed_lists) + if (locs[i].extern_param && !jstate->has_squashed_lists) continue; - off = jstate->clocations[i].location; + off = locs[i].location; /* Adjust recorded location if we're dealing with partial string */ off -= query_loc; - tok_len = jstate->clocations[i].length; + tok_len = locs[i].length; if (tok_len < 0) continue; /* ignore any duplicates */ @@ -2894,7 +2891,7 @@ generate_normalized_query(JumbleState *jstate, const char *query, */ n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d%s", num_constants_replaced + 1 + jstate->highest_extern_param_id, - jstate->clocations[i].squashed ? " /*, ... */" : ""); + locs[i].squashed ? " /*, ... */" : ""); num_constants_replaced++; /* move forward */ @@ -2903,6 +2900,10 @@ generate_normalized_query(JumbleState *jstate, const char *query, last_tok_len = tok_len; } + /* Clean up, if needed */ + if (locs) + pfree(locs); + /* * We've copied up until the last ignorable constant. Copy over the * remaining bytes of the original query string. @@ -2919,140 +2920,3 @@ generate_normalized_query(JumbleState *jstate, const char *query, *query_len_p = n_quer_loc; return norm_query; } - -/* - * Given a valid SQL string and an array of constant-location records, - * fill in the textual lengths of those constants. - * - * The constants may use any allowed constant syntax, such as float literals, - * bit-strings, single-quoted strings and dollar-quoted strings. This is - * accomplished by using the public API for the core scanner. - * - * It is the caller's job to ensure that the string is a valid SQL statement - * with constants at the indicated locations. Since in practice the string - * has already been parsed, and the locations that the caller provides will - * have originated from within the authoritative parser, this should not be - * a problem. - * - * Multiple constants can have the same location. 
We reset lengths of those - * past the first to -1 so that they can later be ignored. - * - * If query_loc > 0, then "query" has been advanced by that much compared to - * the original string start, so we need to translate the provided locations - * to compensate. (This lets us avoid re-scanning statements before the one - * of interest, so it's worth doing.) - * - * N.B. There is an assumption that a '-' character at a Const location begins - * a negative numeric constant. This precludes there ever being another - * reason for a constant to start with a '-'. - */ -static void -fill_in_constant_lengths(JumbleState *jstate, const char *query, - int query_loc) -{ - LocationLen *locs; - core_yyscan_t yyscanner; - core_yy_extra_type yyextra; - core_YYSTYPE yylval; - YYLTYPE yylloc; - - /* - * Sort the records by location so that we can process them in order while - * scanning the query text. - */ - if (jstate->clocations_count > 1) - qsort(jstate->clocations, jstate->clocations_count, - sizeof(LocationLen), comp_location); - locs = jstate->clocations; - - /* initialize the flex scanner --- should match raw_parser() */ - yyscanner = scanner_init(query, - &yyextra, - &ScanKeywords, - ScanKeywordTokens); - - /* Search for each constant, in sequence */ - for (int i = 0; i < jstate->clocations_count; i++) - { - int loc; - int tok; - - /* Ignore constants after the first one in the same location */ - if (i > 0 && locs[i].location == locs[i - 1].location) - { - locs[i].length = -1; - continue; - } - - if (locs[i].squashed) - continue; /* squashable list, ignore */ - - /* Adjust recorded location if we're dealing with partial string */ - loc = locs[i].location - query_loc; - Assert(loc >= 0); - - /* - * We have a valid location for a constant that's not a dupe. Lex - * tokens until we find the desired constant. 
- */ - for (;;) - { - tok = core_yylex(&yylval, &yylloc, yyscanner); - - /* We should not hit end-of-string, but if we do, behave sanely */ - if (tok == 0) - break; /* out of inner for-loop */ - - /* - * We should find the token position exactly, but if we somehow - * run past it, work with that. - */ - if (yylloc >= loc) - { - if (query[loc] == '-') - { - /* - * It's a negative value - this is the one and only case - * where we replace more than a single token. - * - * Do not compensate for the core system's special-case - * adjustment of location to that of the leading '-' - * operator in the event of a negative constant. It is - * also useful for our purposes to start from the minus - * symbol. In this way, queries like "select * from foo - * where bar = 1" and "select * from foo where bar = -2" - * will have identical normalized query strings. - */ - tok = core_yylex(&yylval, &yylloc, yyscanner); - if (tok == 0) - break; /* out of inner for-loop */ - } - - /* - * We now rely on the assumption that flex has placed a zero - * byte after the text of the current token in scanbuf. 
- */ - locs[i].length = strlen(yyextra.scanbuf + loc); - break; /* out of inner for-loop */ - } - } - - /* If we hit end-of-string, give up, leaving remaining lengths -1 */ - if (tok == 0) - break; - } - - scanner_finish(yyscanner); -} - -/* - * comp_location: comparator for qsorting LocationLen structs by location - */ -static int -comp_location(const void *a, const void *b) -{ - int l = ((const LocationLen *) a)->location; - int r = ((const LocationLen *) b)->location; - - return pg_cmp_s32(l, r); -} diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 3324d2d3c49e1..13e95afa97c0e 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -2533,6 +2533,77 @@ include_dir 'conf.d' + + Timing + + + + timing_clock_source (enum) + + timing_clock_source configuration parameter + + RDTSC + + Time-Stamp Counter + TSC + + TSC + + + + Selects the method for making timing measurements using the OS or + specialized CPU instructions. Possible values are: + + + + auto (automatically chooses TSC + clock source on supported x86-64 CPUs, otherwise uses the OS system + clock) + + + + + system (measures timing using the OS system clock) + + + + + tsc (measures timing with a CPU instruction, e.g. + using RDTSC/RDTSCP on x86-64) + + + + The default is auto. Only superusers can change this + setting. Changing the setting during query execution is not recommended + and may cause interval timings to jump significantly or produce negative + values. + + + If enabled, the TSC clock source, named after the + Time-Stamp Counter on x86-64, will use specialized CPU instructions when + measuring time intervals. This lowers timing overhead compared to reading + the OS system clock, and reduces the measurement error on top of the + actual runtime, for example with EXPLAIN ANALYZE. + + + On x86-64 CPUs the TSC clock source utilizes the + RDTSC instruction for EXPLAIN ANALYZE. 
+ For timings that require higher precision the RDTSCP + instruction is used, which avoids inaccuracies due to CPU instruction + re-ordering. Use of the TSC clock source is not + supported on older x86-64 CPUs and other architectures, and is not + advised on systems that utilize an emulated TSC, as it + is likely slower than the system clock source. + + + To help decide which clock source to use you can run the + utility to check TSC + availability, and perform timing measurements. + + + + + Background Writer diff --git a/doc/src/sgml/ref/pgtesttiming.sgml b/doc/src/sgml/ref/pgtesttiming.sgml index afe6a12be4b30..342f4425c65c7 100644 --- a/doc/src/sgml/ref/pgtesttiming.sgml +++ b/doc/src/sgml/ref/pgtesttiming.sgml @@ -32,9 +32,10 @@ PostgreSQL documentation pg_test_timing is a tool to measure the timing overhead on your system and confirm that the system time never - moves backwards. It simply reads the system clock over and over again + moves backwards. It reads supported clock sources over and over again as fast as it can for a specified length of time, and then prints - statistics about the observed differences in successive clock readings. + statistics about the observed differences in successive clock readings, + as well as which clock source will be used. Smaller (but not zero) differences are better, since they imply both @@ -45,7 +46,10 @@ PostgreSQL documentation This tool is also helpful to determine if the track_io_timing configuration parameter is likely - to produce useful results. + to produce useful results, and whether the + TSC clock source (see + ) is available and if it will be + used by default. @@ -151,47 +155,134 @@ PostgreSQL documentation However, the largest observed difference is always shown. - The example results below show that 99.99% of timing loops took between - 8 and 31 nanoseconds, with the worst case somewhere between 32768 and - 65535 nanoseconds. 
In the second block, we can see that typical loop - time is 16 nanoseconds, and the readings appear to have full nanosecond - precision. + On platforms that support the TSC clock source, + additional output sections are shown for the RDTSCP + instruction (used for general timing needs, such as + track_io_timing) and the RDTSC + instruction (used for EXPLAIN ANALYZE). At the end + of the output, the TSC frequency, which is sourced either + directly from CPU information or from the alternate calibration + mechanism, is shown, as well as whether the TSC clock + source will be used by default. + + + + The example results below show system clock timing where 99.99% of loops + took between 16 and 63 nanoseconds, followed by TSC + clock source results. The RDTSCP instruction shows + most loops completing in 20–30 nanoseconds, while the + RDTSC instruction is the fastest at + 9–30 nanoseconds. In this example the TSC + clock source will be used by default, but can be disabled by setting + timing_clock_source to system. 
@@ -203,6 +294,7 @@ Observed timing durations up to 99.9900%: + Wiki discussion about timing diff --git a/src/backend/commands/statscmds.c b/src/backend/commands/statscmds.c index eea45106a3ffe..b354723be4435 100644 --- a/src/backend/commands/statscmds.c +++ b/src/backend/commands/statscmds.c @@ -280,8 +280,10 @@ CreateStatistics(CreateStatsStmt *stmt, bool check_rights) if (type->lt_opr == InvalidOid) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("column \"%s\" cannot be used in multivariate statistics because its type %s has no default btree operator class", - attname, format_type_be(attForm->atttypid)))); + errmsg("cannot create multivariate statistics on column \"%s\"", + attname), + errdetail("The type %s has no default btree operator class.", + format_type_be(attForm->atttypid)))); } /* Treat virtual generated columns as expressions */ @@ -325,8 +327,10 @@ CreateStatistics(CreateStatsStmt *stmt, bool check_rights) if (type->lt_opr == InvalidOid) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("column \"%s\" cannot be used in multivariate statistics because its type %s has no default btree operator class", - get_attname(relid, var->varattno, false), format_type_be(var->vartype)))); + errmsg("cannot create multivariate statistics on column \"%s\"", + get_attname(relid, var->varattno, false)), + errdetail("The type %s has no default btree operator class.", + format_type_be(var->vartype)))); } /* Treat virtual generated columns as expressions */ @@ -375,8 +379,9 @@ CreateStatistics(CreateStatsStmt *stmt, bool check_rights) if (type->lt_opr == InvalidOid) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("expression cannot be used in multivariate statistics because its type %s has no default btree operator class", - format_type_be(atttype)))); + errmsg("cannot create multivariate statistics on this expression"), + errdetail("The type %s has no default btree operator class.", + format_type_be(atttype)))); } 
stxexprs = lappend(stxexprs, expr); diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 4d4e96a530236..c41005ba44e62 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -3940,6 +3940,13 @@ typedef struct AfterTriggerCallbackItem static AfterTriggersData afterTriggers; +/* + * Incremented before invoking afterTriggerInvokeEvents(). Used by + * AfterTriggerIsActive() to determine whether batch callbacks will fire, + * so that RI trigger functions can take the batched fast path. + */ +static int afterTriggerFiringDepth = 0; + static void AfterTriggerExecute(EState *estate, AfterTriggerEvent event, ResultRelInfo *relInfo, @@ -5113,6 +5120,7 @@ AfterTriggerBeginXact(void) Assert(afterTriggers.events.head == NULL); Assert(afterTriggers.trans_stack == NULL); Assert(afterTriggers.maxtransdepth == 0); + Assert(afterTriggerFiringDepth == 0); } @@ -5184,6 +5192,7 @@ AfterTriggerEndQuery(EState *estate) */ qs = &afterTriggers.query_stack[afterTriggers.query_depth]; + afterTriggerFiringDepth++; for (;;) { if (afterTriggerMarkEvents(&qs->events, &afterTriggers.events, true)) @@ -5234,6 +5243,7 @@ AfterTriggerEndQuery(EState *estate) AfterTriggerFreeQuery(&afterTriggers.query_stack[afterTriggers.query_depth]); afterTriggers.query_depth--; + afterTriggerFiringDepth--; } @@ -5329,6 +5339,7 @@ AfterTriggerFireDeferred(void) * Run all the remaining triggers. Loop until they are all gone, in case * some trigger queues more for us to do. */ + afterTriggerFiringDepth++; while (afterTriggerMarkEvents(events, NULL, false)) { CommandId firing_id = afterTriggers.firing_counter++; @@ -5340,6 +5351,8 @@ AfterTriggerFireDeferred(void) /* Flush any fast-path batches accumulated by the triggers just fired. */ FireAfterTriggerBatchCallbacks(); + afterTriggerFiringDepth--; + /* * We don't bother freeing the event list, since it will go away anyway * (and more efficiently than via pfree) in AfterTriggerEndXact. 
@@ -5404,6 +5417,8 @@ AfterTriggerEndXact(bool isCommit) /* No more afterTriggers manipulation until next transaction starts. */ afterTriggers.query_depth = -1; + + afterTriggerFiringDepth = 0; } /* @@ -6053,6 +6068,7 @@ AfterTriggerSetState(ConstraintsSetStmt *stmt) AfterTriggerEventList *events = &afterTriggers.events; bool snapshot_set = false; + afterTriggerFiringDepth++; while (afterTriggerMarkEvents(events, NULL, true)) { CommandId firing_id = afterTriggers.firing_counter++; @@ -6086,6 +6102,7 @@ AfterTriggerSetState(ConstraintsSetStmt *stmt) * Flush any fast-path batches accumulated by the triggers just fired. */ FireAfterTriggerBatchCallbacks(); + afterTriggerFiringDepth--; if (snapshot_set) PopActiveSnapshot(); @@ -6806,10 +6823,10 @@ RegisterAfterTriggerBatchCallback(AfterTriggerBatchCallback callback, * Allocate in TopTransactionContext so the item survives for the duration * of the batch, which may span multiple trigger invocations. * - * Must be called while afterTriggers is active (query_depth >= 0); - * callbacks registered outside a trigger-firing context would never fire. + * Must be called while afterTriggers is active; callbacks registered + * outside a trigger-firing context would never fire. 
*/ - Assert(afterTriggers.query_depth >= 0); + Assert(afterTriggerFiringDepth > 0); oldcxt = MemoryContextSwitchTo(TopTransactionContext); item = palloc(sizeof(AfterTriggerCallbackItem)); item->callback = callback; @@ -6836,6 +6853,7 @@ FireAfterTriggerBatchCallbacks(void) if (afterTriggers.query_depth > 0) return; + Assert(afterTriggerFiringDepth > 0); foreach(lc, afterTriggers.batch_callbacks) { AfterTriggerCallbackItem *item = lfirst(lc); @@ -6858,5 +6876,5 @@ FireAfterTriggerBatchCallbacks(void) bool AfterTriggerIsActive(void) { - return afterTriggers.query_depth >= 0; + return afterTriggerFiringDepth > 0; } diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index 011a9684df0d5..b1f31d59a4d6a 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -16,6 +16,8 @@ #include #include "executor/instrument.h" +#include "portability/instr_time.h" +#include "utils/guc_hooks.h" BufferUsage pgBufferUsage; static BufferUsage save_pgBufferUsage; @@ -52,7 +54,7 @@ InstrStart(Instrumentation *instr) if (!INSTR_TIME_IS_ZERO(instr->starttime)) elog(ERROR, "InstrStart called twice in a row"); else - INSTR_TIME_SET_CURRENT(instr->starttime); + INSTR_TIME_SET_CURRENT_FAST(instr->starttime); } /* save buffer usage totals at start, if needed */ @@ -78,7 +80,7 @@ InstrStopCommon(Instrumentation *instr, instr_time *accum_time) if (INSTR_TIME_IS_ZERO(instr->starttime)) elog(ERROR, "InstrStop called without start"); - INSTR_TIME_SET_CURRENT(endtime); + INSTR_TIME_SET_CURRENT_FAST(endtime); INSTR_TIME_ACCUM_DIFF(*accum_time, endtime, instr->starttime); INSTR_TIME_SET_ZERO(instr->starttime); @@ -345,3 +347,75 @@ WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub) dst->wal_fpi_bytes += add->wal_fpi_bytes - sub->wal_fpi_bytes; dst->wal_buffers_full += add->wal_buffers_full - sub->wal_buffers_full; } + +/* GUC hooks for timing_clock_source */ + +bool +check_timing_clock_source(int *newval, void **extra, 
GucSource source) +{ + /* + * Do nothing if timing is not initialized. This is only expected on child + * processes in EXEC_BACKEND builds, as GUC hooks can be called during + * InitializeGUCOptions() before InitProcessGlobals() has had a chance to + * run pg_initialize_timing(). Instead, TSC will be initialized via + * restore_backend_variables. + */ +#ifdef EXEC_BACKEND + if (!timing_initialized) + return true; +#else + Assert(timing_initialized); +#endif + +#if PG_INSTR_TSC_CLOCK + pg_initialize_timing_tsc(); + + if (*newval == TIMING_CLOCK_SOURCE_TSC && timing_tsc_frequency_khz <= 0) + { + GUC_check_errdetail("TSC is not supported as timing clock source"); + return false; + } +#endif + + return true; +} + +void +assign_timing_clock_source(int newval, void *extra) +{ +#ifdef EXEC_BACKEND + if (!timing_initialized) + return; +#else + Assert(timing_initialized); +#endif + + /* + * Ignore the return code since the check hook already verified TSC is + * usable if it's explicitly requested. 
+ */ + pg_set_timing_clock_source(newval); +} + +const char * +show_timing_clock_source(void) +{ + switch (timing_clock_source) + { + case TIMING_CLOCK_SOURCE_AUTO: +#if PG_INSTR_TSC_CLOCK + if (pg_current_timing_clock_source() == TIMING_CLOCK_SOURCE_TSC) + return "auto (tsc)"; +#endif + return "auto (system)"; + case TIMING_CLOCK_SOURCE_SYSTEM: + return "system"; +#if PG_INSTR_TSC_CLOCK + case TIMING_CLOCK_SOURCE_TSC: + return "tsc"; +#endif + } + + /* unreachable */ + return "?"; +} diff --git a/src/backend/nodes/queryjumblefuncs.c b/src/backend/nodes/queryjumblefuncs.c index 87db8dc1a32f1..7c63766a51c5d 100644 --- a/src/backend/nodes/queryjumblefuncs.c +++ b/src/backend/nodes/queryjumblefuncs.c @@ -40,10 +40,12 @@ #include "access/transam.h" #include "catalog/pg_proc.h" #include "common/hashfn.h" +#include "common/int.h" #include "miscadmin.h" #include "nodes/nodeFuncs.h" #include "nodes/queryjumble.h" #include "utils/lsyscache.h" +#include "parser/scanner.h" #include "parser/scansup.h" #define JUMBLE_SIZE 1024 /* query serialization buffer size */ @@ -773,3 +775,156 @@ _jumbleRangeTblEntry_eref(JumbleState *jstate, */ JUMBLE_STRING(aliasname); } + +/* + * CompLocation: comparator for qsorting LocationLen structs by location + */ +static int +CompLocation(const void *a, const void *b) +{ + int l = ((const LocationLen *) a)->location; + int r = ((const LocationLen *) b)->location; + + return pg_cmp_s32(l, r); +} + +/* + * Given a valid SQL string and an array of constant-location records, return + * the textual lengths of those constants in a newly allocated LocationLen + * array, or NULL if there are no constants. + * + * The constants may use any allowed constant syntax, such as float literals, + * bit-strings, single-quoted strings and dollar-quoted strings. This is + * accomplished by using the public API for the core scanner. + * + * It is the caller's job to ensure that the string is a valid SQL statement + * with constants at the indicated locations. 
Since in practice the string + * has already been parsed, and the locations that the caller provides will + * have originated from within the authoritative parser, this should not be + * a problem. + * + * Multiple constants can have the same location. We reset lengths of those + * past the first to -1 so that they can later be ignored. + * + * If query_loc > 0, then "query" has been advanced by that much compared to + * the original string start, as is the case with multi-statement strings, so + * we need to translate the provided locations to compensate. (This lets us + * avoid re-scanning statements before the one of interest, so it's worth + * doing.) + * + * N.B. There is an assumption that a '-' character at a Const location begins + * a negative numeric constant. This precludes there ever being another + * reason for a constant to start with a '-'. + * + * It is the caller's responsibility to free the result, if necessary. + */ +LocationLen * +ComputeConstantLengths(const JumbleState *jstate, const char *query, + int query_loc) +{ + LocationLen *locs; + core_yyscan_t yyscanner; + core_yy_extra_type yyextra; + core_YYSTYPE yylval; + YYLTYPE yylloc; + + if (jstate->clocations_count == 0) + return NULL; + + /* Copy constant locations to avoid modifying jstate */ + locs = palloc_array(LocationLen, jstate->clocations_count); + memcpy(locs, jstate->clocations, jstate->clocations_count * sizeof(LocationLen)); + + /* + * Sort the records by location so that we can process them in order while + * scanning the query text. 
+ */ + if (jstate->clocations_count > 1) + qsort(locs, jstate->clocations_count, + sizeof(LocationLen), CompLocation); + + /* initialize the flex scanner --- should match raw_parser() */ + yyscanner = scanner_init(query, + &yyextra, + &ScanKeywords, + ScanKeywordTokens); + + /* Search for each constant, in sequence */ + for (int i = 0; i < jstate->clocations_count; i++) + { + int loc; + int tok; + + /* Ignore constants after the first one in the same location */ + if (i > 0 && locs[i].location == locs[i - 1].location) + { + locs[i].length = -1; + continue; + } + + if (locs[i].squashed) + continue; /* squashable list, ignore */ + + /* + * Adjust the constant's location using the provided starting location + * of the current statement. This allows us to avoid scanning a + * multi-statement string from the beginning. + */ + loc = locs[i].location - query_loc; + Assert(loc >= 0); + + /* + * We have a valid location for a constant that's not a dupe. Lex + * tokens until we find the desired constant. + */ + for (;;) + { + tok = core_yylex(&yylval, &yylloc, yyscanner); + + /* We should not hit end-of-string, but if we do, behave sanely */ + if (tok == 0) + break; /* out of inner for-loop */ + + /* + * We should find the token position exactly, but if we somehow + * run past it, work with that. + */ + if (yylloc >= loc) + { + if (query[loc] == '-') + { + /* + * It's a negative value - this is the one and only case + * where we replace more than a single token. + * + * Do not compensate for the special-case adjustment of + * location to that of the leading '-' operator in the + * event of a negative constant (see doNegate() in + * gram.y). It is also useful for our purposes to start + * from the minus symbol. In this way, queries like + * "select * from foo where bar = 1" and "select * from + * foo where bar = -2" can be treated similarly. 
+ */ + tok = core_yylex(&yylval, &yylloc, yyscanner); + if (tok == 0) + break; /* out of inner for-loop */ + } + + /* + * We now rely on the assumption that flex has placed a zero + * byte after the text of the current token in scanbuf. + */ + locs[i].length = strlen(yyextra.scanbuf + loc); + break; /* out of inner for-loop */ + } + } + + /* If we hit end-of-string, give up, leaving remaining lengths -1 */ + if (tok == 0) + break; + } + + scanner_finish(yyscanner); + + return locs; +} diff --git a/src/backend/postmaster/launch_backend.c b/src/backend/postmaster/launch_backend.c index ed0f4f2d23436..8f3cfea880c3c 100644 --- a/src/backend/postmaster/launch_backend.c +++ b/src/backend/postmaster/launch_backend.c @@ -57,6 +57,7 @@ #ifdef EXEC_BACKEND #include "nodes/queryjumble.h" +#include "portability/instr_time.h" #include "storage/pg_shmem.h" #include "storage/spin.h" #endif @@ -129,6 +130,8 @@ typedef struct int MyPMChildSlot; + int32 timing_tsc_frequency_khz; + /* * These are only used by backend processes, but are here because passing * a socket needs some special handling on Windows. 
'client_sock' is an @@ -750,6 +753,8 @@ save_backend_variables(BackendParameters *param, param->MaxBackends = MaxBackends; param->num_pmchild_slots = num_pmchild_slots; + param->timing_tsc_frequency_khz = timing_tsc_frequency_khz; + #ifdef WIN32 param->PostmasterHandle = PostmasterHandle; if (!write_duplicated_handle(&param->initial_signal_pipe, @@ -1004,6 +1009,12 @@ restore_backend_variables(BackendParameters *param) MaxBackends = param->MaxBackends; num_pmchild_slots = param->num_pmchild_slots; + timing_tsc_frequency_khz = param->timing_tsc_frequency_khz; + + /* Re-run logic usually done by assign_timing_clock_source */ + pg_initialize_timing(); + pg_set_timing_clock_source(timing_clock_source); + #ifdef WIN32 PostmasterHandle = param->PostmasterHandle; pgwin32_initial_signal_pipe = param->initial_signal_pipe; diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 6f13e8f40a0be..26bf4cfe2f5c7 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -1954,6 +1954,11 @@ InitProcessGlobals(void) MyStartTimestamp = GetCurrentTimestamp(); MyStartTime = timestamptz_to_time_t(MyStartTimestamp); + /* + * Initialize timing infrastructure + */ + pg_initialize_timing(); + /* * Set a different global seed in every process. We want something * unpredictable, so if possible, use high-quality random bits for the diff --git a/src/backend/utils/activity/pgstat_shmem.c b/src/backend/utils/activity/pgstat_shmem.c index 955faf5ebc7d2..b8f354c818a06 100644 --- a/src/backend/utils/activity/pgstat_shmem.c +++ b/src/backend/utils/activity/pgstat_shmem.c @@ -150,8 +150,7 @@ StatsShmemSize(void) continue; Assert(kind_info->shared_size != 0); - - sz += MAXALIGN(kind_info->shared_size); + sz = add_size(sz, MAXALIGN(kind_info->shared_size)); } return sz; @@ -189,6 +188,7 @@ StatsShmemInit(void *arg) * efficiency win. 
*/ ctl->raw_dsa_area = p; + p += pgstat_dsa_init_size(); dsa = dsa_create_in_place(ctl->raw_dsa_area, pgstat_dsa_init_size(), LWTRANCHE_PGSTATS_DSA, NULL); @@ -242,7 +242,8 @@ StatsShmemInit(void *arg) int idx = kind - PGSTAT_KIND_CUSTOM_MIN; Assert(kind_info->shared_size != 0); - ctl->custom_data[idx] = ShmemAlloc(kind_info->shared_size); + ctl->custom_data[idx] = p; + p += MAXALIGN(kind_info->shared_size); ptr = ctl->custom_data[idx]; } diff --git a/src/backend/utils/adt/mac.c b/src/backend/utils/adt/mac.c index f14675dea409f..923c5af54f8bf 100644 --- a/src/backend/utils/adt/mac.c +++ b/src/backend/utils/adt/mac.c @@ -14,11 +14,9 @@ #include "postgres.h" #include "common/hashfn.h" -#include "lib/hyperloglog.h" #include "libpq/pqformat.h" #include "port/pg_bswap.h" #include "utils/fmgrprotos.h" -#include "utils/guc.h" #include "utils/inet.h" #include "utils/sortsupport.h" @@ -33,15 +31,6 @@ #define lobits(addr) \ ((unsigned long)(((addr)->d<<16)|((addr)->e<<8)|((addr)->f))) -/* sortsupport for macaddr */ -typedef struct -{ - int64 input_count; /* number of non-null values seen */ - bool estimating; /* true if estimating cardinality */ - - hyperLogLogState abbr_card; /* cardinality estimator */ -} macaddr_sortsupport_state; - static int macaddr_cmp_internal(macaddr *a1, macaddr *a2); static int macaddr_fast_cmp(Datum x, Datum y, SortSupport ssup); static bool macaddr_abbrev_abort(int memtupcount, SortSupport ssup); @@ -369,24 +358,10 @@ macaddr_sortsupport(PG_FUNCTION_ARGS) if (ssup->abbreviate) { - macaddr_sortsupport_state *uss; - MemoryContext oldcontext; - - oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); - - uss = palloc_object(macaddr_sortsupport_state); - uss->input_count = 0; - uss->estimating = true; - initHyperLogLog(&uss->abbr_card, 10); - - ssup->ssup_extra = uss; - ssup->comparator = ssup_datum_unsigned_cmp; ssup->abbrev_converter = macaddr_abbrev_convert; ssup->abbrev_abort = macaddr_abbrev_abort; ssup->abbrev_full_comparator = macaddr_fast_cmp; - 
- MemoryContextSwitchTo(oldcontext); } PG_RETURN_VOID(); @@ -406,61 +381,13 @@ macaddr_fast_cmp(Datum x, Datum y, SortSupport ssup) } /* - * Callback for estimating effectiveness of abbreviated key optimization. - * - * We pay no attention to the cardinality of the non-abbreviated data, because - * there is no equality fast-path within authoritative macaddr comparator. + * Abbreviation is never aborted for macaddr because the 6-byte MAC address + * fits entirely within a 64-bit Datum, making the abbreviated key + * authoritative. */ static bool macaddr_abbrev_abort(int memtupcount, SortSupport ssup) { - macaddr_sortsupport_state *uss = ssup->ssup_extra; - double abbr_card; - - if (memtupcount < 10000 || uss->input_count < 10000 || !uss->estimating) - return false; - - abbr_card = estimateHyperLogLog(&uss->abbr_card); - - /* - * If we have >100k distinct values, then even if we were sorting many - * billion rows we'd likely still break even, and the penalty of undoing - * that many rows of abbrevs would probably not be worth it. At this point - * we stop counting because we know that we're now fully committed. - */ - if (abbr_card > 100000.0) - { - if (trace_sort) - elog(LOG, - "macaddr_abbrev: estimation ends at cardinality %f" - " after " INT64_FORMAT " values (%d rows)", - abbr_card, uss->input_count, memtupcount); - uss->estimating = false; - return false; - } - - /* - * Target minimum cardinality is 1 per ~2k of non-null inputs. 0.5 row - * fudge factor allows us to abort earlier on genuinely pathological data - * where we've had exactly one abbreviated value in the first 2k - * (non-null) rows. 
- */ - if (abbr_card < uss->input_count / 2000.0 + 0.5) - { - if (trace_sort) - elog(LOG, - "macaddr_abbrev: aborting abbreviation at cardinality %f" - " below threshold %f after " INT64_FORMAT " values (%d rows)", - abbr_card, uss->input_count / 2000.0 + 0.5, uss->input_count, - memtupcount); - return true; - } - - if (trace_sort) - elog(LOG, - "macaddr_abbrev: cardinality %f after " INT64_FORMAT - " values (%d rows)", abbr_card, uss->input_count, memtupcount); - return false; } @@ -469,14 +396,13 @@ macaddr_abbrev_abort(int memtupcount, SortSupport ssup) * to abbreviated key representation. * * Packs the bytes of a 6-byte MAC address into a Datum and treats it as an - * unsigned integer for purposes of comparison. On a 64-bit machine, there - * will be two zeroed bytes of padding. The integer is converted to native - * endianness to facilitate easy comparison. + * unsigned integer for purposes of comparison. There will be two zeroed bytes + * of padding. The integer is converted to native endianness to facilitate + * easy comparison. */ static Datum macaddr_abbrev_convert(Datum original, SortSupport ssup) { - macaddr_sortsupport_state *uss = ssup->ssup_extra; macaddr *authoritative = DatumGetMacaddrP(original); Datum res; @@ -489,21 +415,6 @@ macaddr_abbrev_convert(Datum original, SortSupport ssup) "Datum is too small for macaddr"); memset(&res, 0, sizeof(res)); memcpy(&res, authoritative, sizeof(macaddr)); - uss->input_count += 1; - - /* - * Cardinality estimation. The estimate uses uint32, so XOR the two 32-bit - * halves together to produce slightly more entropy. The two zeroed bytes - * won't have any practical impact on this operation. - */ - if (uss->estimating) - { - uint32 tmp; - - tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32); - - addHyperLogLog(&uss->abbr_card, DatumGetUInt32(hash_uint32(tmp))); - } /* * Byteswap on little-endian machines. 
diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index fcb6ab8058309..31e5b85dc4f34 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -3052,6 +3052,17 @@ assign_hook => 'assign_timezone_abbreviations', }, +{ name => 'timing_clock_source', type => 'enum', context => 'PGC_SUSET', group => 'RESOURCES_TIME', + short_desc => 'Controls the clock source used for collecting timing measurements.', + long_desc => 'This enables the use of specialized clock sources, specifically the RDTSC clock source on x86-64 systems (if available), to support timing measurements with lower overhead during EXPLAIN and other instrumentation.', + variable => 'timing_clock_source', + boot_val => 'TIMING_CLOCK_SOURCE_AUTO', + options => 'timing_clock_source_options', + check_hook => 'check_timing_clock_source', + assign_hook => 'assign_timing_clock_source', + show_hook => 'show_timing_clock_source', +}, + { name => 'trace_connection_negotiation', type => 'bool', context => 'PGC_POSTMASTER', group => 'DEVELOPER_OPTIONS', short_desc => 'Logs details of pre-authentication connection handshake.', flags => 'GUC_NOT_IN_SAMPLE', diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index d9ca13baff97d..9f9d8d17be917 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -92,6 +92,7 @@ #include "tcop/tcopprot.h" #include "tsearch/ts_cache.h" #include "utils/builtins.h" +#include "portability/instr_time.h" #include "utils/bytea.h" #include "utils/float.h" #include "utils/guc_hooks.h" @@ -373,6 +374,15 @@ static const struct config_enum_entry huge_pages_options[] = { {NULL, 0, false} }; +static const struct config_enum_entry timing_clock_source_options[] = { + {"auto", TIMING_CLOCK_SOURCE_AUTO, false}, + {"system", TIMING_CLOCK_SOURCE_SYSTEM, false}, +#if PG_INSTR_TSC_CLOCK + {"tsc", TIMING_CLOCK_SOURCE_TSC, false}, +#endif + 
{NULL, 0, false} +}; + static const struct config_enum_entry huge_pages_status_options[] = { {"off", HUGE_PAGES_OFF, false}, {"on", HUGE_PAGES_ON, false}, @@ -731,6 +741,7 @@ const char *const config_group_names[] = [CONN_AUTH_TCP] = gettext_noop("Connections and Authentication / TCP Settings"), [CONN_AUTH_AUTH] = gettext_noop("Connections and Authentication / Authentication"), [CONN_AUTH_SSL] = gettext_noop("Connections and Authentication / SSL"), + [RESOURCES_TIME] = gettext_noop("Resource Usage / Time"), [RESOURCES_MEM] = gettext_noop("Resource Usage / Memory"), [RESOURCES_DISK] = gettext_noop("Resource Usage / Disk"), [RESOURCES_KERNEL] = gettext_noop("Resource Usage / Kernel Resources"), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index e3e462f3efb90..5fc7323440ab7 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -196,6 +196,10 @@ #max_files_per_process = 1000 # min 64 # (change requires restart) +# - Time - + +#timing_clock_source = auto # auto, system, tsc (if supported) + # - Background Writer - #bgwriter_delay = 200ms # 10-10000ms between rounds diff --git a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c index aee41dbe3f9b7..84388b74aceaa 100644 --- a/src/bin/pg_test_timing/pg_test_timing.c +++ b/src/bin/pg_test_timing/pg_test_timing.c @@ -30,22 +30,29 @@ static long long int largest_diff_count; static void handle_args(int argc, char *argv[]); -static uint64 test_timing(unsigned int duration); +static void test_system_timing(void); +#if PG_INSTR_TSC_CLOCK +static void test_tsc_timing(void); +#endif +static uint64 test_timing(unsigned int duration, TimingClockSourceType source, bool fast_timing); static void output(uint64 loop_count); int main(int argc, char *argv[]) { - uint64 loop_count; - set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_test_timing")); progname = get_progname(argv[0]); 
handle_args(argc, argv); - loop_count = test_timing(test_duration); + /* initialize timing infrastructure (required for INSTR_* calls) */ + pg_initialize_timing(); - output(loop_count); + test_system_timing(); + +#if PG_INSTR_TSC_CLOCK + test_tsc_timing(); +#endif return 0; } @@ -143,20 +150,99 @@ handle_args(int argc, char *argv[]) exit(1); } - printf(ngettext("Testing timing overhead for %u second.\n", - "Testing timing overhead for %u seconds.\n", + printf(ngettext("Testing timing overhead for %u second.\n\n", + "Testing timing overhead for %u seconds.\n\n", test_duration), test_duration); } +/* + * This tests default (non-fast) timing code. A clock source for that is + * always available. Hence, we can unconditionally output the result. + */ +static void +test_system_timing(void) +{ + uint64 loop_count; + + loop_count = test_timing(test_duration, TIMING_CLOCK_SOURCE_SYSTEM, false); + output(loop_count); +} + +/* + * If on a supported architecture, test the TSC clock source. This clock + * source is not always available. In that case we print an informational + * message indicating as such. + * + * We first emit "slow" timings (RDTSCP on x86), which are used for higher + * precision measurements when the TSC clock source is enabled. We emit + * "fast" timings second (RDTSC on x86), which is used for faster timing + * measurements with lower precision. 
+ */ +#if PG_INSTR_TSC_CLOCK +static void +test_tsc_timing(void) +{ + uint64 loop_count; + uint32 calibrated_freq; + + printf("\n"); + loop_count = test_timing(test_duration, TIMING_CLOCK_SOURCE_TSC, false); + if (loop_count > 0) + { + output(loop_count); + printf("\n"); + + /* Now, emit fast timing measurements */ + loop_count = test_timing(test_duration, TIMING_CLOCK_SOURCE_TSC, true); + output(loop_count); + printf("\n"); + + printf(_("TSC frequency in use: %u kHz\n"), timing_tsc_frequency_khz); + + calibrated_freq = pg_tsc_calibrate_frequency(); + if (calibrated_freq > 0) + printf(_("TSC frequency from calibration: %u kHz\n"), calibrated_freq); + else + printf(_("TSC calibration did not converge\n")); + + pg_set_timing_clock_source(TIMING_CLOCK_SOURCE_AUTO); + if (pg_current_timing_clock_source() == TIMING_CLOCK_SOURCE_TSC) + printf(_("TSC clock source will be used by default, unless timing_clock_source is set to 'system'.\n")); + else + printf(_("TSC clock source will not be used by default, unless timing_clock_source is set to 'tsc'.\n")); + } + else + printf(_("TSC clock source is not usable. Likely unable to determine TSC frequency. are you running in an unsupported virtualized environment?\n")); +} +#endif + static uint64 -test_timing(unsigned int duration) +test_timing(unsigned int duration, TimingClockSourceType source, bool fast_timing) { uint64 loop_count = 0; instr_time start_time, end_time, prev, cur; + char *time_source = NULL; + + if (!pg_set_timing_clock_source(source)) + return 0; + + time_source = PG_INSTR_SYSTEM_CLOCK_NAME; + +#if PG_INSTR_TSC_CLOCK + if (pg_current_timing_clock_source() == TIMING_CLOCK_SOURCE_TSC) + time_source = fast_timing ? 
PG_INSTR_TSC_CLOCK_NAME_FAST : PG_INSTR_TSC_CLOCK_NAME; +#endif + + if (fast_timing) + printf(_("Fast clock source: %s\n"), time_source); + else if (source == TIMING_CLOCK_SOURCE_SYSTEM) + printf(_("System clock source: %s\n"), time_source); + else + printf(_("Clock source: %s\n"), time_source); /* * Pre-zero the statistics data structures. They're already zero by @@ -181,7 +267,11 @@ test_timing(unsigned int duration) instr_time diff_time; prev = cur; - INSTR_TIME_SET_CURRENT(cur); + + if (fast_timing) + INSTR_TIME_SET_CURRENT_FAST(cur); + else + INSTR_TIME_SET_CURRENT(cur); diff_time = cur; INSTR_TIME_SUBTRACT(diff_time, prev); diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c index 1dae918cc09d2..c969afab3a595 100644 --- a/src/bin/pgbench/pgbench.c +++ b/src/bin/pgbench/pgbench.c @@ -6820,6 +6820,9 @@ main(int argc, char **argv) int exit_code = 0; struct timeval tv; + /* initialize timing infrastructure (required for INSTR_* calls) */ + pg_initialize_timing(); + /* * Record difference between Unix time and instr_time time. We'll use * this for logging and aggregation. 
diff --git a/src/bin/psql/startup.c b/src/bin/psql/startup.c index 9a397ec87b736..69d044d405d5b 100644 --- a/src/bin/psql/startup.c +++ b/src/bin/psql/startup.c @@ -24,6 +24,7 @@ #include "help.h" #include "input.h" #include "mainloop.h" +#include "portability/instr_time.h" #include "settings.h" /* @@ -327,6 +328,9 @@ main(int argc, char *argv[]) PQsetNoticeProcessor(pset.db, NoticeProcessor, NULL); + /* initialize timing infrastructure (required for INSTR_* calls) */ + pg_initialize_timing(); + SyncVariables(); if (options.list_dbs) diff --git a/src/common/Makefile b/src/common/Makefile index 2c720caa50972..1a2fbbe887f22 100644 --- a/src/common/Makefile +++ b/src/common/Makefile @@ -59,6 +59,7 @@ OBJS_COMMON = \ file_perm.o \ file_utils.o \ hashfn.o \ + instr_time.o \ ip.o \ jsonapi.o \ keywords.o \ diff --git a/src/common/instr_time.c b/src/common/instr_time.c new file mode 100644 index 0000000000000..14ab4579d37b8 --- /dev/null +++ b/src/common/instr_time.c @@ -0,0 +1,438 @@ +/*------------------------------------------------------------------------- + * + * instr_time.c + * Non-inline parts of the portable high-precision interval timing + * implementation + * + * Portions Copyright (c) 2026, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/common/instr_time.c + * + *------------------------------------------------------------------------- + */ +#ifndef FRONTEND +#include "postgres.h" +#else +#include "postgres_fe.h" +#endif + +#include + +#if defined(__APPLE__) +#include +#endif + +#include "port/pg_cpu.h" +#include "portability/instr_time.h" + +/* + * Stores what the number of ticks needs to be multiplied with to end up + * with nanoseconds using integer math. 
+ * + * In certain cases (TSC on x86-64, and QueryPerformanceCounter on Windows) + * the ticks to nanoseconds conversion requires floating point math because: + * + * sec = ticks / frequency_hz + * ns = ticks / frequency_hz * 1,000,000,000 + * ns = ticks * (1,000,000,000 / frequency_hz) + * ns = ticks * (1,000,000 / frequency_khz) <-- now in kilohertz + * + * Here, 'ns' is usually a floating point number. For example for a 2.5 GHz CPU + * the scaling factor becomes 1,000,000 / 2,500,000 = 0.4. + * + * To be able to use integer math we work around the lack of precision. We + * first scale the integer up (left shift by TICKS_TO_NS_SHIFT) and after the + * multiplication by the number of ticks in pg_ticks_to_ns() we shift right by + * the same amount. + * + * We remember the maximum number of ticks that can be multiplied by the scale + * factor without overflowing so we can check via a * b > max <=> a > max / b. + * + * However, as this is meant for interval measurements, it is unlikely that the + * overflow path is actually taken in typical scenarios, since overflows would + * only occur for intervals longer than 6.5 days. + * + * Note we utilize unsigned integers even though ticks are stored as a signed + * value to encourage compilers to generate better assembly, since we can be + * sure these values are not negative. + * + * In all other cases we are using clock_gettime(), which uses nanoseconds + * as ticks. Hence, we set the multiplier to zero, which causes pg_ticks_to_ns + * to return the original value. 
+ */ +uint64 ticks_per_ns_scaled = 0; +uint64 max_ticks_no_overflow = 0; +bool timing_initialized = false; +int timing_clock_source = TIMING_CLOCK_SOURCE_AUTO; + +bool timing_tsc_enabled = false; +int32 timing_tsc_frequency_khz = -1; + +static void set_ticks_per_ns(void); +static void set_ticks_per_ns_system(void); + +#if PG_INSTR_TSC_CLOCK +static bool tsc_use_by_default(void); +static void set_ticks_per_ns_for_tsc(void); +#endif + +/* + * Initializes timing infrastructure. Must be called before making any use + * of INSTR* macros. + */ +void +pg_initialize_timing(void) +{ + if (timing_initialized) + return; + + set_ticks_per_ns_system(); + timing_initialized = true; +} + +bool +pg_set_timing_clock_source(TimingClockSourceType source) +{ + Assert(timing_initialized); + +#if PG_INSTR_TSC_CLOCK + pg_initialize_timing_tsc(); + + switch (source) + { + case TIMING_CLOCK_SOURCE_AUTO: + timing_tsc_enabled = (timing_tsc_frequency_khz > 0) && tsc_use_by_default(); + break; + case TIMING_CLOCK_SOURCE_SYSTEM: + timing_tsc_enabled = false; + break; + case TIMING_CLOCK_SOURCE_TSC: + /* Tell caller TSC is not usable */ + if (timing_tsc_frequency_khz <= 0) + return false; + timing_tsc_enabled = true; + break; + } +#endif + + set_ticks_per_ns(); + timing_clock_source = source; + return true; +} + +static void +set_ticks_per_ns(void) +{ +#if PG_INSTR_TSC_CLOCK + if (timing_tsc_enabled) + { + set_ticks_per_ns_for_tsc(); + return; + } +#endif + set_ticks_per_ns_system(); +} + +#ifndef WIN32 + +static void +set_ticks_per_ns_system(void) +{ + ticks_per_ns_scaled = 0; + max_ticks_no_overflow = 0; +} + +#else /* WIN32 */ + +/* GetTimerFrequency returns counts per second */ +static inline double +GetTimerFrequency(void) +{ + LARGE_INTEGER f; + + QueryPerformanceFrequency(&f); + return (double) f.QuadPart; +} + +static void +set_ticks_per_ns_system(void) +{ + ticks_per_ns_scaled = (NS_PER_S << TICKS_TO_NS_SHIFT) / GetTimerFrequency(); + max_ticks_no_overflow = PG_INT64_MAX / 
ticks_per_ns_scaled; +} + +#endif /* WIN32 */ + +/* Hardware clock specific logic (x86 TSC / AArch64 CNTVCT) */ + +#if PG_INSTR_TSC_CLOCK + +static void tsc_detect_frequency(void); + +/* + * Initialize the TSC clock source by determining its usability and frequency. + * + * This can be called multiple times without causing repeated work, as + * timing_tsc_frequency_khz will be set to 0 if a prior call determined the + * TSC is not usable. On EXEC_BACKEND (Windows), the TSC frequency may also be + * set by restore_backend_variables. + */ +void +pg_initialize_timing_tsc(void) +{ + if (timing_tsc_frequency_khz < 0) + tsc_detect_frequency(); +} + +static void +set_ticks_per_ns_for_tsc(void) +{ + ticks_per_ns_scaled = ((NS_PER_S / 1000) << TICKS_TO_NS_SHIFT) / timing_tsc_frequency_khz; + max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled; +} + +#if defined(__x86_64__) || defined(_M_X64) + +/* + * x86-64 TSC specific logic + */ + +/* + * Detect the TSC frequency and whether RDTSCP is available on x86-64. + * + * This can't be reliably determined at compile time, since the + * availability of an "invariant" TSC (that is not affected by CPU + * frequency changes) is dependent on the CPU architecture. Additionally, + * there are cases where TSC availability is impacted by virtualization, + * where a simple cpuid feature check would not be enough. + */ +static void +tsc_detect_frequency(void) +{ + timing_tsc_frequency_khz = 0; + + /* We require RDTSCP support and an invariant TSC, bail if not available */ + if (!x86_feature_available(PG_RDTSCP) || !x86_feature_available(PG_TSC_INVARIANT)) + return; + + /* Determine speed at which the TSC advances */ + timing_tsc_frequency_khz = x86_tsc_frequency_khz(); + if (timing_tsc_frequency_khz > 0) + return; + + /* + * CPUID did not give us the TSC frequency. We can instead measure the + * frequency by comparing ticks against walltime in a calibration loop. 
+ */ + timing_tsc_frequency_khz = pg_tsc_calibrate_frequency(); +} + +/* + * Decides whether to use the TSC clock source if the user did not specify it + * one way or the other, and it is available (checked separately). + * + * Inspired by the Linux kernel's clocksource watchdog disable logic as updated + * in 2021 to reflect the reliability of the TSC on Intel platforms, see + * check_system_tsc_reliable() in arch/x86/kernel/tsc.c, as well as discussion + * in https://lore.kernel.org/lkml/87eekfk8bd.fsf@nanos.tec.linutronix.de/ + * and https://lore.kernel.org/lkml/87a6pimt1f.ffs@nanos.tec.linutronix.de/ + * for reference. + * + * When tsc_detect_frequency determines the TSC is viable (invariant, etc.), and + * we're on an Intel platform (determined via TSC_ADJUST), we consider the TSC + * trustworthy by default, matching the Linux kernel. + * + * On other CPU platforms (e.g. AMD), or in some virtual machines, we don't have + * an easy way to determine the TSC's reliability. If on Linux, we can check if + * TSC is the active clocksource, based on it having run the watchdog logic to + * monitor TSC correctness. For other platforms the user must explicitly enable + * it via GUC instead. + */ +static bool +tsc_use_by_default(void) +{ + if (x86_feature_available(PG_TSC_ADJUST)) + return true; + +#if defined(__linux__) + { + FILE *fp; + char buf[128]; + + fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r"); + if (fp) + { + bool is_tsc = (fgets(buf, sizeof(buf), fp) != NULL && + strcmp(buf, "tsc\n") == 0); + + fclose(fp); + if (is_tsc) + return true; + } + } +#endif + + return false; +} + +/* + * Calibrate the TSC frequency by comparing TSC ticks against walltime. + * + * Takes initial TSC and system clock snapshots, then loops, recomputing the + * frequency each TSC_CALIBRATION_SKIPS iterations from cumulative TSC + * ticks divided by elapsed time. 
+ * + * Once the frequency estimate stabilizes (consecutive iterations agree), we + * consider it converged and the frequency in KHz is returned. If either too + * many iterations or a time limit passes without convergence, 0 is returned. + */ +#define TSC_CALIBRATION_MAX_NS (50 * NS_PER_MS) +#define TSC_CALIBRATION_ITERATIONS 1000000 +#define TSC_CALIBRATION_SKIPS 100 +#define TSC_CALIBRATION_STABLE_CYCLES 10 + +uint32 +pg_tsc_calibrate_frequency(void) +{ + instr_time initial_wall; + int64 initial_tsc; + double freq_khz = 0; + double prev_freq_khz = 0; + int stable_count = 0; + int64 prev_tsc; + int saved_clock_source = timing_clock_source; + + /* + * Frequency must be initialized to avoid recursion via + * pg_set_timing_clock_source + */ + Assert(timing_tsc_frequency_khz >= 0); + + /* Ensure INSTR_* calls below work on system time */ + pg_set_timing_clock_source(TIMING_CLOCK_SOURCE_SYSTEM); + + INSTR_TIME_SET_CURRENT(initial_wall); + + initial_tsc = pg_rdtscp(); + prev_tsc = initial_tsc; + + for (int i = 0; i < TSC_CALIBRATION_ITERATIONS; i++) + { + instr_time now_wall; + int64 now_tsc; + int64 elapsed_ns; + int64 elapsed_ticks; + + INSTR_TIME_SET_CURRENT(now_wall); + + now_tsc = pg_rdtscp(); + + INSTR_TIME_SUBTRACT(now_wall, initial_wall); + elapsed_ns = INSTR_TIME_GET_NANOSEC(now_wall); + + /* Safety: bail out if we've taken too long */ + if (elapsed_ns >= TSC_CALIBRATION_MAX_NS) + break; + + elapsed_ticks = now_tsc - initial_tsc; + + /* + * Skip if TSC hasn't advanced, or we walked backwards for some + * reason. + */ + if (now_tsc == prev_tsc || elapsed_ns <= 0 || elapsed_ticks <= 0) + continue; + + /* + * We only measure frequency every TSC_CALIBRATION_SKIPS to avoid + * stabilizing based on just a handful of RDTSC instructions. + */ + if (i % TSC_CALIBRATION_SKIPS != 0) + continue; + + freq_khz = ((double) elapsed_ticks / elapsed_ns) * 1000 * 1000; + + /* + * Once freq_khz / prev_freq_khz is small, check if it stays that way. 
+ * If it does for long enough, we've got a winner frequency. + */ + if (prev_freq_khz != 0 && fabs(1 - freq_khz / prev_freq_khz) < 0.0001) + { + stable_count++; + if (stable_count >= TSC_CALIBRATION_STABLE_CYCLES) + break; + } + else + stable_count = 0; + + prev_tsc = now_tsc; + prev_freq_khz = freq_khz; + } + + /* Restore the previous clock source */ + pg_set_timing_clock_source(saved_clock_source); + + if (stable_count < TSC_CALIBRATION_STABLE_CYCLES) + return 0; /* did not converge */ + + return (uint32) freq_khz; +} + +#elif defined(__aarch64__) + +/* + * Check whether this is a heterogeneous Apple Silicon P+E core system + * where CNTVCT_EL0 may tick at different rates on different core types. + */ +static bool +aarch64_has_heterogeneous_cores(void) +{ +#if defined(__APPLE__) + int nperflevels = 0; + size_t len = sizeof(nperflevels); + + if (sysctlbyname("hw.nperflevels", &nperflevels, &len, NULL, 0) == 0) + return nperflevels > 1; +#endif + + return false; +} + +/* + * Detect the generic timer frequency on AArch64. + */ +static void +tsc_detect_frequency(void) +{ + if (aarch64_has_heterogeneous_cores()) + { + timing_tsc_frequency_khz = 0; + return; + } + + timing_tsc_frequency_khz = aarch64_cntvct_frequency_khz(); +} + +/* + * The ARM generic timer is architecturally guaranteed to be monotonic and + * synchronized across cores of the same type, so we always use it by default + * when available and cores are homogenous. 
+ */ +static bool +tsc_use_by_default(void) +{ + return true; +} + +uint32 +pg_tsc_calibrate_frequency(void) +{ + /* No calibration loop on AArch64; frequency comes from CNTFRQ_EL0 */ + return 0; +} + +#endif /* defined(__aarch64__) */ + +#endif /* PG_INSTR_TSC_CLOCK */ diff --git a/src/common/meson.build b/src/common/meson.build index 4f9b8b8263d55..9bd55cda95b10 100644 --- a/src/common/meson.build +++ b/src/common/meson.build @@ -13,6 +13,7 @@ common_sources = files( 'file_perm.c', 'file_utils.c', 'hashfn.c', + 'instr_time.c', 'ip.c', 'jsonapi.c', 'keywords.c', diff --git a/src/include/nodes/queryjumble.h b/src/include/nodes/queryjumble.h index 9f81893003c24..f331449ba78f6 100644 --- a/src/include/nodes/queryjumble.h +++ b/src/include/nodes/queryjumble.h @@ -91,6 +91,9 @@ extern PGDLLIMPORT int compute_query_id; extern const char *CleanQuerytext(const char *query, int *location, int *len); +extern LocationLen *ComputeConstantLengths(const JumbleState *jstate, + const char *query, + int query_loc); extern JumbleState *JumbleQuery(Query *query); extern void EnableQueryId(void); diff --git a/src/include/parser/analyze.h b/src/include/parser/analyze.h index 92c1c502945ee..9da833e40e5ae 100644 --- a/src/include/parser/analyze.h +++ b/src/include/parser/analyze.h @@ -21,7 +21,7 @@ /* Hook for plugins to get control at end of parse analysis */ typedef void (*post_parse_analyze_hook_type) (ParseState *pstate, Query *query, - JumbleState *jstate); + const JumbleState *jstate); extern PGDLLIMPORT post_parse_analyze_hook_type post_parse_analyze_hook; diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h index c5d96bb4f479f..aee501a4ecdc4 100644 --- a/src/include/port/pg_cpu.h +++ b/src/include/port/pg_cpu.h @@ -32,8 +32,16 @@ typedef enum X86FeatureId PG_AVX512_VL, PG_AVX512_VPCLMULQDQ, PG_AVX512_VPOPCNTDQ, + + /* identification */ + PG_HYPERVISOR, + + /* Time-Stamp Counter (TSC) flags */ + PG_RDTSCP, + PG_TSC_INVARIANT, + PG_TSC_ADJUST, } X86FeatureId; 
-#define X86FeaturesSize (PG_AVX512_VPOPCNTDQ + 1) +#define X86FeaturesSize (PG_TSC_ADJUST + 1) extern PGDLLIMPORT bool X86Features[]; @@ -48,6 +56,14 @@ x86_feature_available(X86FeatureId feature) return X86Features[feature]; } +extern uint32 x86_tsc_frequency_khz(void); + #endif /* defined(USE_SSE2) || defined(__i386__) */ +#if defined(__aarch64__) + +extern uint32 aarch64_cntvct_frequency_khz(void); + +#endif /* defined(__aarch64__) */ + #endif /* PG_CPU_H */ diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h index 0a1fff7c487ae..dfebdfbf461c1 100644 --- a/src/include/portability/instr_time.h +++ b/src/include/portability/instr_time.h @@ -4,9 +4,11 @@ * portable high-precision interval timing * * This file provides an abstraction layer to hide portability issues in - * interval timing. On Unix we use clock_gettime(), and on Windows we use - * QueryPerformanceCounter(). These macros also give some breathing room to - * use other high-precision-timing APIs. + * interval timing. On x86 we use the RDTSC/RDTSCP instruction, and on + * AArch64 the CNTVCT_EL0 generic timer, directly in certain cases, or + * alternatively clock_gettime() on Unix-like systems and + * QueryPerformanceCounter() on Windows. These macros also give some breathing + * room to use other high-precision-timing APIs. * * The basic data type is instr_time, which all callers should treat as an * opaque typedef. 
instr_time can store either an absolute time (of @@ -17,7 +19,11 @@ * * INSTR_TIME_SET_ZERO(t) set t to zero (memset is acceptable too) * - * INSTR_TIME_SET_CURRENT(t) set t to current time + * INSTR_TIME_SET_CURRENT_FAST(t) set t to current time without waiting + * for instructions in out-of-order window + * + * INSTR_TIME_SET_CURRENT(t) set t to current time while waiting for + * instructions in OOO to retire * * * INSTR_TIME_ADD(x, y) x += y @@ -80,11 +86,108 @@ typedef struct instr_time #define NS_PER_MS INT64CONST(1000000) #define NS_PER_US INT64CONST(1000) +/* Shift amount for fixed-point ticks-to-nanoseconds conversion. */ +#define TICKS_TO_NS_SHIFT 14 -#ifndef WIN32 +/* + * PG_INSTR_TICKS_TO_NS controls whether pg_ticks_to_ns/pg_ns_to_ticks needs to + * check ticks_per_ns_scaled and potentially convert ticks <=> nanoseconds. + * + * PG_INSTR_TSC_CLOCK controls whether the TSC clock source is compiled in, and + * potentially used based on timing_tsc_enabled. + */ +#if defined(__x86_64__) || defined(_M_X64) || (defined(__aarch64__) && !defined(_MSC_VER)) +#define PG_INSTR_TICKS_TO_NS 1 +#define PG_INSTR_TSC_CLOCK 1 +#elif defined(WIN32) +#define PG_INSTR_TICKS_TO_NS 1 +#define PG_INSTR_TSC_CLOCK 0 +#else +#define PG_INSTR_TICKS_TO_NS 0 +#define PG_INSTR_TSC_CLOCK 0 +#endif + +/* + * Variables used to translate ticks to nanoseconds, initialized by + * pg_initialize_timing and adjusted by pg_set_timing_clock_source calls or + * changes of the "timing_clock_source" GUC. + * + * Note that changing these values after setting an instr_time and before + * reading/converting it will lead to incorrect results. This is technically + * possible because the GUC can be changed at runtime, but unlikely, and we + * allow changing this at runtime to simplify testing of different sources.
+ */ +extern PGDLLIMPORT uint64 ticks_per_ns_scaled; +extern PGDLLIMPORT uint64 max_ticks_no_overflow; +extern PGDLLIMPORT bool timing_initialized; + +typedef enum +{ + TIMING_CLOCK_SOURCE_AUTO, + TIMING_CLOCK_SOURCE_SYSTEM, +#if PG_INSTR_TSC_CLOCK + TIMING_CLOCK_SOURCE_TSC +#endif +} TimingClockSourceType; + +extern int timing_clock_source; + +/* + * Initialize timing infrastructure + * + * This must be called at least once before using INSTR_TIME_SET_CURRENT* + * macros. + * + * If you want to use the TSC clock source in a client program you must also + * call pg_set_timing_clock_source afterwards. + */ +extern void pg_initialize_timing(void); + +/* + * Sets the time source to be used. Mainly intended for frontend programs; + * the backend should set it via the timing_clock_source GUC instead. + * + * Returns false if the clock source could not be set, for example when TSC + * is not available despite being explicitly set. + */ +extern bool pg_set_timing_clock_source(TimingClockSourceType source); + +/* Whether to actually use TSC based on availability and GUC settings. */ +extern PGDLLIMPORT bool timing_tsc_enabled; + +/* + * TSC frequency in kHz, set during initialization. + * + * -1 = not yet initialized, 0 = TSC not usable, >0 = frequency in kHz. + */ +extern PGDLLIMPORT int32 timing_tsc_frequency_khz; + +#if PG_INSTR_TSC_CLOCK +extern void pg_initialize_timing_tsc(void); -/* Use clock_gettime() */ +extern uint32 pg_tsc_calibrate_frequency(void); + +#endif /* PG_INSTR_TSC_CLOCK */ + +/* + * Returns the current timing clock source effectively in use, resolving + * TIMING_CLOCK_SOURCE_AUTO to either TIMING_CLOCK_SOURCE_SYSTEM or + * TIMING_CLOCK_SOURCE_TSC.
+ */ +static inline TimingClockSourceType +pg_current_timing_clock_source(void) +{ +#if PG_INSTR_TSC_CLOCK + if (timing_tsc_enabled) + return TIMING_CLOCK_SOURCE_TSC; +#endif + return TIMING_CLOCK_SOURCE_SYSTEM; +} + +#ifndef WIN32 + +/* On POSIX, use clock_gettime() for system clock source */ #include @@ -99,76 +202,258 @@ typedef struct instr_time * than CLOCK_MONOTONIC. In particular, as of macOS 10.12, Apple provides * CLOCK_MONOTONIC_RAW which is both faster to read and higher resolution than * their version of CLOCK_MONOTONIC. + * + * Note this does not get used in case the TSC clock source logic is used, + * which directly calls architecture specific timing instructions (e.g. RDTSC). */ #if defined(__darwin__) && defined(CLOCK_MONOTONIC_RAW) -#define PG_INSTR_CLOCK CLOCK_MONOTONIC_RAW +#define PG_INSTR_SYSTEM_CLOCK CLOCK_MONOTONIC_RAW +#define PG_INSTR_SYSTEM_CLOCK_NAME "clock_gettime (CLOCK_MONOTONIC_RAW)" #elif defined(CLOCK_MONOTONIC) -#define PG_INSTR_CLOCK CLOCK_MONOTONIC +#define PG_INSTR_SYSTEM_CLOCK CLOCK_MONOTONIC +#define PG_INSTR_SYSTEM_CLOCK_NAME "clock_gettime (CLOCK_MONOTONIC)" #else -#define PG_INSTR_CLOCK CLOCK_REALTIME +#define PG_INSTR_SYSTEM_CLOCK CLOCK_REALTIME +#define PG_INSTR_SYSTEM_CLOCK_NAME "clock_gettime (CLOCK_REALTIME)" #endif -/* helper for INSTR_TIME_SET_CURRENT */ static inline instr_time -pg_clock_gettime_ns(void) +pg_get_ticks_system(void) { instr_time now; struct timespec tmp; - clock_gettime(PG_INSTR_CLOCK, &tmp); + Assert(timing_initialized); + + clock_gettime(PG_INSTR_SYSTEM_CLOCK, &tmp); now.ticks = tmp.tv_sec * NS_PER_S + tmp.tv_nsec; return now; } -#define INSTR_TIME_SET_CURRENT(t) \ - ((t) = pg_clock_gettime_ns()) - -#define INSTR_TIME_GET_NANOSEC(t) \ - ((int64) (t).ticks) - -#define INSTR_TIME_ADD_NANOSEC(t, n) \ - ((t).ticks += (n)) - - #else /* WIN32 */ +/* On Windows, use QueryPerformanceCounter() for system clock source */ -/* Use QueryPerformanceCounter() */ - -/* helper for INSTR_TIME_SET_CURRENT */ +#define 
PG_INSTR_SYSTEM_CLOCK_NAME "QueryPerformanceCounter" static inline instr_time -pg_query_performance_counter(void) +pg_get_ticks_system(void) { instr_time now; LARGE_INTEGER tmp; + Assert(timing_initialized); + QueryPerformanceCounter(&tmp); now.ticks = tmp.QuadPart; return now; } -static inline double -GetTimerFrequency(void) +#endif /* WIN32 */ + +static inline int64 +pg_ticks_to_ns(int64 ticks) { - LARGE_INTEGER f; +#if PG_INSTR_TICKS_TO_NS + int64 ns = 0; + + Assert(timing_initialized); + + /* + * Avoid doing work if we don't use scaled ticks, e.g. system clock on + * Unix (in that case ticks is counted in nanoseconds) + */ + if (ticks_per_ns_scaled == 0) + return ticks; + + /* + * Would multiplication overflow? If so perform computation in two parts. + */ + if (unlikely(ticks > (int64) max_ticks_no_overflow)) + { + /* + * To avoid overflow, first scale total ticks down by the fixed + * factor, and *afterwards* multiply them by the frequency-based scale + * factor. + * + * The remaining ticks can follow the regular formula, since they + * won't overflow. + */ + int64 count = ticks >> TICKS_TO_NS_SHIFT; + + ns = count * ticks_per_ns_scaled; + ticks -= (count << TICKS_TO_NS_SHIFT); + } + + ns += (ticks * ticks_per_ns_scaled) >> TICKS_TO_NS_SHIFT; + + return ns; +#else + Assert(timing_initialized); - QueryPerformanceFrequency(&f); - return (double) f.QuadPart; + return ticks; +#endif /* PG_INSTR_TICKS_TO_NS */ } -#define INSTR_TIME_SET_CURRENT(t) \ - ((t) = pg_query_performance_counter()) +static inline int64 +pg_ns_to_ticks(int64 ns) +{ +#if PG_INSTR_TICKS_TO_NS + int64 ticks = 0; -#define INSTR_TIME_GET_NANOSEC(t) \ - ((int64) ((t).ticks * ((double) NS_PER_S / GetTimerFrequency()))) + Assert(timing_initialized); -#define INSTR_TIME_ADD_NANOSEC(t, n) \ - ((t).ticks += ((n) / ((double) NS_PER_S / GetTimerFrequency()))) + /* + * If ticks_per_ns_scaled is zero, ticks are already in nanoseconds (e.g. + * system clock on Unix). 
+ */ + if (ticks_per_ns_scaled == 0) + return ns; -#endif /* WIN32 */ + /* + * The reverse of pg_ticks_to_ns to avoid a similar overflow problem. + */ + if (unlikely(ns > (INT64_MAX >> TICKS_TO_NS_SHIFT))) + { + int64 count = ns / ticks_per_ns_scaled; + ticks = count << TICKS_TO_NS_SHIFT; + ns -= count * ticks_per_ns_scaled; + } + + ticks += (ns << TICKS_TO_NS_SHIFT) / ticks_per_ns_scaled; + + return ticks; +#else + Assert(timing_initialized); + + return ns; +#endif /* PG_INSTR_TICKS_TO_NS */ +} + +#if PG_INSTR_TSC_CLOCK + +#if defined(__x86_64__) || defined(_M_X64) + +#define PG_INSTR_TSC_CLOCK_NAME_FAST "RDTSC" +#define PG_INSTR_TSC_CLOCK_NAME "RDTSCP" + +#ifdef _MSC_VER +#include +#endif /* defined(_MSC_VER) */ + +/* Helpers to abstract compiler differences for reading the x86 TSC. */ +static inline int64 +pg_rdtsc(void) +{ +#ifdef _MSC_VER + return __rdtsc(); +#else + return __builtin_ia32_rdtsc(); +#endif /* defined(_MSC_VER) */ +} + +static inline int64 +pg_rdtscp(void) +{ + uint32 unused; + +#ifdef _MSC_VER + return __rdtscp(&unused); +#else + return __builtin_ia32_rdtscp(&unused); +#endif /* defined(_MSC_VER) */ +} + +static pg_attribute_always_inline instr_time +pg_get_ticks_fast(void) +{ + if (likely(timing_tsc_enabled)) + { + instr_time now; + + now.ticks = pg_rdtsc(); + return now; + } + + return pg_get_ticks_system(); +} + +static pg_attribute_always_inline instr_time +pg_get_ticks(void) +{ + if (likely(timing_tsc_enabled)) + { + instr_time now; + + now.ticks = pg_rdtscp(); + return now; + } + + return pg_get_ticks_system(); +} + +#elif defined(__aarch64__) && !defined(_MSC_VER) + +#define PG_INSTR_TSC_CLOCK_NAME_FAST "CNTVCT_EL0" +#define PG_INSTR_TSC_CLOCK_NAME "CNTVCT_EL0 (ISB)" + +/* + * Read the ARM generic timer counter (CNTVCT_EL0). + * + * The "fast" variant reads the counter without a barrier, analogous to RDTSC + * on x86. 
The regular variant issues an ISB (Instruction Synchronization + * Barrier) first, which acts as a serializing instruction analogous to RDTSCP, + * ensuring all preceding instructions have completed before reading the + * counter. + */ +static pg_attribute_always_inline instr_time +pg_get_ticks_fast(void) +{ + if (likely(timing_tsc_enabled)) + { + instr_time now; + + now.ticks = __builtin_arm_rsr64("cntvct_el0"); + return now; + } + + return pg_get_ticks_system(); +} + +static pg_attribute_always_inline instr_time +pg_get_ticks(void) +{ + if (likely(timing_tsc_enabled)) + { + instr_time now; + + __builtin_arm_isb(0xf); + now.ticks = __builtin_arm_rsr64("cntvct_el0"); + return now; + } + + return pg_get_ticks_system(); +} + +#endif /* defined(__aarch64__) */ + +#else /* !PG_INSTR_TSC_CLOCK */ + +static pg_attribute_always_inline instr_time +pg_get_ticks_fast(void) +{ + return pg_get_ticks_system(); +} + +static pg_attribute_always_inline instr_time +pg_get_ticks(void) +{ + return pg_get_ticks_system(); +} + +#endif /* PG_INSTR_TSC_CLOCK */ /* * Common macros @@ -178,10 +463,19 @@ GetTimerFrequency(void) #define INSTR_TIME_SET_ZERO(t) ((t).ticks = 0) +#define INSTR_TIME_SET_CURRENT_FAST(t) \ + ((t) = pg_get_ticks_fast()) + +#define INSTR_TIME_SET_CURRENT(t) \ + ((t) = pg_get_ticks()) + #define INSTR_TIME_ADD(x,y) \ ((x).ticks += (y).ticks) +#define INSTR_TIME_ADD_NANOSEC(t, n) \ + ((t).ticks += pg_ns_to_ticks(n)) + #define INSTR_TIME_SUBTRACT(x,y) \ ((x).ticks -= (y).ticks) @@ -191,6 +485,9 @@ GetTimerFrequency(void) #define INSTR_TIME_GT(x,y) \ ((x).ticks > (y).ticks) +#define INSTR_TIME_GET_NANOSEC(t) \ + (pg_ticks_to_ns((t).ticks)) + #define INSTR_TIME_GET_DOUBLE(t) \ ((double) INSTR_TIME_GET_NANOSEC(t) / NS_PER_S) diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h index b01697c1f606d..307f4fbaefe08 100644 --- a/src/include/utils/guc_hooks.h +++ b/src/include/utils/guc_hooks.h @@ -163,6 +163,9 @@ extern const char *show_timezone(void); 
extern bool check_timezone_abbreviations(char **newval, void **extra, GucSource source); extern void assign_timezone_abbreviations(const char *newval, void *extra); +extern void assign_timing_clock_source(int newval, void *extra); +extern bool check_timing_clock_source(int *newval, void **extra, GucSource source); +extern const char *show_timing_clock_source(void); extern bool check_transaction_buffers(int *newval, void **extra, GucSource source); extern bool check_transaction_deferrable(bool *newval, void **extra, GucSource source); extern bool check_transaction_isolation(int *newval, void **extra, GucSource source); diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h index 71a8016196138..63440b8e36c83 100644 --- a/src/include/utils/guc_tables.h +++ b/src/include/utils/guc_tables.h @@ -60,6 +60,7 @@ enum config_group CONN_AUTH_TCP, CONN_AUTH_AUTH, CONN_AUTH_SSL, + RESOURCES_TIME, RESOURCES_MEM, RESOURCES_DISK, RESOURCES_KERNEL, diff --git a/src/port/meson.build b/src/port/meson.build index 922b3f646768d..d695f92b769e1 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -7,6 +7,7 @@ pgport_sources = [ 'noblock.c', 'path.c', 'pg_bitutils.c', + 'pg_cpu_arm.c', 'pg_cpu_x86.c', 'pg_getopt_ctx.c', 'pg_localeconv_r.c', diff --git a/src/port/pg_cpu_arm.c b/src/port/pg_cpu_arm.c new file mode 100644 index 0000000000000..2814a9477065d --- /dev/null +++ b/src/port/pg_cpu_arm.c @@ -0,0 +1,45 @@ +/*------------------------------------------------------------------------- + * + * pg_cpu_arm.c + * Runtime CPU feature detection for AArch64 + * + * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/port/pg_cpu_arm.c + * + *------------------------------------------------------------------------- + */ + +#include "c.h" + +#if defined(__aarch64__) && !defined(_MSC_VER) + +#include "port/pg_cpu.h" + +/* + * Return the 
frequency of the ARM generic timer (CNTVCT_EL0) in kHz. + * + * The CNTFRQ_EL0 system register is architecturally guaranteed to be readable + * from EL0 (userspace) and holds the timer frequency in Hz. The firmware sets + * this at boot and it does not change. + * + * Returns 0 if the frequency is not available (should not happen on conforming + * implementations). + */ +uint32 +aarch64_cntvct_frequency_khz(void) +{ + uint64 freq; + + freq = __builtin_arm_rsr64("cntfrq_el0"); + + if (freq == 0) + return 0; + + return (uint32) (freq / 1000); +} + +#endif /* defined(__aarch64__) */ diff --git a/src/port/pg_cpu_x86.c b/src/port/pg_cpu_x86.c index 40ff78633ca3f..8951e7a0811ce 100644 --- a/src/port/pg_cpu_x86.c +++ b/src/port/pg_cpu_x86.c @@ -80,13 +80,13 @@ pg_cpuid(int leaf, unsigned int *reg) static inline bool pg_cpuid_subleaf(int leaf, int subleaf, unsigned int *reg) { + memset(reg, 0, 4 * sizeof(unsigned int)); #if defined(HAVE__GET_CPUID_COUNT) return __get_cpuid_count(leaf, subleaf, ®[EAX], ®[EBX], ®[ECX], ®[EDX]) == 1; #elif defined(HAVE__CPUIDEX) __cpuidex((int *) reg, leaf, subleaf); return true; #else - memset(reg, 0, 4 * sizeof(unsigned int)); return false; #endif } @@ -101,19 +101,24 @@ void set_x86_features(void) { unsigned int reg[4] = {0}; + bool have_osxsave; pg_cpuid(0x01, reg); X86Features[PG_SSE4_2] = reg[ECX] >> 20 & 1; X86Features[PG_POPCNT] = reg[ECX] >> 23 & 1; + X86Features[PG_HYPERVISOR] = reg[ECX] >> 31 & 1; + have_osxsave = reg[ECX] >> 27 & 1; + + pg_cpuid_subleaf(0x07, 0, reg); + + X86Features[PG_TSC_ADJUST] = reg[EBX] >> 1 & 1; /* leaf 7 features that depend on OSXSAVE */ - if (reg[ECX] & (1 << 27)) + if (have_osxsave) { uint32 xcr0_val = 0; - pg_cpuid_subleaf(0x07, 0, reg); - #ifdef HAVE_XSAVE_INTRINSICS /* get value of Extended Control Register */ xcr0_val = _xgetbv(0); @@ -135,7 +140,126 @@ set_x86_features(void) } } + /* Check for other TSC related flags */ + pg_cpuid(0x80000001, reg); + X86Features[PG_RDTSCP] = reg[EDX] >> 27 & 1; + + 
pg_cpuid(0x80000007, reg); + X86Features[PG_TSC_INVARIANT] = reg[EDX] >> 8 & 1; + X86Features[INIT_PG_X86] = true; } +/* TSC (Time-stamp Counter) handling code */ + +static uint32 x86_hypervisor_tsc_frequency_khz(void); + +/* + * Determine the TSC frequency of the CPU through CPUID, where supported. + * + * Needed to interpret the tick value returned by RDTSC/RDTSCP. Return value of + * 0 indicates the frequency information was not accessible via CPUID. + */ +uint32 +x86_tsc_frequency_khz(void) +{ + unsigned int reg[4] = {0}; + + if (x86_feature_available(PG_HYPERVISOR)) + { + uint32 freq = x86_hypervisor_tsc_frequency_khz(); + + if (freq > 0) + return freq; + } + + /* + * On modern Intel CPUs, the TSC is implemented by invariant timekeeping + * hardware, also called "Always Running Timer", or ART. The ART stays + * consistent even if the CPU changes frequency due to changing power + * levels. + * + * As documented in "Determining the Processor Base Frequency" in the + * "Intel® 64 and IA-32 Architectures Software Developer's Manual", + * February 2026 Edition, we can get the TSC frequency as follows: + * + * Nominal TSC frequency = ( CPUID.15H:ECX[31:0] * CPUID.15H:EBX[31:0] ) / + * CPUID.15H:EAX[31:0] + * + * With CPUID.15H:ECX representing the nominal core crystal clock + * frequency, and EAX/EBX representing values used to translate the TSC + * value to that frequency, see Chapter 20.17 "Time-Stamp Counter" of + * that manual. + * + * Older Intel CPUs, and other vendors do not set CPUID.15H:ECX, and as + * such we fall back to alternate approaches. + */ + pg_cpuid(0x15, reg); + if (reg[ECX] > 0) + { + /* + * EBX not being set indicates invariant TSC is not available. Require + * EAX being non-zero too, to avoid a theoretical divide by zero.
+ */ + if (reg[EAX] == 0 || reg[EBX] == 0) + return 0; + + return reg[ECX] / 1000 * reg[EBX] / reg[EAX]; + } + + /* + * When CPUID.15H is not available/incomplete, we can instead try to get + * the processor base frequency in MHz from CPUID.16H:EAX, the "Processor + * Frequency Information Leaf". + */ + pg_cpuid(0x16, reg); + if (reg[EAX] > 0) + return reg[EAX] * 1000; + + return 0; +} + +/* + * Support for reading TSC frequency for hypervisors passing it to a guest VM. + * + * Two Hypervisors (VMware and KVM) are known to make TSC frequency in KHz + * available at the vendor-specific 0x40000010 leaf in the EAX register. + * + * For some other Hypervisors that have an invariant TSC, e.g. HyperV, we would + * need to access a model-specific register (MSR) to get the frequency. MSRs are + * separate from CPUID and typically not available for unprivileged processes, + * so we can't get the frequency this way. + */ +#define CPUID_HYPERVISOR_VMWARE(r) (r[EBX] == 0x61774d56 && r[ECX] == 0x4d566572 && r[EDX] == 0x65726177) /* VMwareVMware */ +#define CPUID_HYPERVISOR_KVM(r) (r[EBX] == 0x4b4d564b && r[ECX] == 0x564b4d56 && r[EDX] == 0x0000004d) /* KVMKVMKVM */ +static uint32 +x86_hypervisor_tsc_frequency_khz(void) +{ + unsigned int reg[4] = {0}; + +#if defined(HAVE__CPUIDEX) + + /* + * The hypervisor is determined using the 0x40000000 Hypervisor + * information leaf, which requires use of __cpuidex to set ECX to 0 to + * access it. + * + * The similar __get_cpuid_count function does not work as expected since + * it contains a check for __get_cpuid_max, which has been observed to be + * lower than the special Hypervisor leaf, despite it being available. 
+ */ + __cpuidex((int *) reg, 0x40000000, 0); + + if (reg[EAX] >= 0x40000010 && (CPUID_HYPERVISOR_VMWARE(reg) || CPUID_HYPERVISOR_KVM(reg))) + { + __cpuidex((int *) reg, 0x40000010, 0); + if (reg[EAX] > 0) + return reg[EAX]; + } +#endif /* HAVE__CPUIDEX */ + + return 0; +} + + #endif /* defined(USE_SSE2) || defined(__i386__) */ diff --git a/src/test/modules/test_misc/t/011_lock_stats.pl b/src/test/modules/test_misc/t/011_lock_stats.pl index 7662db160173a..45d7d26f70ccb 100644 --- a/src/test/modules/test_misc/t/011_lock_stats.pl +++ b/src/test/modules/test_misc/t/011_lock_stats.pl @@ -36,9 +36,9 @@ sub setup_sessions $s2 = $node->background_psql('postgres'); # Setup injection points for the waiting session - $s2->query_safe( - q[ - SELECT injection_points_set_local(); + $s2->query_until( + qr/attaching_injection_point/, q[ + \echo attaching_injection_point SELECT injection_points_attach('deadlock-timeout-fired', 'wait'); ]); } @@ -64,10 +64,11 @@ sub wait_and_detach my ($node, $point_name) = @_; $node->wait_for_event('client backend', $point_name); - $node->safe_psql('postgres', - "SELECT injection_points_detach('$point_name');"); - $node->safe_psql('postgres', - "SELECT injection_points_wakeup('$point_name');"); + $node->safe_psql( + 'postgres', qq[ +SELECT injection_points_detach('$point_name'); +SELECT injection_points_wakeup('$point_name'); +]); } # Node initialization diff --git a/src/test/regress/expected/misc_functions.out b/src/test/regress/expected/misc_functions.out index cf55cdf3688d9..c3261bff209fb 100644 --- a/src/test/regress/expected/misc_functions.out +++ b/src/test/regress/expected/misc_functions.out @@ -850,3 +850,14 @@ SELECT oldest_multixact IS NULL AS null_result FROM pg_get_multixact_stats(); RESET ROLE; DROP ROLE regress_multixact_funcs; +-- test instr_time nanosecond<->ticks conversion +CREATE FUNCTION test_instr_time() + RETURNS bool + AS :'regresslib' + LANGUAGE C; +SELECT test_instr_time(); + test_instr_time +----------------- + t +(1 row) 
+ diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out index c6ba2479413c8..37070c1a89639 100644 --- a/src/test/regress/expected/stats_ext.out +++ b/src/test/regress/expected/stats_ext.out @@ -99,7 +99,8 @@ CREATE STATISTICS tst (ndistinct) ON (y + z) FROM ext_stats_test; ERROR: cannot specify statistics kinds when building univariate statistics -- multivariate statistics without a less-than operator not supported CREATE STATISTICS tst (ndistinct) ON x, w from ext_stats_test; -ERROR: column "w" cannot be used in multivariate statistics because its type xid has no default btree operator class +ERROR: cannot create multivariate statistics on column "w" +DETAIL: The type xid has no default btree operator class. DROP TABLE ext_stats_test; -- Ensure stats are dropped sanely, and test IF NOT EXISTS while at it CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER); diff --git a/src/test/regress/pg_regress.c b/src/test/regress/pg_regress.c index 9a918156437b2..0c0620569829b 100644 --- a/src/test/regress/pg_regress.c +++ b/src/test/regress/pg_regress.c @@ -2181,6 +2181,8 @@ regression_main(int argc, char *argv[], progname = get_progname(argv[0]); set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_regress")); + pg_initialize_timing(); + get_restricted_token(); atexit(stop_postmaster); diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c index 68a01a1dde014..c2eaa96f08605 100644 --- a/src/test/regress/regress.c +++ b/src/test/regress/regress.c @@ -38,6 +38,7 @@ #include "optimizer/plancat.h" #include "parser/parse_coerce.h" #include "port/atomics.h" +#include "portability/instr_time.h" #include "postmaster/postmaster.h" /* for MAX_BACKENDS */ #include "storage/spin.h" #include "tcop/tcopprot.h" @@ -1384,3 +1385,38 @@ test_translation(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } + +/* Verify that pg_ticks_to_ns behaves correctly, including overflow */ +PG_FUNCTION_INFO_V1(test_instr_time); +Datum +test_instr_time(PG_FUNCTION_ARGS)
+{ + instr_time t; + int64 test_ns[] = {0, 1000, INT64CONST(1000000000000000)}; + int64 max_err; + + /* + * The ns-to-ticks-to-ns roundtrip may lose precision due to integer + * truncation in the fixed-point conversion. The maximum error depends on + * ticks_per_ns_scaled relative to the shift factor. + */ + max_err = (ticks_per_ns_scaled >> TICKS_TO_NS_SHIFT) + 1; + + for (int i = 0; i < lengthof(test_ns); i++) + { + int64 result; + + INSTR_TIME_SET_ZERO(t); + INSTR_TIME_ADD_NANOSEC(t, test_ns[i]); + result = INSTR_TIME_GET_NANOSEC(t); + + if (result < test_ns[i] - max_err || result > test_ns[i]) + elog(ERROR, + "INSTR_TIME_GET_NANOSEC(t) yielded " INT64_FORMAT + ", expected " INT64_FORMAT " (max_err " INT64_FORMAT + ") in file \"%s\" line %u", + result, test_ns[i], max_err, __FILE__, __LINE__); + } + + PG_RETURN_BOOL(true); +} diff --git a/src/test/regress/sql/misc_functions.sql b/src/test/regress/sql/misc_functions.sql index c8226652f2c94..946ee5726cdd7 100644 --- a/src/test/regress/sql/misc_functions.sql +++ b/src/test/regress/sql/misc_functions.sql @@ -349,3 +349,10 @@ SET ROLE regress_multixact_funcs; SELECT oldest_multixact IS NULL AS null_result FROM pg_get_multixact_stats(); RESET ROLE; DROP ROLE regress_multixact_funcs; + +-- test instr_time nanosecond<->ticks conversion +CREATE FUNCTION test_instr_time() + RETURNS bool + AS :'regresslib' + LANGUAGE C; +SELECT test_instr_time(); diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 9e6a39f560833..07ac380cf976a 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -3185,6 +3185,7 @@ TimeoutId TimeoutType Timestamp TimestampTz +TimingClockSourceType TmFromChar TmToChar ToastAttrInfo