-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathfuntrace.cpp
More file actions
1349 lines (1179 loc) · 46.9 KB
/
funtrace.cpp
File metadata and controls
1349 lines (1179 loc) · 46.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
//an easy way to make the compiler wrappers add the funcount runtime
//to the program instead of the funtrace runtime
#ifdef FUNTRACE_FUNCOUNT
#include "funcount.cpp"
#else
#include <x86intrin.h>
#include <cstdint>
#include <iostream>
#include <fstream>
#include <sstream>
#include <set>
#include <mutex>
#include <vector>
#include <dlfcn.h>
#include <pthread.h>
#include <algorithm>
#include <cstring>
#include <cassert>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <link.h>
#include "funtrace.h"
#include "funtrace_flags.h"
//buffer sizes - configurable with #defines and env vars
//(and for function call tracing, on a per thread basis)
//the ftrace events buffer size is set in the number of events, not in bytes.
//in a test 20K events took about 1MB but YMMV
#ifndef FUNTRACE_FTRACE_EVENTS_IN_BUF
#define FUNTRACE_FTRACE_EVENTS_IN_BUF 20000
#endif
//the buffer size is in bytes. the size of an event in the current implementation
//is 32 bytes. the size is defined as a log (the size must be a power of 2)
#ifndef FUNTRACE_LOG_BUF_SIZE
#define FUNTRACE_LOG_BUF_SIZE 20
#endif
//when threads exit, their traces are kept around for this number of milliseconds
//(or deleted immediately if it's set to 0.) this is useful because otherwise
//recently finished threads are invisible in trace snapshots, which is misleading
#ifndef FUNTRACE_GC_MAX_AGE_MS
#define FUNTRACE_GC_MAX_AGE_MS 300
#endif
#ifndef FUNTRACE_GC_PERIOD_MS
#define FUNTRACE_GC_PERIOD_MS FUNTRACE_GC_MAX_AGE_MS
#endif
//we could ask the user to compile this file without -finstrument-functions/-pg
//instead of peppering all of the code with NOINSTR. but this file wants
//to be compiled into the executable (rather than be its own shared object
//with its own build script - which would make TLS access slower), and we
//don't want the user to have to figure out how to compile all of their program
//with -finstrument-functions/-pg but not use these compiler flags with this one file
//which compiles together with the other files in that program - could be hard
//in some build systems.
//
//note that we do things like add explicit ctors/dtors to non-POD structs
//which would have these generated by the compiler - because this way we can
//put NOINSTR on them...
#define NOINSTR NOFUNTRACE
//if you modify this struct, update funtrace_pg.S as well
//a single logged event: the address of the traced function (with flag bits
//ORed into it - see trace_data::trace()) and the TSC timestamp of the event.
//trace_data::trace() static_asserts that this struct is exactly 16 bytes.
struct trace_entry
{
    void* func;     //code address | event flag bits
    uint64_t cycle; //__rdtsc() value at the time of the event
};
//identifies the thread a trace buffer belongs to; written into trace files
//as the THREADID chunk (see write_tracebufs())
struct thread_id
{
    uint64_t pid;
    uint64_t tid;
    char name[16]; //pthread name; 16 matches the size used with pthread_getname_np()
};
//reads an integer configuration value from the environment variable env,
//falling back on default_value when the variable is unset
static int NOINSTR env_int(const char* env, int default_value)
{
    const char* str = getenv(env);
    if(!str) {
        return default_value;
    }
    return atoi(str);
}
//the default per-thread trace buffer size (as a log2 of the size in bytes),
//read once from the environment / compile-time default and cached
static int NOINSTR default_log_buf_size()
{
    //0 disables tracing
    static int cached_value = env_int("FUNTRACE_LOG_BUF_SIZE", FUNTRACE_LOG_BUF_SIZE);
    return cached_value;
}
//per-thread tracing state; one instance lives in the thread_local g_thread_trace,
//plus heap-allocated copies for recently-finished threads awaiting garbage collection
struct trace_data
{
    trace_entry* pos; //pos points into buf which must be aligned to buf_size*2
    uint64_t wraparound_mask; //this is occasionally set to 0 to temporarily disable tracing;
    trace_entry* buf;
    uint64_t buf_size;
    pthread_t thread; //null for finished threads that need to be gc'd
    thread_id id;
    uint64_t time_of_death_ms; //for about-to-be garbage-collected objects

    //pausing works by zeroing wraparound_mask, which makes trace() bail out early;
    //resuming restores the mask that clears the wraparound carry bit
    inline void NOINSTR pause_tracing() { wraparound_mask = 0; }
    inline void NOINSTR resume_tracing() { wraparound_mask = ~buf_size; }

    //allocates the thread's trace buffer. log_buf_size==0 means "use the default
    //from FUNTRACE_LOG_BUF_SIZE / the env var"; a default of 0, or an allocation
    //failure, leaves tracing disabled for this thread (null buf, zero mask)
    void NOINSTR allocate(int log_buf_size=0)
    {
        if(log_buf_size == 0) {
            log_buf_size = default_log_buf_size();
        }
        //we align the allocation to twice the buffer size so that after we increment the pos pointer,
        //we can clear the bit FUNTRACE_LOG_BUF_SIZE without worrying that the increment
        //carried values to bits at higher positions
        if(log_buf_size) {
            //shift a 64-bit 1: "1<<log_buf_size" shifts a 32-bit int, which is
            //undefined behavior for log_buf_size >= 31 even though buf_size is 64-bit
            buf_size = uint64_t(1)<<log_buf_size;
            wraparound_mask = ~buf_size;
            trace_entry* entries = nullptr;
            int memret = posix_memalign((void**)&entries, buf_size*2, buf_size);
            buf = pos = entries;
            if(memret) { //allocation failed - disable tracing
                buf_size = wraparound_mask = 0;
            }
            else {
                //we don't want to spend time on clearing the entire buffer; we zero
                //the last entry's cycle to be able to check whether wraparound occured,
                //to avoid accessing uninitialized memory
                buf[buf_size/sizeof(trace_entry)-1].cycle = 0;
            }
        }
        else {
            buf_size = wraparound_mask = 0;
            buf = pos = nullptr;
        }
    }
    //releases the buffer and puts the object back into the "tracing disabled" state
    void NOINSTR free()
    {
        ::free(buf);
        buf_size = wraparound_mask = 0;
        buf = pos = nullptr;
    }
    inline void NOINSTR trace(void* ptr, uint64_t is_ret);
    //refreshes id.name from the live pthread; finished threads keep the name
    //captured at unregistration time
    void NOINSTR update_name()
    {
        if(thread) { //if !thread, this trace belongs to a finished thread
            pthread_getname_np(thread, id.name, sizeof id.name);
        }
    }
};
//the hot path: logs one {code address|flags, timestamp} pair into the calling
//thread's ring buffer. called upon every traced function entry/exit, so it is
//deliberately branch-light; when tracing is paused (wraparound_mask==0) it
//falls out after a handful of instructions.
//NOTE(review): the in-class declaration names the second parameter is_ret while
//the definition calls it flags - same meaning (flag bits ORed into the pointer).
inline void NOINSTR trace_data::trace(void* ptr, uint64_t flags)
{
    static_assert(sizeof(trace_entry) == 16, "funtrace_pg.S assumes 16 byte trace_entry structs");
    //ANDing pos and wraparound_mask and testing the result creates nicely-looking code on x86
    //where on one hand you run just 4 instructions when wraparound_mask==0 [trace disabled],
    //and on the otehr hand you only add one instruction (je) to support early exit when
    //the trace is disabled. not sure it is really always faster on x86 than testing wraparound_mask
    //though and on other architectures you might want to do this a bit differently. in any case,
    //for correctness it's OK to have entry+1 written to pos before ANDing with the mask,
    //and it's also OK to AND and then store;
    //what's NOT OK is loading wraparound_mask more than once - it might be zeroed by another
    //thread, so you shouldn't test it and then reload it for ANDing with pos.
    trace_entry* entry = pos;
    entry = (trace_entry*)(uint64_t(entry) & wraparound_mask);
    if(!entry) {
        return;
    }
    uint64_t cycle = __rdtsc();
    uint64_t func = ((uint64_t)ptr) | flags;
    //this generates prefetchw and doesn't impact povray's runtime (with any of +4, 8, 12, 16)
    //__builtin_prefetch(entry+4, 1, 0);
    //straightforward writing:
    entry->func = (void*)func;
    entry->cycle = cycle;
    //this generates movntq and makes povray slower relatively to straightforward writing
    //__m64* pm64 = (__m64*)entry;
    //_mm_stream_pi(pm64, _m_from_int64(func));
    //_mm_stream_pi(pm64+1, _m_from_int64(cycle));
    //this generates vmovntdq and also makes povray slower
    //__m128i xmm = _mm_set_epi64x(cycle, func);
    //_mm_stream_si128((__m128i*)entry, xmm);
    //printf("%p 0x%lx 0x%lx\n", (void*)entry, uint64_t(entry) & (buf_size*2-1), wraparound_mask);
    //the AND above already cleared the wraparound carry bit, so this wraps correctly
    pos = entry+1;
}
//the per-thread trace buffer; deliberately has no ctor/dtor so that access from
//the instrumentation hooks below stays cheap (registration is done by the
//pthread_create interposer and register_main_thread instead)
thread_local trace_data g_thread_trace;
//gcc-generated calls get a flag marking that gcc (unlike clang) reports
//a "call returning" event when a function exits via a thrown exception
#if (defined __GNUC__) && !(defined __clang__)
#define CALL_FLAGS (1ULL<<FUNTRACE_CALL_RETURNING_UPON_THROW_BIT)
#else
#define CALL_FLAGS 0
#endif
//the hooks called by -finstrument-functions-generated code on entry/exit
extern "C" void NOINSTR __cyg_profile_func_enter(void* func, void*) { g_thread_trace.trace(func, CALL_FLAGS); }
extern "C" void NOINSTR __cyg_profile_func_exit(void* func, void*) { g_thread_trace.trace(func, 1ULL<<FUNTRACE_RETURN_BIT); }
//funtrace_pg.S is based on assembly generated from these C functions, massaged
//to avoid clobbering the registers where arguments might have been passed to
//the caller of __fentry__ and __return__ (and in __return__'s case,
//also to avoid clobbering the caller's return value) - unfortunately didn't
//find a way to tell gcc not to do that given C code, other than that there's
//not really any need to use assembly rather than C here
//
//(the other changes to funtrace_pg.S in addition to register renaming are
//to put macros like FUNTRACE_RETURN_BIT instead of the constants generated by gcc)
#if 0
extern "C" void NOINSTR __fentry__() {
g_thread_trace.trace(__builtin_return_address(0),0);
}
extern "C" void NOINSTR __return__() {
g_thread_trace.trace(__builtin_return_address(0),1ULL<<FUNTRACE_RETURN_BIT);
}
#endif
//returns the process command line read from /proc/self/cmdline with the
//null separators replaced by spaces, or "UNKNOWN" when it can't be read
static std::string NOINSTR get_cmdline()
{
    std::ifstream cmdline_file("/proc/self/cmdline", std::ios::binary);
    if(!cmdline_file) {
        return "UNKNOWN";
    }
    std::vector<char> buffer((std::istreambuf_iterator<char>(cmdline_file)),
                             std::istreambuf_iterator<char>());
    if(buffer.empty()) {
        //the file can read as empty (eg for a zombie process); without this
        //check, buffer.end()-1 below would be undefined behavior
        return "UNKNOWN";
    }
    // Replace null characters with spaces. this will misprepresent
    // the argument "A B" as two arguments, "A" and "B"; and if there are
    // special characters that must be escaped in the shell, you will get them
    // without any escaping. But, should be better than nothing when wondering
    // where some trace came from
    for (char &c : buffer) {
        if (c == '\0') {
            c = ' ';
        }
    }
    return std::string(buffer.begin(), buffer.end()-1); //-1 for the last null
}
static uint64_t cpu_cycles_per_second();
//the current TSC reading converted to milliseconds (used for GC aging)
static uint64_t NOINSTR current_time_ms()
{
    uint64_t ticks_per_ms = cpu_cycles_per_second() / 1000;
    return funtrace_time() / ticks_per_ms;
}
//process-wide tracing state: the registry of per-thread trace buffers, the
//output file, and cached process metadata. accessed via trace_state() below.
struct trace_global_state
{
    //a set is easier to erase from but slower to iterate over and we want
    //to iterate quickly when pausing the tracing
    std::vector<trace_data*> thread_traces;
    std::ofstream trace_file;
    std::mutex mutex; //guards both thread_traces (from newly created
    //and destroyed threads adding/removing their trace buffers to the set)
    //and trace_file (from calls to funtrace_pause_and_write_current_snapshot()
    //which might come from multiple threads)
    uint64_t pid;
    std::string cmdline;
    uint64_t cpu_freq;
    char* exe_path = nullptr; //from realpath(); used for the main executable's procmaps entry
    bool destroyed = false; //to guard against threads starting/finishing
    //after this object is destroyed
    int gc_max_age_ms;
    bool exiting = false; //when we're exiting the program we're no longer
    //deferring the destruction of trace buffers to avoid memory leaks
    //(not that it matters but it's extra noise in leak reporting tools)
    bool tracing_paused = false;
    NOINSTR trace_global_state()
    {
        pid = getpid();
        cmdline = get_cmdline();
        cpu_freq = cpu_cycles_per_second();
        exe_path = realpath("/proc/self/exe", nullptr);
        gc_max_age_ms = env_int("FUNTRACE_GC_MAX_AGE_MS", FUNTRACE_GC_MAX_AGE_MS);
    }
    //lazily opens funtrace.raw for writing; caller must hold mutex
    std::ostream& NOINSTR file()
    {
        if(!trace_file.is_open()) {
            trace_file.open("funtrace.raw", std::ios::binary);
        }
        return trace_file;
    }
    //can't be called more than once before unregister_this_thread()
    void NOINSTR register_this_thread(int log_buf_size=0)
    {
        //allocate before taking the lock - the allocation doesn't need it
        g_thread_trace.allocate(log_buf_size);
        std::lock_guard<std::mutex> guard(mutex);
        if(tracing_paused) {
            g_thread_trace.pause_tracing(); //will be resumed by trace_state::resume_tracing()
        }
        g_thread_trace.id.pid = pid;
        g_thread_trace.id.tid = gettid();
        g_thread_trace.thread = pthread_self();
        thread_traces.push_back(&g_thread_trace);
    }
    //safe to call many times without calling register_this_thread() between the calls
    void NOINSTR unregister_this_thread(bool defer_free=false)
    {
        std::lock_guard<std::mutex> guard(mutex);
        for(int i=0; i<(int)thread_traces.size(); ++i) {
            if(thread_traces[i] == &g_thread_trace) {
                if(gc_max_age_ms == 0 || !defer_free || exiting) {
                    //no GC - free this trace (which points to a soon-to-be-deallocated buffer)
                    g_thread_trace.free();
                    //swap-and-pop removal; order of thread_traces doesn't matter
                    thread_traces[i] = thread_traces.back();
                    thread_traces.pop_back();
                }
                else {
                    //keep the trace - GC will delete it together with the buffer it points to later
                    thread_traces[i] = new trace_data(g_thread_trace);
                    thread_traces[i]->update_name();
                    thread_traces[i]->thread = 0; //marks the entry as belonging to a dead thread
                    thread_traces[i]->time_of_death_ms = current_time_ms();
                }
                break;
            }
        }
    }
    //frees the buffers of threads that finished more than gc_max_age_ms ago
    //(or all finished threads' buffers when the program is exiting)
    void NOINSTR collect_garbage(bool exiting_program)
    {
        if(gc_max_age_ms == 0) {
            return;
        }
        std::lock_guard<std::mutex> guard(mutex);
        if(exiting_program) {
            exiting = true;
        }
        uint64_t now = current_time_ms();
        for(int i=0; i<(int)thread_traces.size(); ++i) {
            if(thread_traces[i]->thread) {
                continue; //live thread
            }
            if(!exiting && thread_traces[i]->time_of_death_ms + gc_max_age_ms > now) {
                continue; //recently finished thread
            }
            //garbage-collect this trace
            thread_traces[i]->free();
            delete thread_traces[i];
            thread_traces[i] = thread_traces.back();
            thread_traces.pop_back();
            --i; //re-examine the entry we just swapped into slot i
        }
    }
    //callers must hold mutex; note that running threads may take a short while
    //to observe the zeroed masks (see find_earliest_event_after())
    void NOINSTR pause_tracing()
    {
        tracing_paused = true; //for to-be-created threads
        for(auto trace : thread_traces) { //for running threads
            trace->pause_tracing();
        }
    }
    //callers must hold mutex
    void NOINSTR resume_tracing()
    {
        tracing_paused = false;
        for(auto trace : thread_traces) {
            trace->resume_tracing();
        }
    }
};
//we keep this for funtrace_gdb.py which has trouble accessing trace_state()::g_trace_state
trace_global_state* g_p_trace_state = nullptr;
//the reason we heap-allocate and leak this object instead of making it a regular
//global variable is that threads might be terminating during the program destruction
//sequence, and we need unregister_this_thread() to work, and similarly, we need
//collect_garbage() to work and it can be called at any moment.
static trace_global_state& NOINSTR trace_state() {
    //thread-safe init per C++11 function-local static rules; never destroyed on purpose
    static trace_global_state& g_trace_state = *new trace_global_state;
    if(!g_p_trace_state) {
        g_p_trace_state = &g_trace_state;
    }
    return g_trace_state;
}
// the code below for getting the TSC frequency is straight from an LLM,
// and I'm not sure when it doesn't work. when we see at runtime
// that it doesn't we fall back on parsing dmesg, and if that fails,
// on usleeping and measuring the number of ticks it took.
//
// LLVM XRay uses /sys/devices/system/cpu/cpu0/tsc_freq_khz but it
// recommends against using it in production and it's not available
// by default; see https://blog.trailofbits.com/2019/10/03/tsc-frequency-for-all-better-profiling-and-benchmarking/
//
// a better method for getting the TSC frequency would be most welcome.
// Structure to hold CPUID leaf 15H return values
// (TSC/crystal clock ratio and nominal crystal frequency)
struct cpuid_15h_t {
    uint32_t eax; // denominator
    uint32_t ebx; // numerator
    uint32_t ecx; // nominal frequency in Hz
    uint32_t edx; // reserved
};
// Function to execute CPUID instruction
// leaf goes in EAX, subleaf in ECX; the four output registers are written
// through the pointer arguments
static inline void NOINSTR cpuid(uint32_t leaf, uint32_t subleaf,
                                 uint32_t *eax, uint32_t *ebx,
                                 uint32_t *ecx, uint32_t *edx) {
    __asm__ __volatile__ (
        "cpuid"
        : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
        : "a" (leaf), "c" (subleaf)
    );
}
// Function to get TSC frequency using CPUID leaf 15H.
// Returns 0 whenever the information isn't available, in which case
// cpu_cycles_per_second() falls back on other methods.
static uint64_t NOINSTR get_tsc_freq(void) {
    cpuid_15h_t info;
    uint32_t highest_leaf;
    // First check if CPUID leaf 15H is supported at all
    cpuid(0, 0, &highest_leaf, &info.ebx, &info.ecx, &info.edx);
    if (highest_leaf < 0x15) {
        return 0; // CPUID leaf 15H not supported
    }
    // Get values from leaf 15H
    cpuid(0x15, 0, &info.eax, &info.ebx, &info.ecx, &info.edx);
    // eax/ebx are the denominator/numerator of the TSC : crystal clock ratio;
    // zeroes mean the CPU doesn't report the ratio
    if (info.eax == 0 || info.ebx == 0) {
        return 0; // Invalid CPUID.15H values returned
    }
    if (info.ecx == 0) {
        // ECX==0 means the crystal frequency isn't enumerated. one could
        // assume the 24MHz/25MHz typical of Intel parts and compute
        // (crystal_hz * ebx) / eax, but our other fallbacks are likely better
        return 0;
    }
    // ECX provides the nominal crystal frequency in Hz directly
    return ((uint64_t)info.ecx * info.ebx) / info.eax;
}
//detects the TSC frequency: CPUID leaf 15H first, then parsing dmesg,
//then (as a last resort) measuring ticks across a 100ms sleep
static uint64_t NOINSTR cpu_cycles_per_second()
{
    uint64_t freq = 0;
    freq = get_tsc_freq();
    if(!freq) {
        FILE* f = popen("dmesg | grep -o '[^ ]* MHz TSC'", "r");
        //popen can fail (eg fork failure) - don't pass NULL to fscanf/pclose
        if(f) {
            float freq_mhz = 0;
            //check for ==1 rather than nonzero: fscanf returns EOF (-1, which
            //is truthy) when the pipe has no output at all
            if(fscanf(f, "%f", &freq_mhz) == 1) {
                freq = freq_mhz * 1000000;
            }
            pclose(f);
        }
    }
    if(!freq) {
        uint64_t start = __rdtsc();
        usleep(100*1000); //sleep for 100ms
        uint64_t finish = __rdtsc();
        freq = (finish - start)*10; //not too accurate but seriously we shouldn't ever need this code
    }
    return freq;
}
//public API: the TSC frequency cached at trace_global_state construction time
extern "C" uint64_t NOINSTR funtrace_ticks_per_second() { return trace_state().cpu_freq; }
const int MAGIC_LEN = 8;
//writes one trace-file chunk: an 8-character magic string, a little-endian
//64-bit byte count, and then the payload itself
static void NOINSTR write_chunk(std::ostream& file, const char* magic, const void* data, uint64_t bytes)
{
    assert(strlen(magic) == MAGIC_LEN);
    file.write(magic, MAGIC_LEN);
    file.write(reinterpret_cast<const char*>(&bytes), sizeof bytes);
    file.write(static_cast<const char*>(data), bytes);
}
//writes the PROCMAPS chunk. the string is moved out of the stringstream and
//then moved back in (C++20 rvalue str() overload) so we get contiguous bytes
//for write_chunk() without copying the whole thing
static void NOINSTR write_procmaps(std::ostream& file, std::stringstream& procmaps)
{
    std::string s = std::move(procmaps).str();
    write_chunk(file, "PROCMAPS", &s[0], s.size());
    procmaps.str(std::move(s)); //restore the caller's stream contents
}
//a (possibly copied) view of one thread's trace buffer as written to a snapshot
struct event_buffer
{
    trace_entry* buf;  //either the live buffer or a heap copy, depending on the snapshot path
    uint64_t bytes;    //size of buf in bytes (a multiple of sizeof(trace_entry))
    thread_id id;
};
//writes the header chunks common to every snapshot: the TSC frequency
//and the process command line
static void NOINSTR write_funtrace(std::ostream& file)
{
    trace_global_state& state = trace_state();
    write_chunk(file, "FUNTRACE", &state.cpu_freq, sizeof state.cpu_freq);
    write_chunk(file, "CMD LINE", state.cmdline.c_str(), state.cmdline.size());
}
//writes the empty ENDTRACE chunk terminating one snapshot in the file
static void NOINSTR write_endtrace(std::ostream& file)
{
    write_chunk(file, "ENDTRACE", "", 0);
}
//writes a THREADID + TRACEBUF chunk pair for every thread in the snapshot
static void NOINSTR write_tracebufs(std::ostream& file, const std::vector<event_buffer>& thread_traces)
{
    //iterate by const reference - iterating by value copied the ~48-byte
    //event_buffer (including the name array) on every iteration
    for(const auto& trace : thread_traces) {
        write_chunk(file, "THREADID", &trace.id, sizeof trace.id);
        write_chunk(file, "TRACEBUF", trace.buf, trace.bytes);
    }
}
//the default timestamp threshold of 1 cuts off uninitialized events with timestamp=0
static void ftrace_events_snapshot(std::vector<std::string>& snapshot, uint64_t earliest_timestamp=1);
//writes the FTRACETX chunk: all ftrace event lines, newline-terminated.
//skipped entirely when there are no events.
static void NOINSTR write_ftrace(std::ostream& file, const std::vector<std::string>& events)
{
    if(events.empty()) {
        return;
    }
    //the chunk length is the total size of the newline-terminated lines
    uint64_t total_bytes = 0;
    for(const auto& event : events) {
        total_bytes += event.size() + 1; //+1 for the newline
    }
    file.write("FTRACETX", MAGIC_LEN);
    file.write((char*)&total_bytes, sizeof total_bytes);
    for(const auto& event : events) {
        file << event << '\n';
    }
}
//finding the executable segments using dl_iterate_phdr() is faster than reading /proc/self/maps
//and produces less segments since we ignore the non-executable ones
//
//data is the std::stringstream passed by get_procmaps(), which has already
//imbued std::hex on it - the numbers below print in hex like /proc/self/maps
static int NOINSTR phdr_callback (struct dl_phdr_info *info, size_t size, void *data)
{
    std::stringstream& s = *(std::stringstream*)data;
    for(int i=0; i<info->dlpi_phnum; ++i ) {
        const auto& phdr = info->dlpi_phdr[i];
        //we only care about loadable executable segments (the likes of .text)
        if(phdr.p_type == PT_LOAD && (phdr.p_flags & PF_X)) {
            //we print in "roughly" the format of /proc/self/maps, with arbitrary values for the fields we don't really care about
            uint64_t start_addr = info->dlpi_addr + phdr.p_vaddr;
            uint64_t end_addr = start_addr + phdr.p_memsz;
            //the main executable reports an empty name - substitute its real path
            const char* name = info->dlpi_name[0] ? info->dlpi_name : trace_state().exe_path;
            s << start_addr << '-' << end_addr << " r-xp " << phdr.p_offset << " 0:0 0 " << name << '\n';
        }
    }
    return 0; //0 tells dl_iterate_phdr to keep iterating
}
//fills procmaps with a /proc/self/maps-like listing of the executable segments
//(see phdr_callback above for the exact format)
static void NOINSTR get_procmaps(std::stringstream& procmaps)
{
    procmaps << std::hex; //phdr_callback relies on hex formatting
    dl_iterate_phdr(phdr_callback, &procmaps);
}
//public API: pauses tracing, appends a snapshot of all live trace buffers to
//funtrace.raw, resumes tracing. writes straight from the live buffers rather
//than copying them, trading a longer pause for zero extra memory.
extern "C" void NOINSTR funtrace_pause_and_write_current_snapshot()
{
    //the mutex guards both thread_traces and the output file
    std::lock_guard<std::mutex> guard(trace_state().mutex);
    trace_state().pause_tracing();
    std::ostream& file = trace_state().file();
    std::stringstream procmaps;
    get_procmaps(procmaps);
    write_procmaps(file, procmaps);
    //we don't allocate a snapshot - we save the memory for this by writing
    //straight from the trace buffers (at the expense of pausing tracing
    //for more time)
    //
    //(we didn't mind briefly allocating procmaps because it's very little data)
    std::vector<event_buffer> traces;
    for(auto trace : trace_state().thread_traces) {
        trace->update_name();
        traces.push_back(event_buffer{trace->buf, trace->buf_size, trace->id});
    }
    write_funtrace(file);
    write_tracebufs(file, traces);
    //tracing can resume before the ftrace events are written - those come
    //from a separate buffer
    trace_state().resume_tracing();
    std::vector<std::string> ftrace_snapshot;
    ftrace_events_snapshot(ftrace_snapshot);
    write_ftrace(file, ftrace_snapshot);
    write_endtrace(file);
    file.flush();
}
//an in-memory snapshot returned by funtrace_pause_and_get_snapshot*() and
//consumed by funtrace_write_snapshot()/funtrace_free_snapshot()
struct funtrace_snapshot
{
    std::vector<event_buffer> thread_traces; //buf members are heap copies owned by the snapshot
    std::vector<std::string> ftrace_events;
    std::stringstream procmaps;
    //explicit ctor/dtor exist only so they can carry NOINSTR (see the comment
    //near the NOINSTR definition)
    NOINSTR funtrace_snapshot() {}
    NOINSTR ~funtrace_snapshot() {}
};
//public API: pauses tracing and returns a heap-allocated snapshot with full
//copies of every live thread's buffer; free with funtrace_free_snapshot()
funtrace_snapshot* NOINSTR funtrace_pause_and_get_snapshot()
{
    std::lock_guard<std::mutex> guard(trace_state().mutex);
    trace_state().pause_tracing();
    funtrace_snapshot* snapshot = new funtrace_snapshot;
    for(auto trace : trace_state().thread_traces) {
        //raw char[] copy of the whole ring buffer; freed by funtrace_free_snapshot()
        trace_entry* copy = (trace_entry*)new char[trace->buf_size];
        memcpy(copy, trace->buf, trace->buf_size);
        trace->update_name();
        snapshot->thread_traces.push_back(event_buffer{copy, trace->buf_size, trace->id});
    }
    //resume as early as possible - the ftrace/procmaps parts don't touch the buffers
    trace_state().resume_tracing();
    ftrace_events_snapshot(snapshot->ftrace_events);
    get_procmaps(snapshot->procmaps);
    return snapshot;
}
//public API: frees the buffer copies held by a snapshot and then the snapshot
//itself. safe to call with a null pointer.
extern "C" void NOINSTR funtrace_free_snapshot(funtrace_snapshot* snapshot)
{
    if(snapshot == nullptr) {
        return;
    }
    //buffers were allocated as char arrays in the snapshotting functions
    for(const auto& trace : snapshot->thread_traces) {
        delete [] (char*)trace.buf;
    }
    delete snapshot;
}
//public API: writes a previously captured snapshot to the given file
extern "C" void NOINSTR funtrace_write_snapshot(const char* filename, funtrace_snapshot* snapshot)
{
    //open in binary mode, consistently with how trace_global_state::file()
    //opens funtrace.raw - the chunks contain raw binary data
    std::ofstream file(filename, std::ios::binary);
    write_procmaps(file, snapshot->procmaps);
    write_funtrace(file);
    write_tracebufs(file, snapshot->thread_traces);
    write_ftrace(file, snapshot->ftrace_events);
    write_endtrace(file);
    file.flush();
}
//public API: the current timestamp in trace time units (TSC ticks)
extern "C" uint64_t NOINSTR funtrace_time()
{
    return __rdtsc();
}
//binary-searches [begin,end) - a time-ordered run of events from a ring buffer -
//for the earliest event with cycle >= time_threshold, treating events newer
//than pause_time as wrapped-around noise ordered before everything else (see
//the comment inside.) returns nullptr when no such event exists in the range.
static trace_entry* NOINSTR find_earliest_event_after(trace_entry* begin, trace_entry* end, uint64_t time_threshold, uint64_t pause_time)
{
    trace_entry e;
    e.cycle = time_threshold;
    trace_entry* p = std::lower_bound(begin, end, e, [=](const trace_entry& e1, const trace_entry& e2) {
        //treat events recorded later than pause_time as ordered _before_ the rest.
        //that's because we're passing this function ranges of events logged in the order of time
        //(so sorted by time), but they can potentially be overwritten at the beginning by events
        //recorded after we paused tracing (because we don't have a mechanism to wait for the actual pause
        //to take effect and threads can take time to notice the write to their pause flag.)
        //
        //so if binary search finds an event ordered after the pause time, it should look to its right,
        //and the array can be thought of as "events after pause time from oldest to newest followed
        //by "events before pause time from oldest to newest."
        //
        //note that we could avoid all this by a simple linear search from the pos pointer backwards,
        //but it's slower than a binary search finding the exact earliest event followed by memcpy
        //into an array of the correctly allocated size
        //
        //also note that strictly speaking, since the buffers are written by the traced threads
        //and read by a snapshotting thread, we could theoretically see stale data in the buffer
        //s.t. the entries would not be sorted. I think this will result in very few events being
        //lost on very rare occasions but maybe there's a pathology where find_earliest_event_after()
        //misses most of the relevant entries because of this with reasonably high likelihood.
        if(e1.cycle > pause_time && e2.cycle <= pause_time) {
            return true; //events after pause time are ordered before events before pause time
        }
        if(e2.cycle > pause_time && e1.cycle <= pause_time) {
            return false;
        }
        return e1.cycle < e2.cycle;
    });
    return p==end ? nullptr : p;
}
//public API: like funtrace_pause_and_get_snapshot() but copies only the events
//with timestamp >= time, allocating exactly as much memory as those events need.
//each ring buffer is two time-sorted runs: [pos,end) holds the older events
//(from before the last wraparound) and [buf,pos) the newer ones.
extern "C" struct funtrace_snapshot* NOINSTR funtrace_pause_and_get_snapshot_starting_at_time(uint64_t time)
{
    std::lock_guard<std::mutex> guard(trace_state().mutex);
    trace_state().pause_tracing();
    funtrace_snapshot* snapshot = new funtrace_snapshot;
    //events recorded after this point are stragglers from threads that haven't
    //noticed the pause yet - find_earliest_event_after() orders them out
    uint64_t pause_time = funtrace_time();
    for(auto trace : trace_state().thread_traces) {
        trace_entry* pos = trace->pos;
        trace_entry* buf = trace->buf;
        trace_entry* end = buf + trace->buf_size/sizeof(trace_entry);
        //if we never had a wraparound, don't access uninitialized memory in an attempt to search to the right of pos
        trace_entry* earliest_right = end[-1].cycle ? find_earliest_event_after(pos, end, time, pause_time) : nullptr;
        trace_entry* earliest_left = find_earliest_event_after(buf, pos, time, pause_time);
        uint64_t entries = (earliest_left ? pos-earliest_left : 0) + (earliest_right ? end-earliest_right : 0);
        trace_entry* copy = (trace_entry*)new char[entries*sizeof(trace_entry)];
        trace_entry* copy_to = copy;
        //we don't really care if the events in the output are sorted but the ones to the right of pos, if we have any,
        //would be the earlier ones
        if(earliest_right) {
            copy_to = std::copy(earliest_right, end, copy_to);
        }
        if(earliest_left) {
            copy_to = std::copy(earliest_left, pos, copy_to);
        }
        assert(uint64_t(copy_to - copy) == entries);
        trace->update_name();
        snapshot->thread_traces.push_back(event_buffer{copy, entries*sizeof(trace_entry), trace->id});
    }
    trace_state().resume_tracing();
    ftrace_events_snapshot(snapshot->ftrace_events, time);
    get_procmaps(snapshot->procmaps);
    return snapshot;
}
//public API: convenience wrapper - snapshot the last max_event_age ticks
extern "C" struct funtrace_snapshot* NOINSTR funtrace_pause_and_get_snapshot_up_to_age(uint64_t max_event_age)
{
    return funtrace_pause_and_get_snapshot_starting_at_time(funtrace_time() - max_event_age);
}
//public API: stops tracing the calling thread and frees its trace buffer
//(unregister_this_thread() is called without deferring the free)
extern "C" void NOINSTR funtrace_ignore_this_thread()
{
    trace_state().unregister_this_thread();
}
//public API: reallocates the calling thread's trace buffer with the given
//log2 size; a size below the minimum leaves the thread untraced
extern "C" void NOINSTR funtrace_set_thread_log_buf_size(int log_buf_size)
{
    funtrace_ignore_this_thread();
    if(log_buf_size >= 5) { //5 is log(sizeof(trace_entry)*2) - the smallest alignable buffer
        trace_state().register_this_thread(log_buf_size);
    }
}
//public API: pauses tracing process-wide (for all current and future threads)
extern "C" void NOINSTR funtrace_disable_tracing()
{
    std::lock_guard<std::mutex> guard(trace_state().mutex);
    trace_state().pause_tracing();
}
//public API: resumes tracing process-wide
extern "C" void NOINSTR funtrace_enable_tracing()
{
    std::lock_guard<std::mutex> guard(trace_state().mutex);
    trace_state().resume_tracing();
}
// we interpose pthread_create in order to implement the thing that having
// a ctor & dtor for the trace_data struct would do, but more efficiently.
//
// we need a thread's thread_local trace_data to be added to trace_state().thread_traces
// set when a new thread is created, and we need it to be removed from this set
// when it is destroyed. a ctor & dtor would do it but the ctor slows down the trace()
// function which would need to check every time if the thread_local object was
// already constructed or not, and call the ctor if it isn't.
//
// interposing pthread_create lets us avoid this check in the trace() function.
//
// a more portable and/or succinct yet still efficient approach would be great!
//signature of the real pthread_create, looked up with dlvsym below
typedef int (*original_pthread_create_type)(pthread_t *, const pthread_attr_t *, void *(*)(void *), void *);
//the user's entry point and argument, forwarded through our wrapper entry point
struct pthread_args
{
    void* (*func)(void*);
    void* arg;
};
//the entry point our pthread_create interposer substitutes for the user's:
//registers the thread's trace buffer, runs the user function, unregisters.
//NOTE(review): if the user function exits via an exception or pthread_exit(),
//the delete and the unregistration below are skipped - confirm acceptable.
void* NOINSTR pthread_entry_point(void* arg)
{
    trace_state().register_this_thread();
    pthread_args* args = (pthread_args*)arg;
    void* ret = args->func(args->arg);
    delete args; //allocated by our pthread_create wrapper
    trace_state().unregister_this_thread(true); //defer freeing if configured to garbage-collect
    return ret;
}
//interposes pthread_create so every new thread runs through pthread_entry_point
//above (see the long comment before pthread_args for why interposition is used
//instead of a thread_local ctor/dtor)
int NOINSTR pthread_create(pthread_t *thread, const pthread_attr_t *attr,
                           void *(*start_routine)(void *), void *arg) {
    // Find the original pthread_create using dlvsym; using dlsym might give us
    // an older version without support for the attr argument... spoken from experience!
    //
    // NOTE(review): "GLIBC_2.2.5" is the x86-64 glibc baseline version; other
    // architectures use different baseline version strings - confirm when porting.
    // the unsynchronized static initialization below looks benign (concurrent
    // callers would compute the same pointer) but is a data race formally.
    static original_pthread_create_type original_pthread_create = NULL;
    if (!original_pthread_create) {
        original_pthread_create = (original_pthread_create_type)dlvsym(RTLD_NEXT, "pthread_create", "GLIBC_2.2.5");
        if (!original_pthread_create) {
            fprintf(stderr, "Error locating original pthread_create: %s\n", dlerror());
            exit(EXIT_FAILURE);
        }
    }
    //deleted by pthread_entry_point after the user function returns
    pthread_args* args = new pthread_args;
    args->func = start_routine;
    args->arg = arg;
    // Call the original pthread_create
    return original_pthread_create(thread, attr, pthread_entry_point, args);
}
//...and we need to register the main thread's trace separately, this time using a ctor
//(it's not global ctors that are the problem, it's the thread_locals' ctors); we don't
//need a dtor - the main thread never "goes out of scope" until the program terminates
//(the dtor below still unregisters on orderly static destruction)
struct register_main_thread
{
    NOINSTR register_main_thread() { trace_state().register_this_thread(); }
    NOINSTR ~register_main_thread() { trace_state().unregister_this_thread(); }
}
g_funtrace_register_main_thread;
//interposing __cxa_begin_catch to handle exceptions (eg if f calls g which throws a exception
//that f catches and calls h, we want this to be understood as g returning and f calling h,
//rather than g calling h; which our interposing & tracing doesn't completely ensure but does make
//work in many cases)
extern "C" void* NOFUNTRACE __cxa_begin_catch(void *thrown_exception) throw() {
    //log a "catch point" event tagged with the catching function's address,
    //then an entry/return pair around the real __cxa_begin_catch
    void* caller = __builtin_return_address(0);
    g_thread_trace.trace(caller, FUNTRACE_CATCH_MASK);
    g_thread_trace.trace((void*)&__cxa_begin_catch, 0);
    //lazily resolve the real __cxa_begin_catch from the C++ runtime
    static void *(*real_cxa_begin_catch)(void *) = nullptr;
    if (!real_cxa_begin_catch) {
        real_cxa_begin_catch = (void *(*)(void *))dlsym(RTLD_NEXT, "__cxa_begin_catch");
        if (!real_cxa_begin_catch) {
            fprintf(stderr, "Error locating original __cxa_begin_catch: %s\n", dlerror());
            return nullptr;
        }
    }
    void* ret = real_cxa_begin_catch(thrown_exception);
    g_thread_trace.trace((void*)&__cxa_begin_catch, 1ULL<<FUNTRACE_RETURN_BIT);
    return ret;
}
//we interpose __cxa_end_catch just in order to instrument it - so as to make it visible
//in the trace that an exception was caught. we don't instrument it "normally" however
//since in some cases funtrace.cpp might be compiled without instrumentation flags -
//so we actually make sure it's __NOT__ instrumented but call trace() directly
extern "C" void NOFUNTRACE __cxa_end_catch(void) throw() {
    static void (*real_cxa_end_catch)(void) = nullptr;
    //log the "call" before resolving/calling the real function...
    g_thread_trace.trace((void*)&__cxa_end_catch, 0);
    if (!real_cxa_end_catch) {
        real_cxa_end_catch = (void (*)(void))dlsym(RTLD_NEXT, "__cxa_end_catch");
        if (!real_cxa_end_catch) {
            fprintf(stderr, "Error locating original __cxa_end_catch: %s\n", dlerror());
            return;
        }
    }
    real_cxa_end_catch();
    //...and the "return" after it, bracketing the real work in the trace
    g_thread_trace.trace((void*)&__cxa_end_catch, 1ULL<<FUNTRACE_RETURN_BIT);
}
//interposes __cxa_throw to make throw points visible in the trace.
//NOTE(review): clang also defines __GNUC__, so the first signature is the one
//compiled under both compilers and the #elif branch is effectively dead -
//confirm the gcc-style signature is what's intended for clang builds.
extern "C" void NOFUNTRACE
#ifdef __GNUC__
__cxa_throw(void* thrown_object, void* type_info, void (*dest)(void *))
#elif defined(__clang__)
__cxa_throw(void* thrown_object, std::type_info* type_info, void (_GLIBCXX_CDTOR_CALLABI *dest)(void*))
#endif
{
    static void (*real_cxa_throw)(void*, void*, void (*)(void*)) = nullptr;
    g_thread_trace.trace((void*)&__cxa_throw, 0);
    if(!real_cxa_throw) {
        real_cxa_throw = (void (*)(void*, void*, void (*)(void*)))dlsym(RTLD_NEXT, "__cxa_throw");
        if(!real_cxa_throw) {
            fprintf(stderr, "Error locating original __cxa_throw: %s\n", dlerror());
            return;
        }
    }
    //__cxa_throw doesn't return so we record it as a "point event", without logging
    //the actual time it takes
    g_thread_trace.trace((void*)&__cxa_throw, 1ULL<<FUNTRACE_RETURN_BIT);
    real_cxa_throw(thrown_object, type_info, dest);
}
#include <thread>
#include <atomic>
#include <csignal>
#include <condition_variable>
#include <chrono>
#include <dirent.h>
//we register a signal handler for SIGTRAP, and have a thread waiting for the signal
//to arrive and dumping trace data when it does. this is good for programs you don't
//want to modify beyond rebuilding (otherwise it's not so great since you can't time
//the event very well, but it might still be enough to get a feeling of what the program
//is doing)
#ifndef FUNTRACE_NO_SIGTRAP
//dumps a snapshot whenever SIGTRAP arrives: the handler itself only unlocks a
//mutex that a dedicated thread blocks on; the thread does the actual writing.
//NOTE(review): std::mutex is used here as a binary semaphore - it is locked by
//the constructor/thread and unlocked from the signal handler (a different
//thread), which is formally undefined for std::mutex, and unlock() is not on
//the async-signal-safe list; a POSIX semaphore (sem_post IS signal-safe) would
//be the by-the-book mechanism - confirm this works on the targeted platforms.
struct sigtrap_handler
{
    std::mutex mutex;   //acts as the "snapshot requested" semaphore
    std::thread thread; //the thread writing snapshots
    std::atomic<bool> quit;
    static void NOINSTR signal_handler(int);
    void NOINSTR thread_func();
    NOINSTR sigtrap_handler()
    {
        quit = false;
        //take the mutex before spawning the thread so its first lock() blocks
        mutex.lock();
        thread = std::thread([this] {
            thread_func();
        });
        signal(SIGTRAP, signal_handler);
    }
    NOINSTR ~sigtrap_handler() {
        //wake the thread so it can observe quit and exit
        quit = true;
        mutex.unlock();
        thread.join();
    }
}
g_funtrace_sigtrap_handler;
//the actual signal handler: just release the snapshot thread
void NOINSTR sigtrap_handler::signal_handler(int)
{
    g_funtrace_sigtrap_handler.mutex.unlock();
}
//the snapshot thread: blocks on the mutex until a signal (or the dtor) releases it
void NOINSTR sigtrap_handler::thread_func()
{
    //we don't want to trace the SIGTRAP-handling thread
    funtrace_ignore_this_thread();
    pthread_setname_np(pthread_self(), "funtraceSIGTRAP");
    while(true) {
        mutex.lock();
        if(quit) {
            break;
        }
        funtrace_pause_and_write_current_snapshot();
    }
}
#endif //FUNTRACE_NO_SIGTRAP
//if FUNTRACE_GC_MAX_AGE_MS is defined to zero, we don't spawn a GC thread at all.
//otherwise we allow to reconfigure the GC waiting period using an env var
#if FUNTRACE_GC_MAX_AGE_MS > 0
struct funtrace_gc
{
std::mutex mutex;
std::condition_variable cond_var;
std::thread thread;
int gc_period_ms;
std::atomic<bool> quit;
std::atomic<bool> done;
NOINSTR funtrace_gc()
{
gc_period_ms = env_int("FUNTRACE_GC_PERIOD_MS", FUNTRACE_GC_PERIOD_MS);
if(gc_period_ms == 0) {
return;
}
quit = false;
done = false;
thread = std::thread([this] {
thread_func();
});
}
NOINSTR ~funtrace_gc()
{
if(gc_period_ms == 0) {
return;
}
quit = true;