From 9506244561de5c534b6db0378eaa8fcdf1955099 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 21 Feb 2026 15:31:47 +0000 Subject: [PATCH 1/3] Initial plan From b8cf00585afb52ebd5159d91a3d6ef33c72d28c8 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 21 Feb 2026 15:43:02 +0000 Subject: [PATCH 2/3] Implement --prev mode to attribute event counts to the previous instruction Add -P/--prev CLI option that enables a mode where each perf event sample's count is attributed to the previously sampled instruction instead of the current one. This is useful for compensating for instruction pointer skid in perf sampling. Changes: - Add attribute_to_prev bit field to pw_opts_t - Add previous instruction tracking fields to process_t - Add -P/--prev command-line option with help text - Restructure handle_sample() to extract decode results into local variables and support swapping with stored previous instruction data - Initialize has_prev=0 in process creation functions Co-authored-by: rdementi <25432609+rdementi@users.noreply.github.com> --- src/process_info.h | 2 + src/processwatch.c | 8 ++- src/processwatch.h | 13 ++++ src/results.h | 145 +++++++++++++++++++++++++++++++++------------ 4 files changed, 128 insertions(+), 40 deletions(-) diff --git a/src/process_info.h b/src/process_info.h index 4a041c8..490019e 100644 --- a/src/process_info.h +++ b/src/process_info.h @@ -170,6 +170,7 @@ static void update_one_process_info(uint32_t pid, char *name, uint32_t hash, int process->name = strdup(name); process->name_hash = hash; process->index = results->pid_ctr++; + process->has_prev = 0; results->process_info.arr[pid][num_procs] = process; results->process_info.arr[pid][num_procs + 1] = NULL; } @@ -191,6 +192,7 @@ static void add_process_info(uint32_t pid, char *name, uint32_t hash) { process->name = strdup(name); process->name_hash = hash; process->index = 
results->pid_ctr++; + process->has_prev = 0; /* Now add that process_t to the process_arr_t struct */ results->process_info.arr[pid] = (process_t **) malloc(sizeof(process_t *) * 2); diff --git a/src/processwatch.c b/src/processwatch.c index 3f8eff7..e0b9aff 100644 --- a/src/processwatch.c +++ b/src/processwatch.c @@ -46,6 +46,7 @@ static struct option long_options[] = { {"btf", required_argument, 0, 'b'}, {"all", no_argument, 0, 'a'}, {"cycles", no_argument, 0, 'C'}, + {"prev", no_argument, 0, 'P'}, {0, 0, 0, 0} }; @@ -239,6 +240,7 @@ int read_opts(int argc, char **argv) { pw_opts.show_mnemonics = 0; pw_opts.show_extensions = 0; pw_opts.use_cycles = 0; + pw_opts.attribute_to_prev = 0; pw_opts.csv = 0; pw_opts.btf_custom_path = NULL; pw_opts.debug = 0; @@ -253,7 +255,7 @@ int read_opts(int argc, char **argv) { while(1) { option_index = 0; - c = getopt_long(argc, argv, "hvdi:cp:ms:f:ln:b:eaC", + c = getopt_long(argc, argv, "hvdi:cp:ms:f:ln:b:eaCP", long_options, &option_index); if(c == -1) { break; @@ -283,6 +285,7 @@ int read_opts(int argc, char **argv) { #endif printf(" -s Sampling period. Defaults to 100000 (1 in 100000 instructions or cycles).\n"); printf(" -C Profiles cycles instead of instructions.\n"); + printf(" -P Attributes event counts to the previous instruction.\n"); #ifdef __aarch64__ printf(" -f Can be used multiple times. Defines filters for columns. 
Defaults to 'FPARMv8', 'NEON', 'SVE' and 'SVE2'.\n"); #elif __x86_64__ @@ -339,6 +342,9 @@ int read_opts(int argc, char **argv) { case 'C': pw_opts.use_cycles = 1; break; + case 'P': + pw_opts.attribute_to_prev = 1; + break; case 'l': pw_opts.list = 1; break; diff --git a/src/processwatch.h b/src/processwatch.h index a3f3beb..ce8f376 100644 --- a/src/processwatch.h +++ b/src/processwatch.h @@ -52,6 +52,7 @@ struct pw_opts_t { unsigned char show_mnemonics : 1; unsigned char show_extensions : 1; unsigned char use_cycles : 1; + unsigned char attribute_to_prev : 1; unsigned int sample_period; char *btf_custom_path; @@ -101,6 +102,18 @@ typedef struct { int index; char *name; uint32_t name_hash; + /* Previous instruction tracking (for --prev mode) */ + int has_prev; + int prev_success; + int prev_mnemonic; + int prev_category; +#ifdef __x86_64__ + int prev_extension; + int prev_is_locked; +#elif __aarch64__ + uint8_t prev_groups[8]; + uint8_t prev_groups_count; +#endif } process_t; #define MAX_PROCESSES 4194304 diff --git a/src/results.h b/src/results.h index dda5b3b..bf3bae7 100644 --- a/src/results.h +++ b/src/results.h @@ -17,18 +17,29 @@ static int handle_sample(void *ctx, void *data, size_t data_sz) { struct insn_info *insn_info; int category, mnemonic, success; int interval_index; + int skip_sample; + process_t *proc; #ifdef __x86_64__ int extension; + int is_locked; +#elif __aarch64__ + uint8_t groups[8]; + uint8_t groups_count; + int i; #endif uint32_t hash; insn_info = data; success = 0; + skip_sample = 0; category = -1; mnemonic = -1; #ifdef __x86_64__ extension = -1; + is_locked = 0; +#elif __aarch64__ + groups_count = 0; #endif #ifdef __x86_64__ @@ -44,6 +55,20 @@ static int handle_sample(void *ctx, void *data, size_t data_sz) { #ifdef __x86_64__ extension = results->decoded_insn.meta.isa_ext; #endif + + /* Detect lock-prefixed instructions with memory destination + or xchg with memory destination */ + if (results->decoded_insn.attributes & 
ZYDIS_ATTRIB_HAS_LOCK) { + is_locked = 1; + } else if (results->decoded_insn.mnemonic == ZYDIS_MNEMONIC_XCHG) { + int k; + for (k = 0; k < results->decoded_insn.operand_count_visible; k++) { + if (results->decoded_operands[k].type == ZYDIS_OPERAND_TYPE_MEMORY) { + is_locked = 1; + break; + } + } + } } #elif __aarch64__ int count; @@ -52,6 +77,12 @@ static int handle_sample(void *ctx, void *data, size_t data_sz) { if(count && insn[0].detail) { success = 1; mnemonic = insn[0].id; + groups_count = insn[0].detail->groups_count; + if(groups_count > 8) groups_count = 8; + for(i = 0; i < groups_count; i++) { + groups[i] = insn[0].detail->groups[i]; + } + cs_free(insn, count); } #endif @@ -66,52 +97,88 @@ static int handle_sample(void *ctx, void *data, size_t data_sz) { /* Store this result in the per-process array */ interval_index = get_interval_proc_arr_index(insn_info->pid); - if(success) { - results->interval->insn_count[mnemonic]++; - results->interval->proc_insn_count[mnemonic][interval_index]++; - + /* In --prev mode, attribute the event count to the previous instruction */ + if(pw_opts.attribute_to_prev) { + proc = get_process_info(insn_info->pid, hash); + if(proc->has_prev) { + /* Swap current decode results with the stored previous ones */ + int tmp; + tmp = success; success = proc->prev_success; proc->prev_success = tmp; + tmp = mnemonic; mnemonic = proc->prev_mnemonic; proc->prev_mnemonic = tmp; #ifdef __x86_64__ - results->interval->cat_count[results->decoded_insn.meta.category]++; - results->interval->proc_cat_count[category][interval_index]++; - results->interval->ext_count[results->decoded_insn.meta.isa_ext]++; - results->interval->proc_ext_count[extension][interval_index]++; - - /* Detect lock-prefixed instructions with memory destination - or xchg with memory destination */ - if (results->decoded_insn.attributes & ZYDIS_ATTRIB_HAS_LOCK) { - results->interval->cat_count[PW_CATEGORY_LOCKED]++; - 
results->interval->proc_cat_count[PW_CATEGORY_LOCKED][interval_index]++; - } else if (results->decoded_insn.mnemonic == ZYDIS_MNEMONIC_XCHG) { - int k; - for (k = 0; k < results->decoded_insn.operand_count_visible; k++) { - if (results->decoded_operands[k].type == ZYDIS_OPERAND_TYPE_MEMORY) { - results->interval->cat_count[PW_CATEGORY_LOCKED]++; - results->interval->proc_cat_count[PW_CATEGORY_LOCKED][interval_index]++; - break; - } + tmp = category; category = proc->prev_category; proc->prev_category = tmp; + tmp = extension; extension = proc->prev_extension; proc->prev_extension = tmp; + tmp = is_locked; is_locked = proc->prev_is_locked; proc->prev_is_locked = tmp; +#elif __aarch64__ + { + uint8_t tmp_groups[8]; + uint8_t tmp_count; + int j; + tmp_count = groups_count; + for(j = 0; j < tmp_count; j++) tmp_groups[j] = groups[j]; + groups_count = proc->prev_groups_count; + for(j = 0; j < groups_count; j++) groups[j] = proc->prev_groups[j]; + proc->prev_groups_count = tmp_count; + for(j = 0; j < tmp_count; j++) proc->prev_groups[j] = tmp_groups[j]; } - } +#endif + } else { + /* First sample for this process; store current and skip counting */ + proc->has_prev = 1; + proc->prev_success = success; + proc->prev_mnemonic = mnemonic; + proc->prev_category = category; +#ifdef __x86_64__ + proc->prev_extension = extension; + proc->prev_is_locked = is_locked; #elif __aarch64__ - int i; - // Capstone (LLVM) puts some instructions in 0, 1 or more groups - for (i = 0; i < insn[0].detail->groups_count; i++) { - category = insn[0].detail->groups[i]; + { + int j; + proc->prev_groups_count = groups_count; + for(j = 0; j < groups_count; j++) proc->prev_groups[j] = groups[j]; + } +#endif + skip_sample = 1; + } + } + + if(!skip_sample) { + + if(success) { + results->interval->insn_count[mnemonic]++; + results->interval->proc_insn_count[mnemonic][interval_index]++; + +#ifdef __x86_64__ results->interval->cat_count[category]++; 
results->interval->proc_cat_count[category][interval_index]++; - } - cs_free(insn, count); + results->interval->ext_count[extension]++; + results->interval->proc_ext_count[extension][interval_index]++; + + if(is_locked) { + results->interval->cat_count[PW_CATEGORY_LOCKED]++; + results->interval->proc_cat_count[PW_CATEGORY_LOCKED][interval_index]++; + } +#elif __aarch64__ + /* Capstone (LLVM) puts some instructions in 0, 1 or more groups */ + for (i = 0; i < groups_count; i++) { + category = groups[i]; + results->interval->cat_count[category]++; + results->interval->proc_cat_count[category][interval_index]++; + } #endif - - } else { - results->interval->num_failed++; - results->interval->proc_num_failed[interval_index]++; - results->num_failed++; - } + + } else { + results->interval->num_failed++; + results->interval->proc_num_failed[interval_index]++; + results->num_failed++; + } + + results->interval->num_samples++; + results->interval->proc_num_samples[interval_index]++; + results->interval->pids[interval_index] = insn_info->pid; + results->num_samples++; - results->interval->num_samples++; - results->interval->proc_num_samples[interval_index]++; - results->interval->pids[interval_index] = insn_info->pid; - results->num_samples++; + } if(pthread_rwlock_unlock(&results_lock) != 0) { fprintf(stderr, "Failed to unlock the lock! 
Aborting.\n"); From ec6fda972212505988f58276599fa0b21adf9558 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 21 Feb 2026 15:44:28 +0000 Subject: [PATCH 3/3] Fix code review issues: move prev_category and its assignment under the __x86_64__ guard Co-authored-by: rdementi <25432609+rdementi@users.noreply.github.com> --- src/processwatch.h | 2 +- src/results.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/processwatch.h b/src/processwatch.h index ce8f376..669e69a 100644 --- a/src/processwatch.h +++ b/src/processwatch.h @@ -106,8 +106,8 @@ typedef struct { int has_prev; int prev_success; int prev_mnemonic; - int prev_category; #ifdef __x86_64__ + int prev_category; int prev_extension; int prev_is_locked; #elif __aarch64__ diff --git a/src/results.h b/src/results.h index bf3bae7..a696913 100644 --- a/src/results.h +++ b/src/results.h @@ -127,8 +127,8 @@ static int handle_sample(void *ctx, void *data, size_t data_sz) { proc->has_prev = 1; proc->prev_success = success; proc->prev_mnemonic = mnemonic; - proc->prev_category = category; #ifdef __x86_64__ + proc->prev_category = category; proc->prev_extension = extension; proc->prev_is_locked = is_locked; #elif __aarch64__