// SPDX-License-Identifier: GPL-2.0 /* * Copyright IBM Corp. 2018 * Auxtrace support for s390 CPU-Measurement Sampling Facility * * Author(s): Thomas Richter <tmricht@linux.ibm.com> * * Auxiliary traces are collected during 'perf record' using rbd000 event. * Several PERF_RECORD_XXX are generated during recording: * * PERF_RECORD_AUX: * Records that new data landed in the AUX buffer part. * PERF_RECORD_AUXTRACE: * Defines auxtrace data. Followed by the actual data. The contents of * the auxtrace data is dependent on the event and the CPU. * This record is generated by perf record command. For details * see Documentation/perf.data-file-format.txt. * PERF_RECORD_AUXTRACE_INFO: * Defines a table of contains for PERF_RECORD_AUXTRACE records. This * record is generated during 'perf record' command. Each record contains * up to 256 entries describing offset and size of the AUXTRACE data in the * perf.data file. * PERF_RECORD_AUXTRACE_ERROR: * Indicates an error during AUXTRACE collection such as buffer overflow. * PERF_RECORD_FINISHED_ROUND: * Perf events are not necessarily in time stamp order, as they can be * collected in parallel on different CPUs. If the events should be * processed in time order they need to be sorted first. * Perf report guarantees that there is no reordering over a * PERF_RECORD_FINISHED_ROUND boundary event. All perf records with a * time stamp lower than this record are processed (and displayed) before * the succeeding perf record are processed. * * These records are evaluated during perf report command. * * 1. PERF_RECORD_AUXTRACE_INFO is used to set up the infrastructure for * auxiliary trace data processing. See s390_cpumsf_process_auxtrace_info() * below. * Auxiliary trace data is collected per CPU. To merge the data into the report * an auxtrace_queue is created for each CPU. It is assumed that the auxtrace * data is in ascending order. * * Each queue has a double linked list of auxtrace_buffers. This list contains * the offset and size of a CPU's auxtrace data. During auxtrace processing * the data portion is mmap()'ed. * * To sort the queues in chronological order, all queue access is controlled * by the auxtrace_heap. This is basically a stack, each stack element has two * entries, the queue number and a time stamp. However the stack is sorted by * the time stamps. The highest time stamp is at the bottom the lowest * (nearest) time stamp is at the top. That sort order is maintained at all * times! * * After the auxtrace infrastructure has been setup, the auxtrace queues are * filled with data (offset/size pairs) and the auxtrace_heap is populated. * * 2. PERF_RECORD_XXX processing triggers access to the auxtrace_queues. * Each record is handled by s390_cpumsf_process_event(). The time stamp of * the perf record is compared with the time stamp located on the auxtrace_heap * top element. If that time stamp is lower than the time stamp from the * record sample, the auxtrace queues will be processed. As auxtrace queues * control many auxtrace_buffers and each buffer can be quite large, the * auxtrace buffer might be processed only partially. In this case the * position in the auxtrace_buffer of that queue is remembered and the time * stamp of the last processed entry of the auxtrace_buffer replaces the * current auxtrace_heap top. * * 3. Auxtrace_queues might run of out data and are fed by the * PERF_RECORD_AUXTRACE handling, see s390_cpumsf_process_auxtrace_event(). * * Event Generation * Each sampling-data entry in the auxiliary trace data generates a perf sample. * This sample is filled * with data from the auxtrace such as PID/TID, instruction address, CPU state, * etc. This sample is processed with perf_session__deliver_synth_event() to * be included into the GUI. * * 4. PERF_RECORD_FINISHED_ROUND event is used to process all the remaining * auxiliary traces entries until the time stamp of this record is reached * auxtrace_heap top. This is triggered by ordered_event->deliver(). * * * Perf event processing. * Event processing of PERF_RECORD_XXX entries relies on time stamp entries. * This is the function call sequence: * * __cmd_report() * | * perf_session__process_events() * | * __perf_session__process_events() * | * perf_session__process_event() * | This functions splits the PERF_RECORD_XXX records. * | - Those generated by perf record command (type number equal or higher * | than PERF_RECORD_USER_TYPE_START) are handled by * | perf_session__process_user_event(see below) * | - Those generated by the kernel are handled by * | evlist__parse_sample_timestamp() * | * evlist__parse_sample_timestamp() * | Extract time stamp from sample data. * | * perf_session__queue_event() * | If timestamp is positive the sample is entered into an ordered_event * | list, sort order is the timestamp. The event processing is deferred until * | later (see perf_session__process_user_event()). * | Other timestamps (0 or -1) are handled immediately by * | perf_session__deliver_event(). These are events generated at start up * | of command perf record. They create PERF_RECORD_COMM and PERF_RECORD_MMAP* * | records. They are needed to create a list of running processes and its * | memory mappings and layout. They are needed at the beginning to enable * | command perf report to create process trees and memory mappings. * | * perf_session__deliver_event() * | Delivers a PERF_RECORD_XXX entry for handling. * | * auxtrace__process_event() * | The timestamp of the PERF_RECORD_XXX entry is taken to correlate with * | time stamps from the auxiliary trace buffers. This enables * | synchronization between auxiliary trace data and the events on the * | perf.data file. * | * machine__deliver_event() * | Handles the PERF_RECORD_XXX event. This depends on the record type. * It might update the process tree, update a process memory map or enter * a sample with IP and call back chain data into GUI data pool. * * * Deferred processing determined by perf_session__process_user_event() is * finally processed when a PERF_RECORD_FINISHED_ROUND is encountered. These * are generated during command perf record. * The timestamp of PERF_RECORD_FINISHED_ROUND event is taken to process all * PERF_RECORD_XXX entries stored in the ordered_event list. This list was * built up while reading the perf.data file. * Each event is now processed by calling perf_session__deliver_event(). * This enables time synchronization between the data in the perf.data file and * the data in the auxiliary trace buffers. */ #include <endian.h> #include <errno.h> #include <byteswap.h> #include <inttypes.h> #include <linux/kernel.h> #include <linux/types.h> #include <linux/bitops.h> #include <linux/log2.h> #include <linux/zalloc.h> #include <sys/stat.h> #include <sys/types.h> #include "color.h" #include "evsel.h" #include "evlist.h" #include "machine.h" #include "session.h" #include "tool.h" #include "debug.h" #include "auxtrace.h" #include "s390-cpumsf.h" #include "s390-cpumsf-kernel.h" #include "s390-cpumcf-kernel.h" #include "config.h" #include "util/sample.h" struct s390_cpumsf { struct auxtrace auxtrace; struct auxtrace_queues queues; struct auxtrace_heap heap; struct perf_session *session; struct machine *machine; u32 auxtrace_type; u32 pmu_type; u16 machine_type; bool data_queued; bool use_logfile; char *logdir; }; struct s390_cpumsf_queue { struct s390_cpumsf *sf; unsigned int queue_nr; struct auxtrace_buffer *buffer; int cpu; FILE *logfile; FILE *logfile_ctr; }; /* Check if the raw data should be dumped to file. If this is the case and * the file to dump to has not been opened for writing, do so. * * Return 0 on success and greater zero on error so processing continues. */ static int s390_cpumcf_dumpctr(struct s390_cpumsf *sf, struct perf_sample *sample) { struct s390_cpumsf_queue *sfq; struct auxtrace_queue *q; int rc = 0; if (!sf->use_logfile || sf->queues.nr_queues <= sample->cpu) return rc; q = &sf->queues.queue_array[sample->cpu]; sfq = q->priv; if (!sfq) /* Queue not yet allocated */ return rc; if (!sfq->logfile_ctr) { char *name; rc = (sf->logdir) ? asprintf(&name, "%s/aux.ctr.%02x", sf->logdir, sample->cpu) : asprintf(&name, "aux.ctr.%02x", sample->cpu); if (rc > 0) sfq->logfile_ctr = fopen(name, "w"); if (sfq->logfile_ctr == NULL) { pr_err("Failed to open counter set log file %s, " "continue...\n", name); rc = 1; } free(name); } if (sfq->logfile_ctr) { /* See comment above for -4 */ size_t n = fwrite(sample->raw_data, sample->raw_size - 4, 1, sfq->logfile_ctr); if (n != 1) { pr_err("Failed to write counter set data\n"); rc = 1; } } return rc; } /* Display s390 CPU measurement facility basic-sampling data entry * Data written on s390 in big endian byte order and contains bit * fields across byte boundaries. */ static bool s390_cpumsf_basic_show(const char *color, size_t pos, struct hws_basic_entry *basicp) { struct hws_basic_entry *basic = basicp; #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ struct hws_basic_entry local; unsigned long long word = be64toh(*(unsigned long long *)basicp); memset(&local, 0, sizeof(local)); local.def = be16toh(basicp->def); local.prim_asn = word & 0xffff; local.CL = word >> 30 & 0x3; local.I = word >> 32 & 0x1; local.AS = word >> 33 & 0x3; local.P = word >> 35 & 0x1; local.W = word >> 36 & 0x1; local.T = word >> 37 & 0x1; local.U = word >> 40 & 0xf; local.ia = be64toh(basicp->ia); local.gpp = be64toh(basicp->gpp); local.hpp = be64toh(basicp->hpp); basic = &local; #endif if (basic->def != 1) { pr_err("Invalid AUX trace basic entry [%#08zx]\n", pos); return false; } color_fprintf(stdout, color, " [%#08zx] Basic Def:%04x Inst:%#04x" " %c%c%c%c AS:%d ASN:%#04x IA:%#018llx\n" "\t\tCL:%d HPP:%#018llx GPP:%#018llx\n", pos, basic->def, basic->U, basic->T ? 'T' : ' ', basic->W ? 'W' : ' ', basic->P ? 'P' : ' ', basic->I ? 'I' : ' ', basic->AS, basic->prim_asn, basic->ia, basic->CL, basic->hpp, basic->gpp); return true; } /* Display s390 CPU measurement facility diagnostic-sampling data entry. * Data written on s390 in big endian byte order and contains bit * fields across byte boundaries. */ static bool s390_cpumsf_diag_show(const char *color, size_t pos, struct hws_diag_entry *diagp) { struct hws_diag_entry *diag = diagp; #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ struct hws_diag_entry local; unsigned long long word = be64toh(*(unsigned long long *)diagp); local.def = be16toh(diagp->def); local.I = word >> 32 & 0x1; diag = &local; #endif if (diag->def < S390_CPUMSF_DIAG_DEF_FIRST) { pr_err("Invalid AUX trace diagnostic entry [%#08zx]\n", pos); return false; } color_fprintf(stdout, color, " [%#08zx] Diag Def:%04x %c\n", pos, diag->def, diag->I ? 'I' : ' '); return true; } /* Return TOD timestamp contained in an trailer entry */ static unsigned long long trailer_timestamp(struct hws_trailer_entry *te, int idx) { /* te->t set: TOD in STCKE format, bytes 8-15 * to->t not set: TOD in STCK format, bytes 0-7 */ unsigned long long ts; memcpy(&ts, &te->timestamp[idx], sizeof(ts)); return be64toh(ts); } /* Display s390 CPU measurement facility trailer entry */ static bool s390_cpumsf_trailer_show(const char *color, size_t pos, struct hws_trailer_entry *te) { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ struct hws_trailer_entry local; const unsigned long long flags = be64toh(te->flags); memset(&local, 0, sizeof(local)); local.f = flags >> 63 & 0x1; local.a = flags >> 62 & 0x1; local.t = flags >> 61 & 0x1; local.bsdes = be16toh((flags >> 16 & 0xffff)); local.dsdes = be16toh((flags & 0xffff)); memcpy(&local.timestamp, te->timestamp, sizeof(te->timestamp)); local.overflow = be64toh(te->overflow); local.clock_base = be64toh(te->progusage[0]) >> 63 & 1; local.progusage2 = be64toh(te->progusage2); te = &local; #endif if (te->bsdes != sizeof(struct hws_basic_entry)) { pr_err("Invalid AUX trace trailer entry [%#08zx]\n", pos); return false; } color_fprintf(stdout, color, " [%#08zx] Trailer %c%c%c bsdes:%d" " dsdes:%d Overflow:%lld Time:%#llx\n" "\t\tC:%d TOD:%#lx\n", pos, te->f ? 'F' : ' ', te->a ? 'A' : ' ', te->t ? 'T' : ' ', te->bsdes, te->dsdes, te->overflow, trailer_timestamp(te, te->clock_base), te->clock_base, te->progusage2); return true; } /* Test a sample data block. It must be 4KB or a multiple thereof in size and * 4KB page aligned. Each sample data page has a trailer entry at the * end which contains the sample entry data sizes. * * Return true if the sample data block passes the checks and set the * basic set entry size and diagnostic set entry size. * * Return false on failure. * * Note: Old hardware does not set the basic or diagnostic entry sizes * in the trailer entry. Use the type number instead. */ static bool s390_cpumsf_validate(int machine_type, unsigned char *buf, size_t len, unsigned short *bsdes, unsigned short *dsdes) { struct hws_basic_entry *basic = (struct hws_basic_entry *)buf; struct hws_trailer_entry *te; *dsdes = *bsdes = 0; if (len & (S390_CPUMSF_PAGESZ - 1)) /* Illegal size */ return false; if (be16toh(basic->def) != 1) /* No basic set entry, must be first */ return false; /* Check for trailer entry at end of SDB */ te = (struct hws_trailer_entry *)(buf + S390_CPUMSF_PAGESZ - sizeof(*te)); *bsdes = be16toh(te->bsdes); *dsdes = be16toh(te->dsdes); if (!te->bsdes && !te->dsdes) { /* Very old hardware, use CPUID */ switch (machine_type) { case 2097: case 2098: *dsdes = 64; *bsdes = 32; break; case 2817: case 2818: *dsdes = 74; *bsdes = 32; break; case 2827: case 2828: *dsdes = 85; *bsdes = 32; break; case 2964: case 2965: *dsdes = 112; *bsdes = 32; break; default: /* Illegal trailer entry */ return false; } } return true; } /* Return true if there is room for another entry */ static bool s390_cpumsf_reached_trailer(size_t entry_sz, size_t pos) { size_t payload = S390_CPUMSF_PAGESZ - sizeof(struct hws_trailer_entry); if (payload - (pos & (S390_CPUMSF_PAGESZ - 1)) < entry_sz) return false; return true; } /* Dump an auxiliary buffer. These buffers are multiple of * 4KB SDB pages. */ static void s390_cpumsf_dump(struct s390_cpumsf *sf, unsigned char *buf, size_t len) { const char *color = PERF_COLOR_BLUE; struct hws_basic_entry *basic; struct hws_diag_entry *diag; unsigned short bsdes, dsdes; size_t pos = 0; color_fprintf(stdout, color, ". ... s390 AUX data: size %zu bytes\n", len); if (!s390_cpumsf_validate(sf->machine_type, buf, len, &bsdes, &dsdes)) { pr_err("Invalid AUX trace data block size:%zu" " (type:%d bsdes:%hd dsdes:%hd)\n", len, sf->machine_type, bsdes, dsdes); return; } /* s390 kernel always returns 4KB blocks fully occupied, * no partially filled SDBs. */ while (pos < len) { /* Handle Basic entry */ basic = (struct hws_basic_entry *)(buf + pos); if (s390_cpumsf_basic_show(color, pos, basic)) pos += bsdes; else return; /* Handle Diagnostic entry */ diag = (struct hws_diag_entry *)(buf + pos); if (s390_cpumsf_diag_show(color, pos, diag)) pos += dsdes; else return; /* Check for trailer entry */ if (!s390_cpumsf_reached_trailer(bsdes + dsdes, pos)) { /* Show trailer entry */ struct hws_trailer_entry te; pos = (pos + S390_CPUMSF_PAGESZ) & ~(S390_CPUMSF_PAGESZ - 1); pos -= sizeof(te); memcpy(&te, buf + pos, sizeof(te)); /* Set descriptor sizes in case of old hardware * where these values are not set. */ te.bsdes = bsdes; te.dsdes = dsdes; if (s390_cpumsf_trailer_show(color, pos, &te)) pos += sizeof(te); else return; } } } static void s390_cpumsf_dump_event(struct s390_cpumsf *sf, unsigned char *buf, size_t len) { printf(".\n"); s390_cpumsf_dump(sf, buf, len); } #define S390_LPP_PID_MASK 0xffffffff static bool s390_cpumsf_make_event(size_t pos, struct hws_basic_entry *basic, struct s390_cpumsf_queue *sfq) { struct perf_sample sample = { .ip = basic->ia, .pid = basic->hpp & S390_LPP_PID_MASK, .tid = basic->hpp & S390_LPP_PID_MASK, .cpumode = PERF_RECORD_MISC_CPUMODE_UNKNOWN, .cpu = sfq->cpu, .period = 1 }; union perf_event event; memset(&event, 0, sizeof(event)); if (basic->CL == 1) /* Native LPAR mode */ sample.cpumode = basic->P ? PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL; else if (basic->CL == 2) /* Guest kernel/user space */ sample.cpumode = basic->P ? PERF_RECORD_MISC_GUEST_USER : PERF_RECORD_MISC_GUEST_KERNEL; else if (basic->gpp || basic->prim_asn != 0xffff) /* Use heuristics on old hardware */ sample.cpumode = basic->P ? PERF_RECORD_MISC_GUEST_USER : PERF_RECORD_MISC_GUEST_KERNEL; else sample.cpumode = basic->P ? PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL; event.sample.header.type = PERF_RECORD_SAMPLE; event.sample.header.misc = sample.cpumode; event.sample.header.size = sizeof(struct perf_event_header); pr_debug4("%s pos:%#zx ip:%#" PRIx64 " P:%d CL:%d pid:%d.%d cpumode:%d cpu:%d\n", __func__, pos, sample.ip, basic->P, basic->CL, sample.pid, sample.tid, sample.cpumode, sample.cpu); if (perf_session__deliver_synth_event(sfq->sf->session, &event, &sample)) { pr_err("s390 Auxiliary Trace: failed to deliver event\n"); return false; } return true; } static unsigned long long get_trailer_time(const unsigned char *buf) { struct hws_trailer_entry *te; unsigned long long aux_time, progusage2; bool clock_base; te = (struct hws_trailer_entry *)(buf + S390_CPUMSF_PAGESZ - sizeof(*te)); #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ clock_base = be64toh(te->progusage[0]) >> 63 & 0x1; progusage2 = be64toh(te->progusage[1]); #else clock_base = te->clock_base; progusage2 = te->progusage2; #endif if (!clock_base) /* TOD_CLOCK_BASE value missing */ return 0; /* Correct calculation to convert time stamp in trailer entry to * nano seconds (taken from arch/s390 function tod_to_ns()). * TOD_CLOCK_BASE is stored in trailer entry member progusage2. */ aux_time = trailer_timestamp(te, clock_base) - progusage2; aux_time = (aux_time >> 9) * 125 + (((aux_time & 0x1ff) * 125) >> 9); return aux_time; } /* Process the data samples of a single queue. The first parameter is a * pointer to the queue, the second parameter is the time stamp. This * is the time stamp: * - of the event that triggered this processing. * - or the time stamp when the last processing of this queue stopped. * In this case it stopped at a 4KB page boundary and record the * position on where to continue processing on the next invocation * (see buffer->use_data and buffer->use_size). * * When this function returns the second parameter is updated to * reflect the time stamp of the last processed auxiliary data entry * (taken from the trailer entry of that page). The caller uses this * returned time stamp to record the last processed entry in this * queue. * * The function returns: * 0: Processing successful. The second parameter returns the * time stamp from the trailer entry until which position * processing took place. Subsequent calls resume from this * position. * <0: An error occurred during processing. The second parameter * returns the maximum time stamp. * >0: Done on this queue. The second parameter returns the * maximum time stamp. */ static int s390_cpumsf_samples(struct s390_cpumsf_queue *sfq, u64 *ts) { struct s390_cpumsf *sf = sfq->sf; unsigned char *buf = sfq->buffer->use_data; size_t len = sfq->buffer->use_size; struct hws_basic_entry *basic; unsigned short bsdes, dsdes; size_t pos = 0; int err = 1; u64 aux_ts; if (!s390_cpumsf_validate(sf->machine_type, buf, len, &bsdes, &dsdes)) { *ts = ~0ULL; return -1; } /* Get trailer entry time stamp and check if entries in * this auxiliary page are ready for processing. If the * time stamp of the first entry is too high, whole buffer * can be skipped. In this case return time stamp. */ aux_ts = get_trailer_time(buf); if (!aux_ts) { pr_err("[%#08" PRIx64 "] Invalid AUX trailer entry TOD clock base\n", (s64)sfq->buffer->data_offset); aux_ts = ~0ULL; goto out; } if (aux_ts > *ts) { *ts = aux_ts; return 0; } while (pos < len) { /* Handle Basic entry */ basic = (struct hws_basic_entry *)(buf + pos); if (s390_cpumsf_make_event(pos, basic, sfq)) pos += bsdes; else { err = -EBADF; goto out; } pos += dsdes; /* Skip diagnostic entry */ /* Check for trailer entry */ if (!s390_cpumsf_reached_trailer(bsdes + dsdes, pos)) { pos = (pos + S390_CPUMSF_PAGESZ) & ~(S390_CPUMSF_PAGESZ - 1); /* Check existence of next page */ if (pos >= len) break; aux_ts = get_trailer_time(buf + pos); if (!aux_ts) { aux_ts = ~0ULL; goto out; } if (aux_ts > *ts) { *ts = aux_ts; sfq->buffer->use_data += pos; sfq->buffer->use_size -= pos; return 0; } } } out: *ts = aux_ts; sfq->buffer->use_size = 0; sfq->buffer->use_data = NULL; return err; /* Buffer completely scanned or error */ } /* Run the s390 auxiliary trace decoder. * Select the queue buffer to operate on, the caller already selected * the proper queue, depending on second parameter 'ts'. * This is the time stamp until which the auxiliary entries should * be processed. This value is updated by called functions and * returned to the caller. * * Resume processing in the current buffer. If there is no buffer * get a new buffer from the queue and setup start position for * processing. * When a buffer is completely processed remove it from the queue * before returning. * * This function returns * 1: When the queue is empty. Second parameter will be set to * maximum time stamp. * 0: Normal processing done. * <0: Error during queue buffer setup. This causes the caller * to stop processing completely. */ static int s390_cpumsf_run_decoder(struct s390_cpumsf_queue *sfq, u64 *ts) { struct auxtrace_buffer *buffer; struct auxtrace_queue *queue; int err; queue = &sfq->sf->queues.queue_array[sfq->queue_nr]; /* Get buffer and last position in buffer to resume * decoding the auxiliary entries. One buffer might be large * and decoding might stop in between. This depends on the time * stamp of the trailer entry in each page of the auxiliary * data and the time stamp of the event triggering the decoding. */ if (sfq->buffer == NULL) { sfq->buffer = buffer = auxtrace_buffer__next(queue, sfq->buffer); if (!buffer) { *ts = ~0ULL; return 1; /* Processing done on this queue */ } /* Start with a new buffer on this queue */ if (buffer->data) { buffer->use_size = buffer->size; buffer->use_data = buffer->data; } if (sfq->logfile) { /* Write into log file */ size_t rc = fwrite(buffer->data, buffer->size, 1, sfq->logfile); if (rc != 1) pr_err("Failed to write auxiliary data\n"); } } else buffer = sfq->buffer; if (!buffer->data) { int fd = perf_data__fd(sfq->sf->session->data); buffer->data = auxtrace_buffer__get_data(buffer, fd); if (!buffer->data) return -ENOMEM; buffer->use_size = buffer->size; buffer->use_data = buffer->data; if (sfq->logfile) { /* Write into log file */ size_t rc = fwrite(buffer->data, buffer->size, 1, sfq->logfile); if (rc != 1) pr_err("Failed to write auxiliary data\n"); } } pr_debug4("%s queue_nr:%d buffer:%" PRId64 " offset:%#" PRIx64 " size:%#zx rest:%#zx\n", __func__, sfq->queue_nr, buffer->buffer_nr, buffer->offset, buffer->size, buffer->use_size); err = s390_cpumsf_samples(sfq, ts); /* If non-zero, there is either an error (err < 0) or the buffer is * completely done (err > 0). The error is unrecoverable, usually * some descriptors could not be read successfully, so continue with * the next buffer. * In both cases the parameter 'ts' has been updated. */ if (err) { sfq->buffer = NULL; list_del_init(&buffer->list); auxtrace_buffer__free(buffer); if (err > 0) /* Buffer done, no error */ err = 0; } return err; } static struct s390_cpumsf_queue * s390_cpumsf_alloc_queue(struct s390_cpumsf *sf, unsigned int queue_nr) { struct s390_cpumsf_queue *sfq; sfq = zalloc(sizeof(struct s390_cpumsf_queue)); if (sfq == NULL) return NULL; sfq->sf = sf; sfq->queue_nr = queue_nr; sfq->cpu = -1; if (sf->use_logfile) { char *name; int rc; rc = (sf->logdir) ? asprintf(&name, "%s/aux.smp.%02x", sf->logdir, queue_nr) : asprintf(&name, "aux.smp.%02x", queue_nr); if (rc > 0) sfq->logfile = fopen(name, "w"); if (sfq->logfile == NULL) { pr_err("Failed to open auxiliary log file %s," "continue...\n", name); sf->use_logfile = false; } free(name); } return sfq; } static int s390_cpumsf_setup_queue(struct s390_cpumsf *sf, struct auxtrace_queue *queue, unsigned int queue_nr, u64 ts) { struct s390_cpumsf_queue *sfq = queue->priv; if (list_empty(&queue->head)) return 0; if (sfq == NULL) { sfq = s390_cpumsf_alloc_queue(sf, queue_nr); if (!sfq) return -ENOMEM; queue->priv = sfq; if (queue->cpu != -1) sfq->cpu = queue->cpu; } return auxtrace_heap__add(&sf->heap, queue_nr, ts); } static int s390_cpumsf_setup_queues(struct s390_cpumsf *sf, u64 ts) { unsigned int i; int ret = 0; for (i = 0; i < sf->queues.nr_queues; i++) { ret = s390_cpumsf_setup_queue(sf, &sf->queues.queue_array[i], i, ts); if (ret) break; } return ret; } static int s390_cpumsf_update_queues(struct s390_cpumsf *sf, u64 ts) { if (!sf->queues.new_data) return 0; sf->queues.new_data = false; return s390_cpumsf_setup_queues(sf, ts); } static int s390_cpumsf_process_queues(struct s390_cpumsf *sf, u64 timestamp) { unsigned int queue_nr; u64 ts; int ret; while (1) { struct auxtrace_queue *queue; struct s390_cpumsf_queue *sfq; if (!sf->heap.heap_cnt) return 0; if (sf->heap.heap_array[0].ordinal >= timestamp) return 0; queue_nr = sf->heap.heap_array[0].queue_nr; queue = &sf->queues.queue_array[queue_nr]; sfq = queue->priv; auxtrace_heap__pop(&sf->heap); if (sf->heap.heap_cnt) { ts = sf->heap.heap_array[0].ordinal + 1; if (ts > timestamp) ts = timestamp; } else { ts = timestamp; } ret = s390_cpumsf_run_decoder(sfq, &ts); if (ret < 0) { auxtrace_heap__add(&sf->heap, queue_nr, ts); return ret; } if (!ret) { ret = auxtrace_heap__add(&sf->heap, queue_nr, ts); if (ret < 0) return ret; } } return 0; } static int s390_cpumsf_synth_error(struct s390_cpumsf *sf, int code, int cpu, pid_t pid, pid_t tid, u64 ip, u64 timestamp) { char msg[MAX_AUXTRACE_ERROR_MSG]; union perf_event event; int err; strncpy(msg, "Lost Auxiliary Trace Buffer", sizeof(msg) - 1); auxtrace_synth_error(&event.auxtrace_error, PERF_AUXTRACE_ERROR_ITRACE, code, cpu, pid, tid, ip, msg, timestamp); err = perf_session__deliver_synth_event(sf->session, &event, NULL); if (err) pr_err("s390 Auxiliary Trace: failed to deliver error event," "error %d\n", err); return err; } static int s390_cpumsf_lost(struct s390_cpumsf *sf, struct perf_sample *sample) { return s390_cpumsf_synth_error(sf, 1, sample->cpu, sample->pid, sample->tid, 0, sample->time); } static int s390_cpumsf_process_event(struct perf_session *session, union perf_event *event, struct perf_sample *sample, struct perf_tool *tool) { struct s390_cpumsf *sf = container_of(session->auxtrace, struct s390_cpumsf, auxtrace); u64 timestamp = sample->time; struct evsel *ev_bc000; int err = 0; if (dump_trace) return 0; if (!tool->ordered_events) { pr_err("s390 Auxiliary Trace requires ordered events\n"); return -EINVAL; } if (event->header.type == PERF_RECORD_SAMPLE && sample->raw_size) { /* Handle event with raw data */ ev_bc000 = evlist__event2evsel(session->evlist, event); if (ev_bc000 && ev_bc000->core.attr.config == PERF_EVENT_CPUM_CF_DIAG) err = s390_cpumcf_dumpctr(sf, sample); return err; } if (event->header.type == PERF_RECORD_AUX && event->aux.flags & PERF_AUX_FLAG_TRUNCATED) return s390_cpumsf_lost(sf, sample); if (timestamp) { err = s390_cpumsf_update_queues(sf, timestamp); if (!err) err = s390_cpumsf_process_queues(sf, timestamp); } return err; } struct s390_cpumsf_synth { struct perf_tool cpumsf_tool; struct perf_session *session; }; static int s390_cpumsf_process_auxtrace_event(struct perf_session *session, union perf_event *event __maybe_unused, struct perf_tool *tool __maybe_unused) { struct s390_cpumsf *sf = container_of(session->auxtrace, struct s390_cpumsf, auxtrace); int fd = perf_data__fd(session->data); struct auxtrace_buffer *buffer; off_t data_offset; int err; if (sf->data_queued) return 0; if (perf_data__is_pipe(session->data)) { data_offset = 0; } else { data_offset = lseek(fd, 0, SEEK_CUR); if (data_offset == -1) return -errno; } err = auxtrace_queues__add_event(&sf->queues, session, event, data_offset, &buffer); if (err) return err; /* Dump here after copying piped trace out of the pipe */ if (dump_trace) { if (auxtrace_buffer__get_data(buffer, fd)) { s390_cpumsf_dump_event(sf, buffer->data, buffer->size); auxtrace_buffer__put_data(buffer); } } return 0; } static void s390_cpumsf_free_events(struct perf_session *session __maybe_unused) { } static int s390_cpumsf_flush(struct perf_session *session __maybe_unused, struct perf_tool *tool __maybe_unused) { return 0; } static void s390_cpumsf_free_queues(struct perf_session *session) { struct s390_cpumsf *sf = container_of(session->auxtrace, struct s390_cpumsf, auxtrace); struct auxtrace_queues *queues = &sf->queues; unsigned int i; for (i = 0; i < queues->nr_queues; i++) { struct s390_cpumsf_queue *sfq = (struct s390_cpumsf_queue *) queues->queue_array[i].priv; if (sfq != NULL) { if (sfq->logfile) { fclose(sfq->logfile); sfq->logfile = NULL; } if (sfq->logfile_ctr) { fclose(sfq->logfile_ctr); sfq->logfile_ctr = NULL; } } zfree(&queues->queue_array[i].priv); } auxtrace_queues__free(queues); } static void s390_cpumsf_free(struct perf_session *session) { struct s390_cpumsf *sf = container_of(session->auxtrace, struct s390_cpumsf, auxtrace); auxtrace_heap__free(&sf->heap); s390_cpumsf_free_queues(session); session->auxtrace = NULL; zfree(&sf->logdir); free(sf); } static bool s390_cpumsf_evsel_is_auxtrace(struct perf_session *session __maybe_unused, struct evsel *evsel) { return evsel->core.attr.type == PERF_TYPE_RAW && evsel->core.attr.config == PERF_EVENT_CPUM_SF_DIAG; } static int s390_cpumsf_get_type(const char *cpuid) { int ret, family = 0; ret = sscanf(cpuid, "%*[^,],%u", &family); return (ret == 1) ? family : 0; } /* Check itrace options set on perf report command. * Return true, if none are set or all options specified can be * handled on s390 (currently only option 'd' for logging. * Return false otherwise. */ static bool check_auxtrace_itrace(struct itrace_synth_opts *itops) { bool ison = false; if (!itops || !itops->set) return true; ison = itops->inject || itops->instructions || itops->branches || itops->transactions || itops->ptwrites || itops->pwr_events || itops->errors || itops->dont_decode || itops->calls || itops->returns || itops->callchain || itops->thread_stack || itops->last_branch || itops->add_callchain || itops->add_last_branch; if (!ison) return true; pr_err("Unsupported --itrace options specified\n"); return false; } /* Check for AUXTRACE dump directory if it is needed. * On failure print an error message but continue. * Return 0 on wrong keyword in config file and 1 otherwise. */ static int s390_cpumsf__config(const char *var, const char *value, void *cb) { struct s390_cpumsf *sf = cb; struct stat stbuf; int rc; if (strcmp(var, "auxtrace.dumpdir")) return 0; sf->logdir = strdup(value); if (sf->logdir == NULL) { pr_err("Failed to find auxtrace log directory %s," " continue with current directory...\n", value); return 1; } rc = stat(sf->logdir, &stbuf); if (rc == -1 || !S_ISDIR(stbuf.st_mode)) { pr_err("Missing auxtrace log directory %s," " continue with current directory...\n", value); zfree(&sf->logdir); } return 1; } int s390_cpumsf_process_auxtrace_info(union perf_event *event, struct perf_session *session) { struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info; struct s390_cpumsf *sf; int err; if (auxtrace_info->header.size < sizeof(struct perf_record_auxtrace_info)) return -EINVAL; sf = zalloc(sizeof(struct s390_cpumsf)); if (sf == NULL) return -ENOMEM; if (!check_auxtrace_itrace(session->itrace_synth_opts)) { err = -EINVAL; goto err_free; } sf->use_logfile = session->itrace_synth_opts->log; if (sf->use_logfile) perf_config(s390_cpumsf__config, sf); err = auxtrace_queues__init(&sf->queues); if (err) goto err_free; sf->session = session; sf->machine = &session->machines.host; /* No kvm support */ sf->auxtrace_type = auxtrace_info->type; sf->pmu_type = PERF_TYPE_RAW; sf->machine_type = s390_cpumsf_get_type(session->evlist->env->cpuid); sf->auxtrace.process_event = s390_cpumsf_process_event; sf->auxtrace.process_auxtrace_event = s390_cpumsf_process_auxtrace_event; sf->auxtrace.flush_events = s390_cpumsf_flush; sf->auxtrace.free_events = s390_cpumsf_free_events; sf->auxtrace.free = s390_cpumsf_free; sf->auxtrace.evsel_is_auxtrace = s390_cpumsf_evsel_is_auxtrace; session->auxtrace = &sf->auxtrace; if (dump_trace) return 0; err = auxtrace_queues__process_index(&sf->queues, session); if (err) goto err_free_queues; if (sf->queues.populated) sf->data_queued = true; return 0; err_free_queues: auxtrace_queues__free(&sf->queues); session->auxtrace = NULL; err_free: zfree(&sf->logdir); free(sf); return err; }