#!/usr/bin/env perl
# This is a POC (proof of concept or piece of crap, take your pick) for reading the
# text representation of trace output related to page allocation. It makes an attempt
# to extract some high-level information on what is going on. The accuracy of the parser
# may vary considerably
#
# Example usage: trace-pagealloc-postprocess.pl < /sys/kernel/tracing/trace_pipe
# other options
#   --prepend-parent    Report on the parent proc and PID
#   --read-procstat     If the trace lacks process info, get it from /proc
#   --ignore-pid        Aggregate processes of the same name together
#
# Copyright (c) IBM Corporation 2009
# Author: Mel Gorman <mel@csn.ul.ie>
use strict;
use warnings;
use Getopt::Long;

# Tracepoint events
#
# NOTE: the statistics hashes below are indexed with barewords inside {...},
# which Perl auto-quotes. The keys are therefore the *names* of these
# constants (e.g. "MM_PAGE_ALLOC"), not their numeric values; the constants
# serve as the catalogue of valid keys.
use constant MM_PAGE_ALLOC              => 1;
use constant MM_PAGE_FREE               => 2;
use constant MM_PAGE_FREE_BATCHED       => 3;
use constant MM_PAGE_PCPU_DRAIN         => 4;
use constant MM_PAGE_ALLOC_ZONE_LOCKED  => 5;
use constant MM_PAGE_ALLOC_EXTFRAG      => 6;
use constant EVENT_UNKNOWN              => 7;

# Constants used to track state
use constant STATE_PCPU_PAGES_DRAINED   => 8;
use constant STATE_PCPU_PAGES_REFILLED  => 9;

# High-level events extrapolated from tracepoints
use constant HIGH_PCPU_DRAINS           => 10;
use constant HIGH_PCPU_REFILLS          => 11;
use constant HIGH_EXT_FRAGMENT          => 12;
use constant HIGH_EXT_FRAGMENT_SEVERE   => 13;
use constant HIGH_EXT_FRAGMENT_MODERATE => 14;
use constant HIGH_EXT_FRAGMENT_CHANGED  => 15;

# Counters shown in the report, in column order. Also the set of counters
# summed when aggregating by process name.
my @REPORT_COLUMNS = qw(
    MM_PAGE_ALLOC
    MM_PAGE_ALLOC_ZONE_LOCKED
    MM_PAGE_FREE
    MM_PAGE_FREE_BATCHED
    MM_PAGE_PCPU_DRAIN
    HIGH_PCPU_DRAINS
    HIGH_PCPU_REFILLS
    MM_PAGE_ALLOC_EXTFRAG
    HIGH_EXT_FRAGMENT
    HIGH_EXT_FRAGMENT_CHANGED
    HIGH_EXT_FRAGMENT_SEVERE
    HIGH_EXT_FRAGMENT_MODERATE
    EVENT_UNKNOWN
);

my %perprocesspid;      # "name-pid" label => { counter name => count }
my %perprocess;         # "name" label     => { counter name => count }
my $opt_ignorepid;
my $opt_read_procstat;
my $opt_prepend_parent;

# Catch sigint and exit on request
my $sigint_report = 0;
my $sigint_exit = 0;
my $sigint_pending = 0;
my $sigint_received = 0;

# SIGINT handler. A lone ctrl-c requests a report; a second one within two
# seconds requests exit; more than three quick repeats exit immediately
# without a report. Only flags are set here, the work happens in signal_loop().
sub sigint_handler {
    my $current_time = time;

    if ($current_time - 2 > $sigint_received) {
        print "SIGINT received, report pending. Hit ctrl-c again to exit\n";
        $sigint_report = 1;
    }
    else {
        if (!$sigint_exit) {
            print "Second SIGINT received quickly, exiting\n";
        }
        $sigint_exit++;
    }

    if ($sigint_exit > 3) {
        print "Many SIGINTs received, exiting now without report\n";
        exit;
    }

    $sigint_received = $current_time;
    $sigint_pending = 1;
}
$SIG{INT} = \&sigint_handler;

# Parse command line options
GetOptions(
    'ignore-pid'     => \$opt_ignorepid,
    'read-procstat'  => \$opt_read_procstat,
    'prepend-parent' => \$opt_prepend_parent,
);

# Defaults for dynamically discovered regex's
my $regex_fragdetails_default = 'page=([0-9a-f]*) pfn=([0-9]*) alloc_order=([-0-9]*) fallback_order=([-0-9]*) pageblock_order=([-0-9]*) alloc_migratetype=([-0-9]*) fallback_migratetype=([-0-9]*) fragmenting=([-0-9]) change_ownership=([-0-9])';

# Dynamically discovered regex
my $regex_fragdetails;

# Static regex used. Specified like this for readability; compiled once below.
#                          (process_pid)     (cpus      )   ( time  )   (tpoint    ) (details)
my $regex_traceevent = '\s*([a-zA-Z0-9-]*)\s*(\[[0-9]*\])\s*([0-9.]*):\s*([a-zA-Z_]*):\s*(.*)';
my $regex_statname = '[-0-9]*\s\((.*)\).*';
my $regex_statppid = '[-0-9]*\s\(.*\)\s[A-Za-z]\s([0-9]*).*';

my $re_traceevent = qr/$regex_traceevent/;
my $re_statname   = qr/$regex_statname/;
my $re_statppid   = qr/$regex_statppid/;

# Build the regex (as a string) used to parse the details of a tracepoint.
#
#   $event            tracepoint path below /sys/kernel/tracing/events
#   $default          regex string used when the format cannot be discovered
#   @expected_fields  field names, in the order their captures are consumed
#
# The "print fmt:" line of the event's format file is converted by replacing
# %p, %d and %lu with capture groups. If the file cannot be opened, or has no
# such line, $default is used. A field whose name differs from the expected
# one is reported and loses its capture group. Dies if the format has fewer
# fields than expected.
sub generate_traceevent_regex {
    my ($event, $default, @expected_fields) = @_;
    my $regex;

    # Read the event format or use the default
    if (open(my $format_fh, '<', "/sys/kernel/tracing/events/$event/format")) {
        while (my $line = <$format_fh>) {
            if ($line =~ /^print fmt:\s"(.*)",.*/) {
                $regex = $1;
                $regex =~ s{%p}{([0-9a-f]*)}g;
                $regex =~ s{%d}{([-0-9]*)}g;
                $regex =~ s{%lu}{([0-9]*)}g;
            }
        }
        close($format_fh);
    }
    $regex = $default unless defined $regex;

    # Verify fields are in the right order
    for my $tuple (split /\s/, $regex) {
        my ($key) = split(/=/, $tuple);
        $key = '' unless defined $key;
        my $expected = shift @expected_fields;
        $expected = '' unless defined $expected;
        next if $key eq $expected;

        print("WARNING: Format not as expected '$key' != '$expected'\n");
        # Non-greedy so only this field's own parentheses are removed and
        # the rest of the regex stays balanced.
        $regex =~ s/\Q$key\E=\((.*?)\)/$key=$1/ if $key ne '';
    }

    if (@expected_fields) {
        die("Fewer fields than expected in format");
    }

    return $regex;
}

$regex_fragdetails = generate_traceevent_regex(
    "kmem/mm_page_alloc_extfrag",
    $regex_fragdetails_default,
    "page", "pfn",
    "alloc_order", "fallback_order", "pageblock_order",
    "alloc_migratetype", "fallback_migratetype",
    "fragmenting", "change_ownership");
my $re_fragdetails = qr/$regex_fragdetails/;

# Return the first line of /proc/$pid/stat, or a placeholder stat line
# (pid -1, name UNKNOWN_PROCESS_NAME, parent 0) when it cannot be read.
sub read_statline {
    my ($pid) = @_;
    my $statline;

    if (open(my $stat_fh, '<', "/proc/$pid/stat")) {
        $statline = <$stat_fh>;
        close($stat_fh);
    }

    if (!defined $statline || $statline eq '') {
        $statline = "-1 (UNKNOWN_PROCESS_NAME) R 0";
    }

    return $statline;
}

# Build a "name-pid" label from a pid and its stat line. PID 0 is always
# labelled "swapper-0". Dies if the stat line has no "(name)" component.
sub guess_process_pid {
    my ($pid, $statline) = @_;

    if ($pid == 0) {
        return "swapper-0";
    }

    if ($statline !~ $re_statname) {
        die("Failed to match stat line for process name :: $statline");
    }
    return "$1-$pid";
}

# Return the "name-pid" label of the parent of $pid, using the parent pid
# recorded in $statline. PID 0 yields "NOPARENT-0". Dies if the stat line
# cannot be parsed.
sub parent_info {
    my ($pid, $statline) = @_;

    if ($pid == 0) {
        return "NOPARENT-0";
    }

    if ($statline !~ $re_statppid) {
        die("Failed to match stat line process ppid:: $statline");
    }

    # Read the ppid stat line. An empty match is treated as pid 0.
    my $ppid = length($1) ? $1 : 0;
    return guess_process_pid($ppid, read_statline($ppid));
}

# Consume trace events from STDIN and update %perprocesspid. Returns at end
# of input, or after the current line once a SIGINT is pending.
sub process_events {
    # Read each line of the event log
    EVENT_PROCESS:
    while (my $traceevent = <STDIN>) {
        next EVENT_PROCESS if $traceevent !~ $re_traceevent;

        # Copy the captures immediately; later matches overwrite $1..$9.
        # $2 (cpus) and $3 (timestamp) are unnecessary in this script.
        my ($process_pid, $tracepoint, $details) = ($1, $4, $5);

        if ($opt_read_procstat || $opt_prepend_parent) {
            # Only consult /proc when a PID really was extracted. Without
            # this check a failed match would silently reuse the captures
            # of the trace-line match above, and an empty pid would read
            # /proc//stat (i.e. /proc/stat).
            if ($process_pid =~ /(.*)-([0-9]+)$/) {
                my ($process, $pid) = ($1, $2);
                my $statline = read_statline($pid);

                if ($opt_read_procstat && $process eq '') {
                    $process_pid = guess_process_pid($pid, $statline);
                }

                if ($opt_prepend_parent) {
                    $process_pid = parent_info($pid, $statline) . " :: $process_pid";
                }
            }
        }

        # Parse the one event whose details are used before touching any
        # counter, so an unparsable line adds nothing to the report.
        my ($fallback_order, $fragmenting, $change_ownership);
        if ($tracepoint eq "mm_page_alloc_extfrag") {
            if ($details !~ $re_fragdetails) {
                print "WARNING: Failed to parse mm_page_alloc_extfrag as expected\n";
                next EVENT_PROCESS;
            }
            # Captures: 1 page, 2 pfn, 3 alloc_order, 4 fallback_order,
            # 5 pageblock_order, 6 alloc_migratetype, 7 fallback_migratetype,
            # 8 fragmenting, 9 change_ownership
            ($fallback_order, $fragmenting, $change_ownership) = ($4, $8, $9);
        }

        my $stats = $perprocesspid{$process_pid} ||= {};

        # Perl Switch() sucks majorly
        if ($tracepoint eq "mm_page_alloc") {
            $stats->{MM_PAGE_ALLOC}++;
        }
        elsif ($tracepoint eq "mm_page_free") {
            $stats->{MM_PAGE_FREE}++;
        }
        elsif ($tracepoint eq "mm_page_free_batched") {
            $stats->{MM_PAGE_FREE_BATCHED}++;
        }
        elsif ($tracepoint eq "mm_page_pcpu_drain") {
            $stats->{MM_PAGE_PCPU_DRAIN}++;
            $stats->{STATE_PCPU_PAGES_DRAINED}++;
        }
        elsif ($tracepoint eq "mm_page_alloc_zone_locked") {
            $stats->{MM_PAGE_ALLOC_ZONE_LOCKED}++;
            $stats->{STATE_PCPU_PAGES_REFILLED}++;
        }
        elsif ($tracepoint eq "mm_page_alloc_extfrag") {
            $stats->{MM_PAGE_ALLOC_EXTFRAG}++;

            if ($fragmenting) {
                $stats->{HIGH_EXT_FRAGMENT}++;
                if ($fallback_order <= 3) {
                    $stats->{HIGH_EXT_FRAGMENT_SEVERE}++;
                }
                else {
                    $stats->{HIGH_EXT_FRAGMENT_MODERATE}++;
                }
            }
            if ($change_ownership) {
                $stats->{HIGH_EXT_FRAGMENT_CHANGED}++;
            }
        }
        else {
            $stats->{EVENT_UNKNOWN}++;
        }

        # Catch a full pcpu drain event: a run of drain tracepoints ended
        # by any other tracepoint counts as one drain.
        if ($stats->{STATE_PCPU_PAGES_DRAINED} &&
                $tracepoint ne "mm_page_pcpu_drain") {
            $stats->{HIGH_PCPU_DRAINS}++;
            $stats->{STATE_PCPU_PAGES_DRAINED} = 0;
        }

        # Catch a full pcpu refill event, detected the same way
        if ($stats->{STATE_PCPU_PAGES_REFILLED} &&
                $tracepoint ne "mm_page_alloc_zone_locked") {
            $stats->{HIGH_PCPU_REFILLS}++;
            $stats->{STATE_PCPU_PAGES_REFILLED} = 0;
        }
    }
    continue {
        # In a continue block so that lines skipped with "next" still
        # notice a pending SIGINT.
        last EVENT_PROCESS if $sigint_pending;
    }
}

# Print the statistics table for the hash referenced by the argument
# (label => { counter name => count }). A drain or refill batch that is
# still open is closed and counted before its row is printed.
sub dump_stats {
    my ($stats) = @_;

    # Get the maximum process name of the table actually being printed
    my $max_strlen = 0;
    for my $process_pid (keys %$stats) {
        my $len = length($process_pid);
        $max_strlen = $len if $len > $max_strlen;
    }
    $max_strlen += 2;

    my $name_fmt   = "%-" . $max_strlen . "s";
    my $header_fmt = $name_fmt . " %8s %10s" . (" %8s" x 11) . "\n";
    my $row_fmt    = $name_fmt . " %8d %10d" . (" %8d" x 11) . "\n";

    print "\n";
    printf($header_fmt,
        "Process", "Pages", "Pages", "Pages", "Pages", "PCPU", "PCPU", "PCPU",
        "Fragment", "Fragment", "MigType", "Fragment", "Fragment", "Unknown");
    printf($header_fmt,
        "details", "allocd", "allocd", "freed", "freed", "pages", "drains", "refills",
        "Fallback", "Causing", "Changed", "Severe", "Moderate", "");
    printf($header_fmt,
        "", "", "under lock", "direct", "pagevec", "drain", "", "",
        "", "", "", "", "", "");

    # Sorted so that the row order is stable between reports
    for my $process_pid (sort keys %$stats) {
        my $entry = $stats->{$process_pid};

        # Dump final aggregates
        if ($entry->{STATE_PCPU_PAGES_DRAINED}) {
            $entry->{HIGH_PCPU_DRAINS}++;
            $entry->{STATE_PCPU_PAGES_DRAINED} = 0;
        }
        if ($entry->{STATE_PCPU_PAGES_REFILLED}) {
            $entry->{HIGH_PCPU_REFILLS}++;
            $entry->{STATE_PCPU_PAGES_REFILLED} = 0;
        }

        # Counters never incremented are undef; print them as 0
        printf($row_fmt, $process_pid,
            map { $entry->{$_} || 0 } @REPORT_COLUMNS);
    }
}

# Rebuild %perprocess by summing the %perprocesspid counters of all labels
# that share a name once the trailing "-pid" is stripped.
sub aggregate_perprocesspid {
    %perprocess = ();

    for my $process_pid (keys %perprocesspid) {
        my $process = $process_pid;
        $process =~ s/-([0-9])*$//;
        if ($process eq '') {
            $process = "NO_PROCESS_NAME";
        }

        my $source = $perprocesspid{$process_pid};
        my $total  = $perprocess{$process} ||= {};

        for my $counter (@REPORT_COLUMNS) {
            $total->{$counter} += $source->{$counter} || 0;
        }

        # A batch still open for this pid counts once, matching what
        # dump_stats() reports per pid. The per-pid state is left alone.
        $total->{HIGH_PCPU_DRAINS}++  if $source->{STATE_PCPU_PAGES_DRAINED};
        $total->{HIGH_PCPU_REFILLS}++ if $source->{STATE_PCPU_PAGES_REFILLED};
    }
}

# Print the report: per "name-pid" label, or per name with --ignore-pid.
sub report {
    if (!$opt_ignorepid) {
        dump_stats(\%perprocesspid);
    }
    else {
        aggregate_perprocesspid();
        dump_stats(\%perprocess);
    }
}

# Process events or signals until neither is available
sub signal_loop {
    my $sigint_processed;
    do {
        $sigint_processed = 0;
        process_events();

        # Handle pending signals if any
        if ($sigint_pending) {
            my $current_time = time;

            if ($sigint_exit) {
                print "Received exit signal\n";
                $sigint_pending = 0;
            }
            # The report is held back for two seconds after the SIGINT so
            # that a quick second ctrl-c can turn it into an exit request.
            if ($sigint_report && $current_time >= $sigint_received + 2) {
                report();
                $sigint_report = 0;
                $sigint_pending = 0;
                $sigint_processed = 1;
            }
        }
    } while ($sigint_pending || $sigint_processed);
}

signal_loop();
report();