崩溃日志收集库 xCrash 浅析

xCrash 是爱奇艺团队开源的一款崩溃日志收集库,可以收集 java crash、native crash、ANR 日志

日志格式为专用格式,内容还算丰富:机器信息、崩溃线程和其他线程的方法栈、logcat、打开的 fd 等等 …

默认配置为:

  1. java crash、native crash、ANR 都会被捕获
  2. 日志目录在 /data/data/[pkg]/files/tombstones
  3. java crash 日志文件为 tombstone_[加载 xCrash 的时间,单位为微秒的时间戳(毫秒时间戳 × 1000),宽度为 20]_[app version]__[process name].java.xcrash
  4. native crash 日志文件为 tombstone_[加载 xCrash 的时间,单位为微秒的时间戳(毫秒时间戳 × 1000),宽度为 20]_[app version]__[process name].native.xcrash
  5. ANR 日志文件为 tombstone_[加载 xCrash 的时间,单位为微秒的时间戳(毫秒时间戳 × 1000),宽度为 20]_[app version]__[process name].trace.xcrash

Java Crash

捕获 Java Crash 用的是 DefaultUncaughtExceptionHandler,相关的基础知识参考 Uncaught Exception Handling

class JavaCrashHandler implements UncaughtExceptionHandler {

    /**
     * Invoked when a thread terminates because of an uncaught throwable.
     *
     * <p>Re-installs {@code defaultHandler} (when one is set), writes the crash log, and then either
     * forwards the throwable to {@code defaultHandler} or terminates the app, depending on
     * {@code rethrow}.
     *
     * @param thread    the thread that is terminating
     * @param throwable the uncaught throwable
     */
    @Override
    public void uncaughtException(Thread thread, Throwable throwable) {
        // Put defaultHandler back as the process-wide handler before doing any work.
        if (defaultHandler != null) {
            Thread.setDefaultUncaughtExceptionHandler(defaultHandler);
        }

        // Failures while logging are reported but never stop the shutdown path below.
        try {
            handleException(thread, throwable);
        } catch (Exception ex) {
            XCrash.getLogger().e(Util.TAG, "JavaCrashHandler handleException failed", ex);
        }

        // Either kill the app here, or rethrow to the previous handler.
        if (!this.rethrow) {
            ActivityMonitor.getInstance().finishAllActivities();
            Process.killProcess(this.pid);
            System.exit(10);
            return;
        }
        if (defaultHandler != null) {
            defaultHandler.uncaughtException(thread, throwable);
        }
    }

    /**
     * Collects the crash details, writes them to a log file and finally notifies {@code callback}.
     *
     * <p>Each stage is guarded separately, so a failure in one stage is logged and the following
     * stages still run.
     *
     * @param thread    the crashed thread
     * @param throwable the uncaught throwable
     */
    private void handleException(Thread thread, Throwable throwable) {
        final Date crashTime = new Date();

        // Notify the native and ANR handlers about the Java crash.
        NativeHandler.getInstance().notifyJavaCrashed();
        AnrHandler.getInstance().notifyJavaCrashed();

        // Stage 1: create the log file. The timestamp in the name is startTime in ms multiplied by 1000.
        File outFile = null;
        try {
            String outPath = String.format(
                    Locale.US,
                    "%s/%s_%020d_%s__%s%s",
                    logDir,
                    Util.logPrefix,
                    startTime.getTime() * 1000,
                    appVersion,
                    processName,
                    Util.javaLogSuffix);
            outFile = FileManager.getInstance().createLogFile(outPath);
        } catch (Exception ex) {
            XCrash.getLogger().e(Util.TAG, "JavaCrashHandler createLogFile failed", ex);
        }

        // Stage 2: build the emergency text (header + stack trace of the crashed thread).
        String emergencyText = null;
        try {
            emergencyText = getEmergency(crashTime, thread, throwable);
        } catch (Exception ex) {
            XCrash.getLogger().e(Util.TAG, "JavaCrashHandler getEmergency failed", ex);
        }

        // Stage 3: write everything to the log file, if one could be created.
        if (outFile != null) {
            RandomAccessFile writer = null;
            try {
                writer = new RandomAccessFile(outFile, "rws");

                if (emergencyText != null) {
                    writer.write(emergencyText.getBytes("UTF-8"));
                }

                // Once the emergency text is on disk it no longer has to be handed to the callback.
                emergencyText = null;

                // logcat
                boolean wantLogcat = logcatMainLines > 0 || logcatSystemLines > 0 || logcatEventsLines > 0;
                if (wantLogcat) {
                    String logcat = Util.getLogcat(logcatMainLines, logcatSystemLines, logcatEventsLines);
                    writer.write(logcat.getBytes("UTF-8"));
                }

                // open file descriptors
                if (dumpFds) {
                    writer.write(Util.getFds().getBytes("UTF-8"));
                }

                // network info
                if (dumpNetworkInfo) {
                    writer.write(Util.getNetworkInfo().getBytes("UTF-8"));
                }

                // memory info
                writer.write(Util.getMemoryInfo().getBytes("UTF-8"));

                // foreground / background state
                String foreground = ActivityMonitor.getInstance().isApplicationForeground() ? "yes" : "no";
                writer.write(("foreground:\n" + foreground + "\n\n").getBytes("UTF-8"));

                // stack traces of the other threads
                if (dumpAllThreads) {
                    writer.write(getOtherThreadsInfo(thread).getBytes("UTF-8"));
                }
            } catch (Exception ex) {
                XCrash.getLogger().e(Util.TAG, "JavaCrashHandler write log file failed", ex);
            } finally {
                if (writer != null) {
                    try {
                        writer.close();
                    } catch (Exception ignored) {
                    }
                }
            }
        }

        // Stage 4: hand the log path (or null) and any unwritten emergency text to the callback.
        if (callback == null) {
            return;
        }
        try {
            String reportedPath;
            if (outFile != null) {
                reportedPath = outFile.getAbsolutePath();
            } else {
                reportedPath = null;
            }
            callback.onCrash(reportedPath, emergencyText);
        } catch (Exception ignored) {
        }
    }
}
/**
 * Assembles the leading section of a Java crash log: the common log header, a line identifying
 * the process and thread, the stack trace of the throwable, and the build-id section.
 *
 * @param crashTime the time at which the crash was observed
 * @param thread    the crashed thread (its name is written to the log)
 * @param throwable the uncaught throwable whose stack trace is dumped
 * @return the assembled text
 */
private String getEmergency(Date crashTime, Thread thread, Throwable throwable) {
    // Render the stack trace into a string.
    StringWriter traceBuffer = new StringWriter();
    throwable.printStackTrace(new PrintWriter(traceBuffer));
    String traceText = traceBuffer.toString();

    StringBuilder out = new StringBuilder();
    out.append(Util.getLogHeader(startTime, crashTime, Util.javaCrashType, appId, appVersion));
    out.append("pid: ").append(pid);
    out.append(", tid: ").append(Process.myTid());
    out.append(", name: ").append(thread.getName());
    out.append("  >>> ").append(processName).append(" <<<\n");
    out.append("\n");
    out.append("java stacktrace:\n");
    out.append(traceText);
    out.append("\n");
    out.append(getBuildId(traceText));
    return out.toString();
}

/**
 * Builds the header block placed at the top of a log: tool version, crash type, start and crash
 * times, app identity, and device/OS details, one {@code Key: 'value'} pair per line.
 *
 * @param startTime  start time written to the "Start time" line
 * @param crashTime  crash time written to the "Crash time" line
 * @param crashType  value of the "Crash type" line
 * @param appId      value of the "App ID" line
 * @param appVersion value of the "App version" line
 * @return the header text, ending with a newline
 */
static String getLogHeader(Date startTime, Date crashTime, String crashType, String appId, String appVersion) {
    DateFormat fmt = new SimpleDateFormat(Util.timeFormatterStr, Locale.US);

    StringBuilder header = new StringBuilder();
    header.append(Util.sepHead).append("\n");
    header.append("Tombstone maker: '").append(Version.fullVersion).append("'\n");
    header.append("Crash type: '").append(crashType).append("'\n");
    header.append("Start time: '").append(fmt.format(startTime)).append("'\n");
    header.append("Crash time: '").append(fmt.format(crashTime)).append("'\n");
    header.append("App ID: '").append(appId).append("'\n");
    header.append("App version: '").append(appVersion).append("'\n");
    header.append("Rooted: '").append(Util.isRoot() ? "Yes" : "No").append("'\n");
    header.append("API level: '").append(Build.VERSION.SDK_INT).append("'\n");
    header.append("OS version: '").append(Build.VERSION.RELEASE).append("'\n");
    header.append("ABI list: '").append(Util.getAbiList()).append("'\n");
    header.append("Manufacturer: '").append(Build.MANUFACTURER).append("'\n");
    header.append("Brand: '").append(Build.BRAND).append("'\n");
    header.append("Model: '").append(Util.getMobileModel()).append("'\n");
    header.append("Build fingerprint: '").append(Build.FINGERPRINT).append("'\n");
    return header.toString();
}

/**
 * Builds the "build id" section for stack traces that mention {@code UnsatisfiedLinkError}.
 *
 * <p>Every double-quoted token in the trace that ends with {@code .so} is treated as a library
 * path. For each one, the path itself plus the same file name under the app's native library
 * directory and the vendor/system library directories are collected and passed to
 * {@code getLibInfo}.
 *
 * <p>Changes from the previous version: {@code getLibInfo} is invoked once on the complete
 * candidate list instead of once per loop iteration, and the literal text {@code "null"} is no
 * longer written when the trace contains no quoted {@code .so} path.
 *
 * @param stktrace the textual stack trace of the crash
 * @return the section text, or an empty string when the trace does not mention
 *         {@code UnsatisfiedLinkError}
 */
private String getBuildId(String stktrace) {
    if (!stktrace.contains("UnsatisfiedLinkError")) {
        return "";
    }

    List<String> libPathList = new ArrayList<String>();
    for (String libPathStr : stktrace.split("\"")) { // " is the delimiter
        if (libPathStr.isEmpty() || !libPathStr.endsWith(".so")) {
            continue;
        }
        libPathList.add(libPathStr);
        String libName = libPathStr.substring(libPathStr.lastIndexOf('/') + 1);
        libPathList.add(XCrash.nativeLibDir + "/" + libName);
        libPathList.add("/vendor/lib/" + libName);
        libPathList.add("/vendor/lib64/" + libName);
        libPathList.add("/system/lib/" + libName);
        libPathList.add("/system/lib64/" + libName);
    }

    // NOTE(review): assumes getLibInfo's result depends only on the list contents, so one call on
    // the final list equals the last of the per-iteration calls made before — confirm.
    String libInfo = libPathList.isEmpty() ? "" : getLibInfo(libPathList);
    if (libInfo == null) {
        libInfo = "";
    }

    return "build id:"
            + "\n"
            + libInfo
            + "\n";
}

输出的日志内容如下:

*** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
Tombstone maker: 'xCrash 2.4.6'                                                                    // xCrash 把日志叫做 tombstone,这里指的是生成 tombstone 的 xCrash 的版本
Crash type: 'java'                                                                                 // 指明 crash 类型,此日志包含的是 java crash(此外还有 native crash 和 ANR)
Start time: '2019-10-12T03:23:19.580+0800'                                                         // 初始化 xCrash 的时间,也就是调用 XCrash.init 的时间
Crash time: '2019-10-12T03:23:25.533+0800'                                                         // 发生崩溃的时间
App ID: 'xcrash.sample'                                                                            // 发生崩溃的 APP 的包名
App version: '1.2.3-beta456-patch789'                                                              // APP version name
Rooted: 'No'                                                                                       
API level: '29'
OS version: '10'
ABI list: 'arm64-v8a,armeabi-v7a,armeabi'
Manufacturer: 'Google'
Brand: 'google'
Model: 'Pixel'
Build fingerprint: 'google/sailfish/sailfish:10/QP1A.190711.020/5800535:user/release-keys'
pid: 21356, tid: 21356, name: main  >>> xcrash.sample <<<

java stacktrace:                                                                                    // 崩溃线程的调用栈
java.lang.IllegalStateException: Could not execute method for android:onClick
	at androidx.appcompat.app.AppCompatViewInflater$DeclaredOnClickListener.onClick(AppCompatViewInflater.java:402)
	at android.view.View.performClick(View.java:7140)
	at android.view.View.performClickInternal(View.java:7117)
	at android.view.View.access$3500(View.java:801)
	at android.view.View$PerformClick.run(View.java:27351)
	at android.os.Handler.handleCallback(Handler.java:883)
	at android.os.Handler.dispatchMessage(Handler.java:100)
	at android.os.Looper.loop(Looper.java:214)
	at android.app.ActivityThread.main(ActivityThread.java:7356)
	at java.lang.reflect.Method.invoke(Native Method)
	at com.android.internal.os.RuntimeInit$MethodAndArgsCaller.run(RuntimeInit.java:492)
	at com.android.internal.os.ZygoteInit.main(ZygoteInit.java:930)
Caused by: java.lang.reflect.InvocationTargetException
	at java.lang.reflect.Method.invoke(Native Method)
	at androidx.appcompat.app.AppCompatViewInflater$DeclaredOnClickListener.onClick(AppCompatViewInflater.java:397)
	... 11 more
Caused by: java.lang.RuntimeException: test java exception
	at xcrash.XCrash.testJavaCrash(XCrash.java:847)
	at xcrash.sample.MainActivity.testJavaCrashInMainThread_onClick(MainActivity.java:67)
	... 13 more

logcat

其实就是调用 logcat 命令获取崩溃时的 main、system、events 三个 buffer 的日志,如:/system/bin/logcat -b main -d -v threadtime -t 200 --pid 21356 *:D

/**
 * Collects logcat output of the current process from the main, system and events buffers.
 *
 * <p>A buffer is skipped when its line count is not positive. The priority filters are
 * {@code D} for main, {@code W} for system and {@code I} for events.
 *
 * @param logcatMainLines   number of lines requested from the "main" buffer
 * @param logcatSystemLines number of lines requested from the "system" buffer
 * @param logcatEventsLines number of lines requested from the "events" buffer
 * @return the "logcat:" section, terminated by an empty line
 */
static String getLogcat(int logcatMainLines, int logcatSystemLines, int logcatEventsLines) {
    int pid = android.os.Process.myPid();
    StringBuilder sb = new StringBuilder();
    sb.append("logcat:\n");
    if (logcatMainLines > 0) {
        getLogcatByBufferName(pid, sb, "main", logcatMainLines, 'D');
    }
    if (logcatSystemLines > 0) {
        getLogcatByBufferName(pid, sb, "system", logcatSystemLines, 'W');
    }
    if (logcatEventsLines > 0) {
        // Fixed: the events buffer was previously queried with logcatSystemLines.
        getLogcatByBufferName(pid, sb, "events", logcatEventsLines, 'I');
    }
    sb.append("\n");
    return sb.toString();
}

/**
 * Runs {@code /system/bin/logcat} for a single buffer and appends a banner line plus the
 * command's output to {@code sb}.
 *
 * <p>On API level 24 and above the command is given {@code --pid}; below that every output
 * line is kept only if it contains the pid surrounded by spaces. Failures to run the command
 * are logged as warnings and otherwise ignored.
 *
 * @param pid        process id whose log lines are wanted
 * @param sb         destination buffer
 * @param bufferName logcat buffer name passed to {@code -b}
 * @param lines      number of lines passed to {@code -t}
 * @param priority   minimum priority letter used in the {@code *:<priority>} filter
 */
private static void getLogcatByBufferName(int pid, StringBuilder sb, String bufferName, int lines, char priority) {
    boolean withPid = (android.os.Build.VERSION.SDK_INT >= 24);
    String pidString = Integer.toString(pid);
    String pidLabel = " " + pidString + " ";

    //command for ProcessBuilder
    List<String> command = new ArrayList<String>();
    command.add("/system/bin/logcat");
    command.add("-b");
    command.add(bufferName);
    command.add("-d");
    command.add("-v");
    command.add("threadtime");
    command.add("-t");
    // Without "--pid", 20% more lines are requested — presumably to make up for the lines of
    // other processes that the filter below drops.
    command.add(Integer.toString(withPid ? lines : (int) (lines * 1.2)));
    if (withPid) {
        command.add("--pid");
        command.add(pidString);
    }
    command.add("*:" + priority);

    //append the command line
    Object[] commandArray = command.toArray();
    sb.append("--------- tail end of log ").append(bufferName);
    sb.append(" (").append(android.text.TextUtils.join(" ", commandArray)).append(")\n");

    //append logs
    Process process = null;
    BufferedReader br = null;
    String line;
    try {
        process = new ProcessBuilder().command(command).start();
        br = new BufferedReader(new InputStreamReader(process.getInputStream()));
        while ((line = br.readLine()) != null) {
            if (withPid || line.contains(pidLabel)) {
                sb.append(line).append("\n");
            }
        }
    } catch (Exception e) {
        XCrash.getLogger().w(Util.TAG, "Util run logcat command failed", e);
    } finally {
        if (br != null) {
            try {
                br.close();
            } catch (IOException ignored) {
            }
        }
        // Release the child process and its remaining pipes; previously it was never destroyed.
        if (process != null) {
            process.destroy();
        }
    }
}

输出如下:

logcat:
--------- tail end of log main (/system/bin/logcat -b main -d -v threadtime -t 200 --pid 21356 *:D)
10-12 03:23:19.356 21356 21356 I xcrash.sample: Late-enabling -Xcheck:jni
10-12 03:23:19.398 21356 21356 E xcrash.sample: Unknown bits set in runtime_flags: 0x8000
10-12 03:23:19.571 21356 21356 D xcrash_sample: xCrash SDK init: start
10-12 03:23:19.586 21356 21356 D xcrash_sample: xCrash SDK init: end
10-12 03:23:19.757 21356 21356 W xcrash.sample: Accessing hidden method Landroid/view/View;->computeFitSystemWindows(Landroid/graphics/Rect;Landroid/graphics/Rect;)Z (greylist, reflection, allowed)
10-12 03:23:19.758 21356 21356 W xcrash.sample: Accessing hidden method Landroid/view/ViewGroup;->makeOptionalFitsSystemWindows()V (greylist, reflection, allowed)
10-12 03:23:19.829 21356 21356 I WebViewFactory: Loading com.google.android.webview version 77.0.3865.92 (code 386509238)
10-12 03:23:19.874 21356 21356 I cr_LibraryLoader: Time to load native libraries: 4 ms (timestamps 1922-1926)
10-12 03:23:19.920 21356 21356 I chromium: [INFO:library_loader_hooks.cc(51)] Chromium logging enabled: level = 0, default verbosity = 0
10-12 03:23:19.921 21356 21356 I cr_LibraryLoader: Expected native library version number "77.0.3865.92", actual native library version number "77.0.3865.92"
10-12 03:23:19.926 21356 21402 W cr_ChildProcLH: Create a new ChildConnectionAllocator with package name = com.google.android.webview, sandboxed = true
10-12 03:23:19.930 21356 21402 W xcrash.sample: Accessing hidden method Landroid/content/Context;->bindServiceAsUser(Landroid/content/Intent;Landroid/content/ServiceConnection;ILandroid/os/Handler;Landroid/os/UserHandle;)Z (greylist, reflection, allowed)
10-12 03:23:19.934 21356 21356 I cr_BrowserStartup: Initializing chromium process, singleProcess=false
10-12 03:23:19.979 21356 21430 W chromium: [WARNING:dns_config_service_posix.cc(339)] Failed to read DnsConfig.
10-12 03:23:20.031 21356 21356 W xcrash.sample: Accessing hidden method Landroid/view/textclassifier/logging/SmartSelectionEventTracker;-><init>(Landroid/content/Context;I)V (greylist, reflection, allowed)
10-12 03:23:20.031 21356 21356 W xcrash.sample: Accessing hidden method Landroid/view/textclassifier/logging/SmartSelectionEventTracker;->logEvent(Landroid/view/textclassifier/logging/SmartSelectionEventTracker$SelectionEvent;)V (greylist, reflection, allowed)
10-12 03:23:20.032 21356 21356 W xcrash.sample: Accessing hidden method Landroid/view/textclassifier/logging/SmartSelectionEventTracker$SelectionEvent;->selectionStarted(I)Landroid/view/textclassifier/logging/SmartSelectionEventTracker$SelectionEvent; (greylist, reflection, allowed)
10-12 03:23:20.032 21356 21356 W xcrash.sample: Accessing hidden method Landroid/view/textclassifier/logging/SmartSelectionEventTracker$SelectionEvent;->selectionModified(II)Landroid/view/textclassifier/logging/SmartSelectionEventTracker$SelectionEvent; (greylist, reflection, allowed)
10-12 03:23:20.032 21356 21356 W xcrash.sample: Accessing hidden method Landroid/view/textclassifier/logging/SmartSelectionEventTracker$SelectionEvent;->selectionModified(IILandroid/view/textclassifier/TextClassification;)Landroid/view/textclassifier/logging/SmartSelectionEventTracker$SelectionEvent; (greylist, reflection, allowed)
10-12 03:23:20.032 21356 21356 W xcrash.sample: Accessing hidden method Landroid/view/textclassifier/logging/SmartSelectionEventTracker$SelectionEvent;->selectionModified(IILandroid/view/textclassifier/TextSelection;)Landroid/view/textclassifier/logging/SmartSelectionEventTracker$SelectionEvent; (greylist, reflection, allowed)
10-12 03:23:20.032 21356 21356 W xcrash.sample: Accessing hidden method Landroid/view/textclassifier/logging/SmartSelectionEventTracker$SelectionEvent;->selectionAction(III)Landroid/view/textclassifier/logging/SmartSelectionEventTracker$SelectionEvent; (greylist, reflection, allowed)
10-12 03:23:20.032 21356 21356 W xcrash.sample: Accessing hidden method Landroid/view/textclassifier/logging/SmartSelectionEventTracker$SelectionEvent;->selectionAction(IIILandroid/view/textclassifier/TextClassification;)Landroid/view/textclassifier/logging/SmartSelectionEventTracker$SelectionEvent; (greylist, reflection, allowed)
10-12 03:23:20.143 21356 21395 I Adreno  : QUALCOMM build                   : 4a00b69, I4e7e888065
10-12 03:23:20.143 21356 21395 I Adreno  : Build Date                       : 04/09/19
10-12 03:23:20.143 21356 21395 I Adreno  : OpenGL ES Shader Compiler Version: EV031.26.06.00
10-12 03:23:20.143 21356 21395 I Adreno  : Local Branch                     : mybranche95ae4c8-d77f-f18d-a9ef-1458d0b52ae8
10-12 03:23:20.143 21356 21395 I Adreno  : Remote Branch                    : quic/gfx-adreno.lnx.1.0
10-12 03:23:20.143 21356 21395 I Adreno  : Remote Branch                    : NONE
10-12 03:23:20.143 21356 21395 I Adreno  : Reconstruct Branch               : NOTHING
10-12 03:23:20.143 21356 21395 I Adreno  : Build Config                     : S L 8.0.5 AArch64
10-12 03:23:20.146 21356 21395 I Adreno  : PFP: 0x005ff110, ME: 0x005ff066
10-12 03:23:20.198 21356 21395 W Gralloc3: mapper 3.x is not supported
10-12 03:23:25.531 21356 21356 D AndroidRuntime: Shutting down VM
--------- tail end of log system (/system/bin/logcat -b system -d -v threadtime -t 50 --pid 21356 *:W)
--------- tail end of log events (/system/bin/logcat -b events -d -v threadtime -t 50 --pid 21356 *:I)
10-12 03:23:20.046 21356 21356 I am_on_create_called: [0,xcrash.sample.MainActivity,performCreate]
10-12 03:23:20.053 21356 21356 I am_on_start_called: [0,xcrash.sample.MainActivity,handleStartActivity]
10-12 03:23:20.056 21356 21356 I am_on_resume_called: [0,xcrash.sample.MainActivity,RESUME_ACTIVITY]
10-12 03:23:20.083 21356 21356 I am_on_top_resumed_gained_called: [0,xcrash.sample.MainActivity,topStateChangedWhenResumed]

Opened FD

打印已打开的 FD 及其路径,已打开的 FD 在目录 /proc/self/fd

/**
 * Lists the file descriptors currently open in this process, read from {@code /proc/self/fd}.
 *
 * <p>Each entry is resolved to its target path ({@code Os.readlink} on API level 21 and above,
 * {@code getCanonicalPath} otherwise); entries that cannot be resolved are shown as {@code ???}.
 * At most 1024 entries are listed. When there are more, a {@code ......} line marks the
 * truncation, and the total count is always reported.
 *
 * @return the "open files:" section, terminated by an empty line
 */
static String getFds() {
    // Upper bound on listed entries. The previous check ran after the increment with ">",
    // which let 1025 entries through.
    final int maxListed = 1024;

    StringBuilder sb = new StringBuilder("open files:\n");
    try {
        File dir = new File("/proc/self/fd");
        File[] fds = dir.listFiles(new FilenameFilter() {
            @Override
            public boolean accept(File dir, String name) {
                return TextUtils.isDigitsOnly(name);
            }
        });
        if (fds != null) {
            int count = 0;
            for (File fd : fds) {
                if (count >= maxListed) {
                    break;
                }
                String path = null;
                try {
                    if (Build.VERSION.SDK_INT >= 21) {
                        path = Os.readlink(fd.getAbsolutePath());
                    } else {
                        path = fd.getCanonicalPath();
                    }
                } catch (Exception ignored) {
                    // Best effort: an unresolvable descriptor is printed as "???" below.
                }
                sb.append("    fd ").append(fd.getName()).append(": ")
                    .append(TextUtils.isEmpty(path) ? "???" : path.trim()).append('\n');
                count++;
            }
            if (fds.length > maxListed) {
                sb.append("    ......\n");
            }
            sb.append("    (number of FDs: ").append(fds.length).append(")\n");
        }
    } catch (Exception ignored) {
        // Best effort: the section is simply left incomplete if the directory cannot be read.
    }
    sb.append('\n');
    return sb.toString();
}

输出如下:

open files:
    fd 0: /dev/null
    fd 1: /dev/null
    fd 2: /dev/null
    fd 3: /proc/21356/fd/3
    fd 4: /proc/21356/fd/4
    fd 5: /proc/21356/fd/5
    fd 6: /dev/null
    fd 7: /dev/null
    fd 8: /dev/null
    fd 9: /apex/com.android.runtime/javalib/core-oj.jar
    fd 10: /apex/com.android.runtime/javalib/core-libart.jar
    fd 11: /apex/com.android.runtime/javalib/okhttp.jar
    fd 12: /apex/com.android.runtime/javalib/bouncycastle.jar
    fd 13: /apex/com.android.runtime/javalib/apache-xml.jar
    fd 14: /system/framework/framework.jar
    fd 15: /system/framework/ext.jar
    fd 16: /system/framework/telephony-common.jar
    fd 17: /system/framework/voip-common.jar
    fd 18: /system/framework/ims-common.jar
    fd 19: /dev/null
    fd 20: /dev/null
    fd 21: /system/framework/android.test.base.jar
    fd 22: /apex/com.android.conscrypt/javalib/conscrypt.jar
    fd 23: /apex/com.android.media/javalib/updatable-media.jar
    fd 24: /system/framework/framework-res.apk
    fd 25: /system/product/overlay/GoogleConfigOverlay.apk
    fd 26: /system/product/overlay/GoogleWebViewOverlay.apk
    fd 27: /vendor/overlay/framework-res__auto_generated_rro_vendor.apk
    fd 28: /system/product/overlay/PixelConfigOverlayCommon.apk
    fd 29: /system/product/overlay/framework-res__auto_generated_rro_product.apk
    fd 30: /dev/null
    fd 31: /dev/binder
    fd 32: /proc/21356/fd/32
    fd 33: /proc/21356/fd/33
    fd 34: /proc/21356/fd/34
    fd 35: /proc/21356/fd/35
    fd 36: /proc/21356/fd/36
    fd 37: /data/app/xcrash.sample-WeCpVYjROKKgYtuzbHflHg==/base.apk
    fd 38: /proc/21356/fd/38
    fd 39: /proc/21356/fd/39
    fd 40: /system/product/overlay/NavigationBarModeGestural/NavigationBarModeGesturalOverlay.apk
    fd 41: /dev/null
    fd 42: /dev/null
    fd 43: /dev/null
    fd 44: /dev/null
    fd 45: /proc/21356/fd/45
    fd 46: /proc/21356/fd/46
    fd 47: /proc/21356/fd/47
    fd 48: /proc/21356/fd/48
    fd 49: /dev/ashmem
    fd 50: /proc/21356/fd/50
    fd 51: /proc/21356/fd/51
    fd 52: /data/app/com.google.android.trichromelibrary_386509238-C5vGqz1rgNqceBgeyyw2Aw==/base.apk
    fd 53: /proc/21356/fd/53
    fd 54: /data/data/xcrash.sample/files/tombstones/tombstone_00001570821799580000_1.2.3-beta456-patch789__xcrash.sample.java.xcrash
    fd 55: /data/app/com.google.android.webview-wtyVrSKc9Gzy-ujvyvTNjw==/base.apk
    fd 56: /data/app/com.google.android.trichromelibrary_386509238-C5vGqz1rgNqceBgeyyw2Aw==/base.apk
    fd 57: /data/data/xcrash.sample/app_webview/webview_data.lock
    fd 58: /data/app/com.google.android.webview-wtyVrSKc9Gzy-ujvyvTNjw==/base.apk
    fd 59: /system/product/overlay/NavigationBarModeGestural/NavigationBarModeGesturalOverlay.apk
    fd 60: /proc/21356/fd/60
    fd 61: /proc/21356/fd/61
    fd 62: /data/app/com.google.android.trichromelibrary_386509238-C5vGqz1rgNqceBgeyyw2Aw==/base.apk
    fd 63: /data/app/com.google.android.trichromelibrary_386509238-C5vGqz1rgNqceBgeyyw2Aw==/base.apk
    fd 64: /data/app/com.google.android.webview-wtyVrSKc9Gzy-ujvyvTNjw==/base.apk
    fd 65: /data/app/com.google.android.trichromelibrary_386509238-C5vGqz1rgNqceBgeyyw2Aw==/base.apk
    fd 66: /dev/urandom
    fd 67: /proc/21356/fd/67
    fd 68: /proc/21356/fd/68
    fd 69: /data/app/com.google.android.webview-wtyVrSKc9Gzy-ujvyvTNjw==/base.apk
    fd 70: /proc/21356/fd/70
    fd 71: /proc/21356/fd/71
    fd 72: /data/app/com.google.android.webview-wtyVrSKc9Gzy-ujvyvTNjw==/base.apk
    fd 73: /data/app/com.google.android.webview-wtyVrSKc9Gzy-ujvyvTNjw==/base.apk
    fd 74: /proc/21356/fd/74
    fd 75: /proc/21356/fd/75
    fd 76: /proc/21356/fd/76
    fd 77: /proc/21356/fd/77
    fd 78: /proc/21356/fd/78
    fd 79: /proc/21356/fd/79
    fd 80: /proc/21356/fd/80
    fd 81: /proc/21356/fd/81
    fd 82: /proc/21356/fd/82
    fd 83: /proc/21356/fd/83
    fd 84: /proc/21356/fd/84
    fd 85: /proc/21356/fd/85
    fd 86: /proc/21356/fd/86
    fd 87: /proc/21356/fd/87
    fd 88: /proc/21356/fd/88
    fd 89: /proc/21356/fd/89
    fd 90: /proc/21356/fd/90
    fd 91: /dev/ashmem
    fd 92: /dev/ashmem
    fd 93: /dev/ashmem
    fd 94: /data/data/xcrash.sample/app_webview/Web Data
    fd 95: /proc/21356/fd/95
    fd 96: /proc/21356/fd/96
    fd 97: /dev/ashmem
    fd 98: /dev/ion
    fd 99: /proc/21356/fd/99
    fd 100: /proc/21356/fd/100
    fd 101: /proc/21356/fd/101
    fd 102: /dev/ashmem
    fd 103: /dev/kgsl-3d0
    fd 104: /dev/ion
    fd 105: /dev/hwbinder
    fd 106: /proc/21356/fd/106
    fd 107: /proc/21356/fd/107
    fd 110: /proc/21356/fd/110
    fd 111: /proc/21356/fd/111
    fd 113: /proc/21356/fd/113
    fd 114: /proc/21356/fd/114
    fd 115: /proc/21356/fd/115
    fd 116: /proc/21356/fd/116
    fd 117: /proc/21356/fd/117
    (number of FDs: 115)

System Memory Summary

System Summary (From: /proc/meminfo)
 MemTotal:        3855796 kB
 MemFree:           90124 kB
 MemAvailable:    1452636 kB
 Buffers:           77420 kB
 Cached:          1461900 kB
 SwapCached:        10232 kB
 Active:          1771504 kB
 Inactive:        1014432 kB
 Active(anon):    1046604 kB
 Inactive(anon):   368348 kB
 Active(file):     724900 kB
 Inactive(file):   646084 kB
 Unevictable:      151672 kB
 Mlocked:          151672 kB
 SwapTotal:        524284 kB
 SwapFree:         271320 kB
 Dirty:               136 kB
 Writeback:             0 kB
 AnonPages:       1391280 kB
 Mapped:           620988 kB
 Shmem:             16660 kB
 Slab:             231556 kB
 SReclaimable:      92700 kB
 SUnreclaim:       138856 kB
 KernelStack:       44448 kB
 PageTables:        57544 kB
 NFS_Unstable:          0 kB
 Bounce:                0 kB
 WritebackTmp:          0 kB
 CommitLimit:     2452180 kB
 Committed_AS:   67847232 kB
 VmallocTotal:   258998208 kB
 VmallocUsed:      223632 kB
 VmallocChunk:   258675172 kB

APP Process Summary

Process Status (From: /proc/PID/status)
 Name:	xcrash.sample
 State:	R (running)
 Tgid:	21356
 Pid:	21356
 PPid:	626
 TracerPid:	0
 Uid:	10180	10180	10180	10180
 Gid:	10180	10180	10180	10180
 Ngid:	0
 FDSize:	128
 Groups:	9997 20180 50180
 VmPeak:	 5659228 kB
 VmSize:	 5542192 kB
 VmLck:	       0 kB
 VmPin:	       0 kB
 VmHWM:	   94624 kB
 VmRSS:	   94396 kB
 VmData:	 5051840 kB
 VmStk:	    8192 kB
 VmExe:	      28 kB
 VmLib:	  166580 kB
 VmPTE:	    1068 kB
 VmSwap:	    6476 kB
 Threads:	37
 SigQ:	0/13891
 SigPnd:	0000000000000000
 ShdPnd:	0000000000000000
 SigBlk:	0000000080001200
 SigIgn:	0000000000000001
 SigCgt:	0000000e400084fc
 CapInh:	0000000000000000
 CapPrm:	0000000000000000
 CapEff:	0000000000000000
 CapBnd:	0000000000000000
 CapAmb:	0000000000000000
 Seccomp:	2
 Cpus_allowed:	f
 Cpus_allowed_list:	0-3
 Mems_allowed:	1
 Mems_allowed_list:	0
 voluntary_ctxt_switches:	343
 nonvoluntary_ctxt_switches:	301

APP Process Limits

Process Limits (From: /proc/PID/limits)
 Limit                     Soft Limit           Hard Limit           Units
 Max cpu time              unlimited            unlimited            seconds
 Max file size             unlimited            unlimited            bytes
 Max data size             unlimited            unlimited            bytes
 Max stack size            8388608              unlimited            bytes
 Max core file size        0                    unlimited            bytes
 Max resident set          unlimited            unlimited            bytes
 Max processes             13891                13891                processes
 Max open files            32768                32768                files
 Max locked memory         65536                65536                bytes
 Max address space         unlimited            unlimited            bytes
 Max file locks            unlimited            unlimited            locks
 Max pending signals       13891                13891                signals
 Max msgqueue size         819200               819200               bytes
 Max nice priority         40                   40
 Max realtime priority     0                    0
 Max realtime timeout      unlimited            unlimited            us

APP Memory Summary

Process Summary (From: android.os.Debug.MemoryInfo)
                       Pss(KB)
                        ------
           Java Heap:     7632
         Native Heap:    10932
                Code:    19064
               Stack:       56
            Graphics:     1104
       Private Other:     3448
              System:     4414
               TOTAL:    46650           TOTAL SWAP:     6460

Other StackTraces

/**
 * Builds the "other threads" section of the crash log: one stack-trace entry per live thread
 * other than the crashed one, followed by summary counters.
 *
 * <p>If {@code dumpAllThreadsWhiteList} is set, only threads whose name matches one of its
 * regular expressions are dumped (patterns that fail to compile are logged and skipped). If
 * {@code dumpAllThreadsCountMax} is positive, at most that many threads are dumped and the
 * remaining matches are only counted.
 *
 * @param crashedThread the thread that crashed; it is excluded from this section
 * @return the formatted section; empty when the crashed thread is the only live thread
 */
private String getOtherThreadsInfo(Thread crashedThread) {

    // There may be a lot of other threads, hence the optional name "whitelist".
    int thdMatchedRegex = 0;
    int thdIgnoredByLimit = 0;
    int thdDumped = 0;

    //build whitelist regex list
    ArrayList<Pattern> whiteList = null;
    if (dumpAllThreadsWhiteList != null) {
        whiteList = new ArrayList<Pattern>();
        for (String s : dumpAllThreadsWhiteList) {
            try {
                whiteList.add(Pattern.compile(s));
            } catch (Exception e) {
                XCrash.getLogger().w(Util.TAG, "JavaCrashHandler pattern compile failed", e);
            }
        }
    }

    // dump trace
    StringBuilder sb = new StringBuilder();
    Map<Thread, StackTraceElement[]> map = Thread.getAllStackTraces();
    for (Map.Entry<Thread, StackTraceElement[]> entry : map.entrySet()) {
        Thread thd = entry.getKey();
        StackTraceElement[] stacktrace = entry.getValue();

        // Skip the crashed thread. Compare by identity rather than by name: thread names are
        // not unique (pool threads commonly share one), so a name comparison could drop
        // several threads while the summary below still subtracts exactly one.
        if (thd == crashedThread) continue;

        //check regex for thread name
        if (whiteList != null && !matchThreadName(whiteList, thd.getName())) continue;
        thdMatchedRegex++;

        //check dump count limit
        if (dumpAllThreadsCountMax > 0 && thdDumped >= dumpAllThreadsCountMax) {
            thdIgnoredByLimit++;
            continue;
        }

        sb.append(Util.sepOtherThreads).append("\n");
        sb.append("pid: ").append(pid).append(", tid: ").append(thd.getId()).append(", name: ").append(thd.getName()).append("  >>> ").append(processName).append(" <<<\n");
        sb.append("\n");
        sb.append("java stacktrace:\n");
        for (StackTraceElement element : stacktrace) {
            sb.append("    at ").append(element.toString()).append("\n");
        }
        sb.append("\n");
        thdDumped++;
    }

    // summary counters
    if (map.size() > 1) {
        if (thdDumped == 0) {
            // No thread entry was written, so the opening separator is still missing.
            sb.append(Util.sepOtherThreads).append("\n");
        }
        sb.append("total JVM threads (exclude the crashed thread): ").append(map.size() - 1).append("\n");
        if (whiteList != null) {
            sb.append("JVM threads matched whitelist: ").append(thdMatchedRegex).append("\n");
        }
        if (dumpAllThreadsCountMax > 0) {
            sb.append("JVM threads ignored by max count limit: ").append(thdIgnoredByLimit).append("\n");
        }
        sb.append("dumped JVM threads:").append(thdDumped).append("\n");
        sb.append(Util.sepOtherThreadsEnding).append("\n");
    }
    return sb.toString();
}

输出如下:

--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
pid: 21356, tid: 4364, name: RenderThread  >>> xcrash.sample <<<

java stacktrace:

--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
pid: 21356, tid: 4349, name: Jit thread pool worker thread 0  >>> xcrash.sample <<<

java stacktrace:

--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
pid: 21356, tid: 4357, name: Binder:21356_2  >>> xcrash.sample <<<

java stacktrace:

--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
pid: 21356, tid: 4374, name: NetworkService  >>> xcrash.sample <<<

java stacktrace:

--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
pid: 21356, tid: 4353, name: ReferenceQueueDaemon  >>> xcrash.sample <<<

java stacktrace:
    at java.lang.Object.wait(Native Method)
    at java.lang.Object.wait(Object.java:442)
    at java.lang.Object.wait(Object.java:568)
    at java.lang.Daemons$ReferenceQueueDaemon.runInternal(Daemons.java:215)
    at java.lang.Daemons$Daemon.run(Daemons.java:137)
    at java.lang.Thread.run(Thread.java:919)

--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
pid: 21356, tid: 4359, name: Profile Saver  >>> xcrash.sample <<<

java stacktrace:

--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
pid: 21356, tid: 4371, name: GoogleApiHandler  >>> xcrash.sample <<<

java stacktrace:
    at android.os.MessageQueue.nativePollOnce(Native Method)
    at android.os.MessageQueue.next(MessageQueue.java:336)
    at android.os.Looper.loop(Looper.java:174)
    at android.os.HandlerThread.run(HandlerThread.java:67)

--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
pid: 21356, tid: 4362, name: xcrash_trace_dp  >>> xcrash.sample <<<

java stacktrace:

--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
pid: 21356, tid: 4361, name: xcrash_crash_cb  >>> xcrash.sample <<<

java stacktrace:

--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
pid: 21356, tid: 4352, name: HeapTaskDaemon  >>> xcrash.sample <<<

java stacktrace:

--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
pid: 21356, tid: 4354, name: FinalizerDaemon  >>> xcrash.sample <<<

java stacktrace:
    at java.lang.Object.wait(Native Method)
    at java.lang.Object.wait(Object.java:442)
    at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:190)
    at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:211)
    at java.lang.Daemons$FinalizerDaemon.runInternal(Daemons.java:271)
    at java.lang.Daemons$Daemon.run(Daemons.java:137)
    at java.lang.Thread.run(Thread.java:919)

--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
pid: 21356, tid: 4372, name: Chrome_IOThread  >>> xcrash.sample <<<

java stacktrace:

--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
pid: 21356, tid: 4370, name: CrAsyncTask #2  >>> xcrash.sample <<<

java stacktrace:
    at sun.misc.Unsafe.park(Native Method)
    at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:230)
    at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2109)
    at java.util.concurrent.ArrayBlockingQueue.poll(ArrayBlockingQueue.java:402)
    at java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1091)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1152)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:641)
    at Js.run(PG:2)
    at java.lang.Thread.run(Thread.java:919)

--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
pid: 21356, tid: 4375, name: PlatformServiceBridgeHandlerThread  >>> xcrash.sample <<<

java stacktrace:
    at android.os.MessageQueue.nativePollOnce(Native Method)
    at android.os.MessageQueue.next(MessageQueue.java:336)
    at android.os.Looper.loop(Looper.java:174)
    at android.os.HandlerThread.run(HandlerThread.java:67)

--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
pid: 21356, tid: 4355, name: FinalizerWatchdogDaemon  >>> xcrash.sample <<<

java stacktrace:
    at java.lang.Thread.sleep(Native Method)
    at java.lang.Thread.sleep(Thread.java:440)
    at java.lang.Thread.sleep(Thread.java:356)
    at java.lang.Daemons$FinalizerWatchdogDaemon.sleepForMillis(Daemons.java:383)
    at java.lang.Daemons$FinalizerWatchdogDaemon.waitForFinalization(Daemons.java:411)
    at java.lang.Daemons$FinalizerWatchdogDaemon.runInternal(Daemons.java:323)
    at java.lang.Daemons$Daemon.run(Daemons.java:137)
    at java.lang.Thread.run(Thread.java:919)

--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
pid: 21356, tid: 4378, name: process reaper  >>> xcrash.sample <<<

java stacktrace:
    at sun.misc.Unsafe.park(Native Method)
    at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:230)
    at java.util.concurrent.SynchronousQueue$TransferStack.awaitFulfill(SynchronousQueue.java:461)
    at java.util.concurrent.SynchronousQueue$TransferStack.transfer(SynchronousQueue.java:362)
    at java.util.concurrent.SynchronousQueue.poll(SynchronousQueue.java:937)
    at java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1091)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1152)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:641)
    at java.lang.Thread.run(Thread.java:919)

--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
pid: 21356, tid: 4377, name: CleanupReference  >>> xcrash.sample <<<

java stacktrace:
    at java.lang.Object.wait(Native Method)
    at java.lang.Object.wait(Object.java:442)
    at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:190)
    at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:211)
    at Po.run(PG:2)

--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
pid: 21356, tid: 4350, name: Signal Catcher  >>> xcrash.sample <<<

java stacktrace:

--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
pid: 21356, tid: 4373, name: ThreadPoolForeg  >>> xcrash.sample <<<

java stacktrace:

--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
pid: 21356, tid: 4369, name: Chrome_ProcessLauncherThread  >>> xcrash.sample <<<

java stacktrace:
    at android.os.MessageQueue.nativePollOnce(Native Method)
    at android.os.MessageQueue.next(MessageQueue.java:336)
    at android.os.Looper.loop(Looper.java:174)
    at android.os.HandlerThread.run(HandlerThread.java:67)

--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
pid: 21356, tid: 4358, name: Binder:21356_3  >>> xcrash.sample <<<

java stacktrace:

--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
pid: 21356, tid: 4366, name: CrAsyncTask #1  >>> xcrash.sample <<<

java stacktrace:
    at sun.misc.Unsafe.park(Native Method)
    at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:230)
    at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2109)
    at java.util.concurrent.ArrayBlockingQueue.poll(ArrayBlockingQueue.java:402)
    at java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1091)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1152)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:641)
    at Js.run(PG:2)
    at java.lang.Thread.run(Thread.java:919)

--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
pid: 21356, tid: 4376, name: ThreadPoolForeg  >>> xcrash.sample <<<

java stacktrace:

--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
pid: 21356, tid: 4356, name: Binder:21356_1  >>> xcrash.sample <<<

java stacktrace:

total JVM threads (exclude the crashed thread): 24
dumped JVM threads:24

Native Crash

捕获 Native Crash 靠的是信号处理器(sigaction),比如说访问非法地址时,APP 进程会收到 SIGSEGV,对应的信号处理器就可以在这个时间点收集堆栈信息

sigaction

signal 产生后会处于几种状态中:
1. blocked,让内核先持有信号不要分发(deliver),在 unblocked 之前都不会被分发出去;被 blocked 的信号集合叫做 signal mask,每个线程都有自己的 signal mask
2. pending,内核正在分发信号给指定的进程/线程(但还没分发出去)

signal 可以是进程范围的,比如内核产生的信号、kill 和 sigqueue;也可以是线程范围的,比如因执行机器指令而导致的硬件异常(SIGSEGV、SIGFPE)、通过 tgkill 或者 pthread_kill 指定目标线程

进程范围的信号会随机选择一个 signal unblocked 的线程来消费(deliver)

sigaction 用来注册信号处理器,是升级版的 signal

// If act != NULL it is installed as the new signal handler; if oldact != NULL the previous handler is saved there
int sigaction(int signum, const struct sigaction *act, struct sigaction *oldact);

struct sigaction {
    void     (*sa_handler)(int);                        // handler that receives only the signal number
    void     (*sa_sigaction)(int, siginfo_t *, void *); // used instead of sa_handler when SA_SIGINFO is set (receives three arguments)
    sigset_t   sa_mask;                                 // signals temporarily blocked while the handler runs (added to the thread's signal mask)
    int        sa_flags;
    void     (*sa_restorer)(void);                      // not for app
};

// sa_flags:
//     SA_SIGINFO use sa_sigaction as the handler
//     SA_RESTART controls what happens to a system call/library function that the thread was blocked in
//                when the signal arrived, once the handler returns; by default the call fails with an
//                error code, this flag makes it restart instead
//     SA_ONSTACK run the handler on an alternate call stack

void sa_sigaction(int sig, siginfo_t *info, void *ucontext)

/* Field list of siginfo_t, the second argument passed to an SA_SIGINFO handler.
   NOTE(review): this is a documentation-style listing rather than compilable C;
   which fields are meaningful presumably depends on the signal -- see man sigaction.2. */
siginfo_t {
    int      si_signo;     /* Signal number */
    int      si_errno;     /* An errno value */
    int      si_code;      /* Signal code */
    int      si_trapno;    /* Trap number that caused
                              hardware-generated signal
                              (unused on most architectures) */
    pid_t    si_pid;       /* Sending process ID */
    uid_t    si_uid;       /* Real user ID of sending process */
    int      si_status;    /* Exit value or signal */
    clock_t  si_utime;     /* User time consumed */
    clock_t  si_stime;     /* System time consumed */
    sigval_t si_value;     /* Signal value */
    int      si_int;       /* POSIX.1b signal */
    void    *si_ptr;       /* POSIX.1b signal */
    int      si_overrun;   /* Timer overrun count;
                              POSIX.1b timers */
    int      si_timerid;   /* Timer ID; POSIX.1b timers */
    void    *si_addr;      /* Memory location which caused fault */
    long     si_band;      /* Band event (was int in
                              glibc 2.3.2 and earlier) */
    int      si_fd;        /* File descriptor */
    short    si_addr_lsb;  /* Least significant bit of address
                              (since Linux 2.6.32) */
    void    *si_lower;     /* Lower bound when address violation
                              occurred (since Linux 3.19) */
    void    *si_upper;     /* Upper bound when address violation
                              occurred (since Linux 3.19) */
    int      si_pkey;      /* Protection key on PTE that caused
                              fault (since Linux 4.6) */
    void    *si_call_addr; /* Address of system call instruction
                              (since Linux 3.5) */
    int      si_syscall;   /* Number of attempted system call
                              (since Linux 3.5) */
    unsigned int si_arch;  /* Architecture of attempted system call
                              (since Linux 3.5) */
}

sigaltstack

/** 
 * Sets a new (and/or retrieves the previous) call stack for signal handlers; in effect it
 * pre-allocates a block of memory that signal handlers will use as their stack.
 * The most common usage of an alternate signal stack is to handle the SIGSEGV signal 
 * that is generated if the space available for the normal process stack is exhausted: 
 * in this case, a signal handler for SIGSEGV cannot be invoked on the process stack; 
 * if we wish to handle it, we must use an alternate signal stack
 */
int sigaltstack(const stack_t *ss, stack_t *old_ss);

/* Describes an alternate signal stack, as passed to sigaltstack(). */
typedef struct {
    void  *ss_sp;     /* Base address of stack */
    int    ss_flags;  /* Flags */
    size_t ss_size;   /* Number of bytes in stack */
} stack_t;

xc_crash_signal_handler

// XCrash.init
// NativeHandler.initialize
// NativeHandler.nativeInit
// xc_jni_init
// xc_crash_init

// Signals that need to be captured (one entry per signal number)
static xcc_signal_crash_info_t xcc_signal_crash_info[] =
{
    {.signum = SIGABRT},
    {.signum = SIGBUS},
    {.signum = SIGFPE},
    {.signum = SIGILL},
    {.signum = SIGSEGV},
    {.signum = SIGTRAP},
    {.signum = SIGSYS},
    {.signum = SIGSTKFLT}
};

// Installs `handler` for every signal listed in xcc_signal_crash_info and arranges for it
// to run on a dedicated alternate stack.
//
// Returns 0 on success, XCC_ERRNO_NOMEM when the alternate stack cannot be allocated, and
// XCC_ERRNO_SYS when sigaltstack() or sigaction() fails. The previous disposition of each
// signal is stored in the matching entry's `oldact`.
int xcc_signal_crash_register(void (*handler)(int, siginfo_t *, void *))
{
    // Pre-allocate the memory the handler will use as its call stack
    stack_t ss;
    if(NULL == (ss.ss_sp = calloc(1, XCC_SIGNAL_CRASH_STACK_SIZE))) return XCC_ERRNO_NOMEM;
    ss.ss_size  = XCC_SIGNAL_CRASH_STACK_SIZE;
    ss.ss_flags = 0;
    if(0 != sigaltstack(&ss, NULL))
    {
        // The buffer was not installed as the signal stack, so release it instead of leaking it.
        free(ss.ss_sp);
        return XCC_ERRNO_SYS;
    }

    struct sigaction act;
    memset(&act, 0, sizeof(act));
    sigfillset(&act.sa_mask); // block every signal while the handler runs
    act.sa_sigaction = handler;
    act.sa_flags = SA_RESTART | SA_SIGINFO | SA_ONSTACK;

    // Register the handler for each of the signals above. The alternate stack is in use
    // from here on, so it must not be freed on a sigaction() failure.
    size_t i;
    for(i = 0; i < sizeof(xcc_signal_crash_info) / sizeof(xcc_signal_crash_info[0]); i++)
        if(0 != sigaction(xcc_signal_crash_info[i].signum, &act, &(xcc_signal_crash_info[i].oldact)))
            return XCC_ERRNO_SYS;

    return 0;
}

// The signal handler. Like JavaCrashHandler above, it mainly collects various pieces of
// information and writes them to the tombstone file.
// It is fairly involved and is analysed in the next section.
static void xc_crash_signal_handler(int sig, siginfo_t *si, void *uc)

核心步骤

  1. 信号处理器(xc_crash_signal_handler,在 APP 进程)收集相关的信息到 xc_crash_spot
  2. fork 出子进程 dumper,子进程继承了父进程的内存布局,也就捕获到了 APP 进程 crash 时刻的内存布局
  3. dumper 进程的入口点是 xc_crash_exec_dumper,signal handler 线程通过 waitpid 阻塞直到 dumper 进程完成工作
  4. dumper 将 signal 和调用堆栈等信息写入管道,然后加载程序 libxcrash_dumper.so 替换当前的内存空间(旧的内存空间的所有信息将被清空)
  5. xcd_core.c 里的 main 函数从管道里读取 xc_crash_spot 并写入 tombstone 日志文件,退出
  6. signal handler 线程从阻塞中恢复,退出 APP 进程
// APP process, signal handler thread: this is where the dump starts.
// NOTE(review): abridged excerpt -- e.g. `status` is not declared here and the exit path is elided.
static void xc_crash_signal_handler(int sig, siginfo_t *si, void *uc)
{
    // set crash spot info
    xc_crash_spot.crash_time = xc_crash_time;
    xc_crash_spot.crash_tid = xc_crash_tid;
    memcpy(&(xc_crash_spot.siginfo), si, sizeof(siginfo_t));
    memcpy(&(xc_crash_spot.ucontext), uc, sizeof(ucontext_t));
    xc_crash_spot.log_pathname_len = strlen(xc_crash_log_pathname);

    // spawn crash dumper process (its entry point is xc_crash_exec_dumper)
    pid_t dumper_pid = xc_crash_fork(xc_crash_exec_dumper);

    // wait the crash dumper process terminated
    int wait_r = XCC_UTIL_TEMP_FAILURE_RETRY(waitpid(dumper_pid, &status, __WALL));

    // exit
}

// Entry point of the dumper process.
// Writes xc_crash_spot and the strings it refers to into a pipe, makes the pipe's read end
// the process's stdin, then exec's the dumper program. Only returns if execl() fails, in
// which case the result is 100 + errno.
static int xc_crash_exec_dumper(void *arg)
{
    // Create a pipe: the first fd is for reading, the second for writing
    int pipefd[2];
    // NOTE(review): the error-handling body of this `if` is elided in this excerpt
    if(0 != pipe2(pipefd, O_CLOEXEC))

    // Write xc_crash_spot into the pipe
    struct iovec iovs[12] = {
        {.iov_base = &xc_crash_spot,                      .iov_len = sizeof(xcc_spot_t)},
        {.iov_base = xc_crash_log_pathname,               .iov_len = xc_crash_spot.log_pathname_len},
        {.iov_base = xc_common_os_version,                .iov_len = xc_crash_spot.os_version_len},
        {.iov_base = xc_common_kernel_version,            .iov_len = xc_crash_spot.kernel_version_len},
        {.iov_base = xc_common_abi_list,                  .iov_len = xc_crash_spot.abi_list_len},
        {.iov_base = xc_common_manufacturer,              .iov_len = xc_crash_spot.manufacturer_len},
        {.iov_base = xc_common_brand,                     .iov_len = xc_crash_spot.brand_len},
        {.iov_base = xc_common_model,                     .iov_len = xc_crash_spot.model_len},
        {.iov_base = xc_common_build_fingerprint,         .iov_len = xc_crash_spot.build_fingerprint_len},
        {.iov_base = xc_common_app_id,                    .iov_len = xc_crash_spot.app_id_len},
        {.iov_base = xc_common_app_version,               .iov_len = xc_crash_spot.app_version_len},
        {.iov_base = xc_crash_dump_all_threads_whitelist, .iov_len = xc_crash_spot.dump_all_threads_whitelist_len}
    };
    // The whitelist entry (the 12th) is only sent when it is non-empty
    int iovs_cnt = (0 == xc_crash_spot.dump_all_threads_whitelist_len ? 11 : 12);
    ssize_t ret = XCC_UTIL_TEMP_FAILURE_RETRY(writev(pipefd[1], iovs, iovs_cnt));

    // Point stdin (fd 0) at the read end of the pipe
    XCC_UTIL_TEMP_FAILURE_RETRY(dup2(pipefd[0], STDIN_FILENO));
    
    syscall(SYS_close, pipefd[0]);
    syscall(SYS_close, pipefd[1]);

    // Load the program libxcrash_dumper.so, replacing the current memory space
    execl(xc_crash_dumper_pathname, XCC_UTIL_XCRASH_DUMPER_FILENAME, NULL);
    return 100 + errno;
}

// Entry point of libxcrash_dumper.so, located in xcd_core.c.
// Each step that fails terminates the dumper with a distinct exit code (1..6).
// NOTE(review): abridged excerpt -- the final exit path is elided.
int main(int argc, char** argv)
{
    // Read xc_crash_spot from stdin
    if(0 != xcd_core_read_args()) exit(1);

    //open log file
    if(0 > (xcd_core_log_fd = XCC_UTIL_TEMP_FAILURE_RETRY(open(xcd_core_log_pathname, O_WRONLY | O_CLOEXEC)))) exit(2);

    //create process object
    if(0 != xcd_process_create(&xcd_core_proc,
                               xcd_core_spot.crash_pid,
                               xcd_core_spot.crash_tid,
                               &(xcd_core_spot.siginfo),
                               &(xcd_core_spot.ucontext))) exit(3);

    //suspend all threads in the process
    xcd_process_suspend_threads(xcd_core_proc);

    //load process info
    if(0 != xcd_process_load_info(xcd_core_proc)) exit(4);

    //record system info
    if(0 != xcd_sys_record(xcd_core_log_fd,
                           xcd_core_spot.time_zone,
                           xcd_core_spot.start_time,
                           xcd_core_spot.crash_time,
                           xcd_core_app_id,
                           xcd_core_app_version,
                           xcd_core_spot.api_level,
                           xcd_core_os_version,
                           xcd_core_kernel_version,
                           xcd_core_abi_list,
                           xcd_core_manufacturer,
                           xcd_core_brand,
                           xcd_core_model,
                           xcd_core_build_fingerprint)) exit(5);

    //record process info
    if(0 != xcd_process_record(xcd_core_proc,
                               xcd_core_log_fd,
                               xcd_core_spot.logcat_system_lines,
                               xcd_core_spot.logcat_events_lines,
                               xcd_core_spot.logcat_main_lines,
                               xcd_core_spot.dump_elf_hash,
                               xcd_core_spot.dump_map,
                               xcd_core_spot.dump_fds,
                               xcd_core_spot.dump_network_info,
                               xcd_core_spot.dump_all_threads,
                               xcd_core_spot.dump_all_threads_count_max,
                               xcd_core_dump_all_threads_whitelist,
                               xcd_core_spot.api_level)) exit(6);

    //resume all threads in the process
    xcd_process_resume_threads(xcd_core_proc);

    // exit
}

Signal Info

打印导致 Native Crash 的 Signal 的基本信息,可以从 sigaction 的信号处理器(xc_crash_signal_handler)的参数列表里拿到(siginfo_t)

  • 信号码(siginfo_t.si_signo),比如:SIGKILL(9)、SIGSEGV(11),更多参考 man signal.7
  • 信号错误码(siginfo_t.si_code),描述此信号的更详细的信息,比如对于 SIGSEGV 有以下错误码:
    • SEGV_MAPERR Address not mapped to object.
    • SEGV_ACCERR Invalid permissions for mapped object.
    • SEGV_BNDERR (since Linux 3.19) Failed address bound checks.
    • SEGV_PKUERR (since Linux 4.6) Access was denied by memory protection keys. See pkeys(7). The protection key which applied to this access is available via si_pkey.
  • SIGSEGV 会将错误地址写入 siginfo_t.si_addr
  • 有关 siginfo_t 的更详细信息请参考 man sigaction.2
// Writes the one-line signal summary ("signal N (NAME), code N (NAME...), fault addr ...")
// to log_fd and returns the result of xcc_util_write_format().
static int xcd_process_record_signal_info(xcd_process_t *self, int log_fd)
{
    char fault_addr_text[64];
    char sender_text[64] = "";

    // Fault address: a placeholder when the signal carries none; for SIGILL the word read
    // from that address in the crashed process is appended as well.
    if(!xcc_util_signal_has_si_addr(self->si))
    {
        snprintf(fault_addr_text, sizeof(fault_addr_text), "--------");
    }
    else
    {
        void *fault_addr = self->si->si_addr;
        if(self->si->si_signo != SIGILL)
        {
            snprintf(fault_addr_text, sizeof(fault_addr_text), "%p", fault_addr);
        }
        else
        {
            uint32_t opcode = 0;
            xcd_util_ptrace_read(self->pid, (uintptr_t)fault_addr, &opcode, sizeof(opcode));
            snprintf(fault_addr_text, sizeof(fault_addr_text), "%p (*pc=%#08x)", fault_addr, opcode);
        }
    }

    // Sender: stays empty unless the signal has a sender to report.
    if(xcc_util_signal_has_sender(self->si, self->pid))
        snprintf(sender_text, sizeof(sender_text), " from pid %d, uid %d",
                 self->si->si_pid, self->si->si_uid);

    return xcc_util_write_format(log_fd,
                                 "signal %d (%s), code %d (%s%s), fault addr %s\n",
                                 self->si->si_signo,
                                 xcc_util_get_signame(self->si),
                                 self->si->si_code,
                                 xcc_util_get_sigcodename(self->si),
                                 sender_text,
                                 fault_addr_text);
}

输出如下:

signal 11 (SIGSEGV), code 1 (SEGV_MAPERR), fault addr 0x0

Registers Info

打印寄存器(Register)的值,下面是 ARM64 的例子,它有 31 个通用寄存器(x0 ~ x30,其中 x30 即 lr)

寄存器 作用
x0 一般表示返回值
x0 ~ x7 一般是函数的前 8 个参数(x0 同时用于返回值),超过 8 个的参数会通过栈传参
lr 链接寄存器,存放着函数的返回地址
sp 堆栈顶寄存器,用于指向每个函数调用栈的栈顶
pc 表示当前执行的指令的地址
x0  0000000000000003  x1  0000000000000000  x2  000000751128fd60  x3  0000007511290020
x4  000000751128fd60  x5  00000075a26c1708  x6  000000751128fd50  x7  00000075200a59dc
x8  0000000000000000  x9  79fc7e30c0ff4d9e  x10 00000000000003e8  x11 0000000000000000
x12 0000000000004100  x13 0000000000000001  x14 0000000000080100  x15 0000000000000000
x16 00000074b9be4d20  x17 00000074b9bcc86c  x18 00000075a57fa000  x19 00000075a4f52000
x20 0000000000000000  x21 00000075a4f52000  x22 0000007fe0ef23a0  x23 00000074bb1b62fe
x24 0000000000000004  x25 00000075a5107020  x26 00000075a4f520b0  x27 0000000000000001
x28 0000007fe0ef2130  x29 0000007fe0ef2090
sp  0000007fe0ef2070  lr  00000074b9bcc8cc  pc  00000074b9bcc884

通过 ptrace,一个线程(tracer)可以观察并控制另一线程(tracee)的执行、读取/修改它的内存和寄存器,比如单步调试(debugger);tracer 和 tracee 都是线程,而不是进程(虽然 ptrace 的参数里写的是 pid);它的一般用法是这样的:

  • PTRACE_ATTACH:使当前线程成为 tracer,pid 指定的线程成为 tracee
  • waitpid:PTRACE_ATTACH 发送 SIGSTOP 给 tracee 但它不一定立刻 stop,所以需要 tracer 等待 tracee
  • PTRACE_PEEKDATA(读内存)、 PTRACE_POKEDATA(写内存)、PTRACE_GETREGS(读寄存器)、PTRACE_SETREGS(写寄存器)…
  • PTRACE_DETACH
long ptrace(enum __ptrace_request request, pid_t pid, void *addr, void *data);

// Fetches the thread's register values via ptrace and stores them in xcd_thread_t.regs.
// If ptrace fails, the error is logged and the thread's status is set to
// XCD_THREAD_STATUS_REGS; regs is left untouched.
void xcd_thread_load_regs(xcd_thread_t *self)
{
    uintptr_t ptregs[64]; //big enough for all architectures

    if(0 == ptrace(PTRACE_GETREGS, self->tid, NULL, &ptregs))
    {
        size_t ptregs_cnt = XCD_REGS_USER_NUM;
        xcd_regs_load_from_ptregs(&(self->regs), ptregs, ptregs_cnt);
        return;
    }

    XCD_LOG_ERROR("THREAD: ptrace GETREGS failed, errno=%d", errno);
    self->status = XCD_THREAD_STATUS_REGS;
}

// Copies up to XCD_REGS_USER_NUM register words from `regs` into self->r.
void xcd_regs_load_from_ptregs(xcd_regs_t *self, uintptr_t *regs, size_t regs_len)
{
    size_t copy_cnt = regs_len;
    if(copy_cnt > XCD_REGS_USER_NUM)
    {
        // never copy more words than the cap allows
        copy_cnt = XCD_REGS_USER_NUM;
    }
    memcpy(&(self->r), regs, copy_cnt * sizeof(uintptr_t));
}

// Prints the register values: writes x0..x29, sp, lr and pc from self->r to log_fd,
// four registers per line (the argument order must match the format string), and
// returns the result of xcc_util_write_format().
int xcd_regs_record(xcd_regs_t *self, int log_fd)
{
    return xcc_util_write_format(log_fd,
                                 "    x0  %016lx  x1  %016lx  x2  %016lx  x3  %016lx\n"
                                 "    x4  %016lx  x5  %016lx  x6  %016lx  x7  %016lx\n"
                                 "    x8  %016lx  x9  %016lx  x10 %016lx  x11 %016lx\n"
                                 "    x12 %016lx  x13 %016lx  x14 %016lx  x15 %016lx\n"
                                 "    x16 %016lx  x17 %016lx  x18 %016lx  x19 %016lx\n"
                                 "    x20 %016lx  x21 %016lx  x22 %016lx  x23 %016lx\n"
                                 "    x24 %016lx  x25 %016lx  x26 %016lx  x27 %016lx\n"
                                 "    x28 %016lx  x29 %016lx\n"
                                 "    sp  %016lx  lr  %016lx  pc  %016lx\n\n",
                                 self->r[XCD_REGS_X0],  self->r[XCD_REGS_X1],  self->r[XCD_REGS_X2],  self->r[XCD_REGS_X3],
                                 self->r[XCD_REGS_X4],  self->r[XCD_REGS_X5],  self->r[XCD_REGS_X6],  self->r[XCD_REGS_X7],
                                 self->r[XCD_REGS_X8],  self->r[XCD_REGS_X9],  self->r[XCD_REGS_X10], self->r[XCD_REGS_X11],
                                 self->r[XCD_REGS_X12], self->r[XCD_REGS_X13], self->r[XCD_REGS_X14], self->r[XCD_REGS_X15],
                                 self->r[XCD_REGS_X16], self->r[XCD_REGS_X17], self->r[XCD_REGS_X18], self->r[XCD_REGS_X19],
                                 self->r[XCD_REGS_X20], self->r[XCD_REGS_X21], self->r[XCD_REGS_X22], self->r[XCD_REGS_X23],
                                 self->r[XCD_REGS_X24], self->r[XCD_REGS_X25], self->r[XCD_REGS_X26], self->r[XCD_REGS_X27],
                                 self->r[XCD_REGS_X28], self->r[XCD_REGS_X29],
                                 self->r[XCD_REGS_SP],  self->r[XCD_REGS_LR],  self->r[XCD_REGS_PC]);
}

Backtrace

/proc/pid/maps

/proc/pid/maps 包含了进程所有的内存映射(mmap)信息,后续的步骤需要用它来查找 函数名 及其所在的 文件路径,它的内容大概是这样的(参考 man mmap.2 & man proc.5):

描述
address 内存映射所在的进程的虚拟地址空间(开始地址 - 结束地址)
perms 这块内存的读写权限:r = read,w = write,x = execute,s = shared,p = private (copy on write)
offset 映射至内存的文件(或者其他东西)的起始偏移
dev 文件所在的设备(major:minor)
inode 文件的 inode
path 文件的路径
address                perms  offset    dev     inode     pathname
12c00000-32c00000      rw-p   00000000  00:00   0         [anon:dalvik-main space (region space)]
70fb9000-71248000      rw-p   00000000  00:00   0         [anon:dalvik-/apex/com.android.art/javalib/boot.art]
71248000-712a3000      rw-p   00000000  00:00   0         [anon:dalvik-/apex/com.android.art/javalib/boot-core-libart.art]
712a3000-7136f000      rw-p   00000000  00:00   0         [anon:dalvik-/apex/com.android.art/javalib/boot-core-icu4j.art]
7136f000-713a6000      rw-p   00000000  00:00   0         [anon:dalvik-/apex/com.android.art/javalib/boot-okhttp.art]
713a6000-713ea000      rw-p   00000000  00:00   0         [anon:dalvik-/apex/com.android.art/javalib/boot-bouncycastle.art]
713ea000-713f9000      rw-p   00000000  00:00   0         [anon:dalvik-/apex/com.android.art/javalib/boot-apache-xml.art]
713f9000-71479000      r--p   00000000  fc:00   150       /apex/com.android.art/javalib/arm64/boot.oat
...                                                       
7e1f72d000-7e1f778000  r--s   00000000  fc:00   2605      /system/fonts/Roboto-Medium.ttf
7e1f778000-7e1f779000  r--p   00000000  fc:00   6422      /system/system_ext/lib64/libqti-at.so
7e1f779000-7e1f77a000  r-xp   00001000  fc:00   6422      /system/system_ext/lib64/libqti-at.so
7e1f77a000-7e1f77b000  r--p   00002000  fc:00   6422      /system/system_ext/lib64/libqti-at.so
7e1f7a0000-7e1f7ab000  r--p   00000000  fc:00   6522      /system/system_ext/lib64/[email protected]
7e1f7ab000-7e1f7b5000  r-xp   0000b000  fc:00   6522      /system/system_ext/lib64/[email protected]
7e1f7b5000-7e1f7b7000  r--p   00015000  fc:00   6522      /system/system_ext/lib64/[email protected]
7e1f7b7000-7e1f7b8000  rw-p   00016000  fc:00   6522      /system/system_ext/lib64/[email protected]
7e1f86b000-7e1fb0e000  r--p   00000000  103:0f  5128711   /data/data/xcrash.sample/code_cache/.overlay/base.apk/classes.dex
7e1fb0e000-7e1fbd4000  r-xp   00000000  103:0f  5096166   /data/data/xcrash.sample/code_cache/startup_agents/e4ee8c59-agent.so
7e1fbd4000-7e1fbe3000  ---p   00000000  00:00   0         
7e1fbe3000-7e1fbec000  rw-p   000c5000  103:0f  5096166   /data/data/xcrash.sample/code_cache/startup_agents/e4ee8c59-agent.so
...                                                       
7e24946000-7e24987000  r--s   0001d000  103:0f  180482    /data/app/~~jln4G3nGOa7-pv4aJFN6jg==/xcrash.sample-icj_DCtDvU5ZX6MZSDcn4Q==/base.apk
...                                                        
7e347f9000-7e34800000  r--s   001be000  103:0f  180482    /data/app/~~jln4G3nGOa7-pv4aJFN6jg==/xcrash.sample-icj_DCtDvU5ZX6MZSDcn4Q==/base.apk
...

ELF

ELF Format

Linux 下的可执行文件(executable)和共享库文件(so - Shared Object)都是 ELF 格式(Executable and Linking Format)

ELF Header 里的 e_type 指明这是一个什么类型的文件:

e_type desc
ET_NONE An unknown type
ET_REL A relocatable file
ET_EXEC An executable file
ET_DYN A shared object
ET_CORE A core file

ELF 里有很多 Section,每个 Section 都是一段连续的地址保存了相同类型的数据,具体到哪个 Section 在哪里有多大定义在 Section Header 里,它的一些重要成员属性有:

Fields Desc
sh_name name of the section. Its value is an index into the string section
sh_type SHT_SYMTAB(符号表)、SHT_STRTAB(字符串表)等等
sh_offset Section 在文件里的位置
sh_size Section 的大小

所有的 Section Header 组成一个数组 Section Header Table,它的位置和大小则是在 ELF Header 里定义的:

Fields desc
e_shoff SHT 所在的位置
e_shentsize × e_shnum SHT 的大小(ELF Header 里没有单独的 e_shsize 字段,SHT 的大小由下面两个字段相乘得到)
e_shentsize 每个 Section Header 的大小
e_shnum Section Header 的数量
e_shstrndx Name Section 在 SHT 的索引(所谓的 Name Section 其实就是专门保存字符串的 Section,类似于 dex 里的字符串池)

Symbol Table 是一个很重要的 Section,它的结构如下:

Fields Desc
st_name symbol name (index of string section)
st_value symbol value
st_size This member holds zero if the symbol has no size or an unknown size
st_info type and binding attributes
STT_FUNC (a function or other executable code)
STT_OBJECT (data object)
STB_LOCAL (Local symbols are not visible outside the object file)
STB_GLOBAL (Global symbols are visible to all object files being combined)…
st_other symbol visibility
STV_DEFAULT (Global and weak symbols are available to other modules; references in the local module can be interposed by definitions in other modules)
STV_HIDDEN (Symbol is unavailable to other modules)
STV_PROTECTED (Symbol is available to other modules)

可以用 readelf 命令查看 ELF 文件的结构,以 /apex/com.android.art/lib64/libart.so 为例,readelf -S -W libart.so 输出 Section Header Table

There are 28 section headers, starting at offset 0x7d8af8:

Section Headers:
  [Nr] Name              Type            Address          Off    Size   ES Flg Lk Inf Al
  [ 0]                   NULL            0000000000000000 000000 000000 00      0   0  0
  [ 1] .note.android.ident NOTE            0000000000000270 000270 000018 00   A  0   0  4
  [ 2] .note.gnu.build-id NOTE            0000000000000288 000288 000020 00   A  0   0  4
  [ 3] .dynsym           DYNSYM          00000000000002a8 0002a8 021d50 18   A  7   1  8
  [ 4] .gnu.version      VERSYM          0000000000021ff8 021ff8 002d1c 02   A  3   0  2
  [ 5] .gnu.version_r    VERNEED         0000000000024d14 024d14 000100 00   A  7   7  4
  [ 6] .gnu.hash         GNU_HASH        0000000000024e18 024e18 0085d4 00   A  3   0  8
  [ 7] .dynstr           STRTAB          000000000002d3ec 02d3ec 05c8eb 00   A  0   0  1
  [ 8] .rela.dyn         LOOS+0x2        0000000000089cd8 089cd8 0005f9 01   A  3   0  8
  [ 9] .relr.dyn         LOOS+0xfffff00  000000000008a2d8 08a2d8 000490 08   A  0   0  8
  [10] .rela.plt         RELA            000000000008a768 08a768 002fb8 18   A  3  21  8
  [11] .rodata           PROGBITS        000000000008d720 08d720 03cec6 00 AMS  0   0 16
  [12] .eh_frame_hdr     PROGBITS        00000000000ca5e8 0ca5e8 01083c 00   A  0   0  4
  [13] .eh_frame         PROGBITS        00000000000dae28 0dae28 04de74 00   A  0   0  8
  [14] .text             PROGBITS        0000000000129000 129000 51c160 00  AX  0   0 512
  [15] .plt              PROGBITS        0000000000645160 645160 001ff0 00  AX  0   0 16
  [16] .data.rel.ro      PROGBITS        0000000000648000 648000 00e808 00  WA  0   0  8
  [17] .fini_array       FINI_ARRAY      0000000000656808 656808 000010 00  WA  0   0  8
  [18] .init_array       INIT_ARRAY      0000000000656818 656818 000060 00  WA  0   0  8
  [19] .dynamic          DYNAMIC         0000000000656878 656878 0002b0 10  WA  7   0  8
  [20] .got              PROGBITS        0000000000656b28 656b28 000978 00  WA  0   0  8
  [21] .got.plt          PROGBITS        00000000006574a0 6574a0 001000 00  WA  0   0  8
  [22] .data             PROGBITS        00000000006594a0 6584a0 002879 00  WA  0   0  8
  [23] .bss              NOBITS          000000000065bd20 65ad19 002c90 00  WA  0   0  8
  [24] .comment          PROGBITS        0000000000000000 65ad19 00016b 01  MS  0   0  1
  [25] .symtab           SYMTAB          0000000000000000 65ae88 08d678 18     27 18360  8
  [26] .shstrtab         STRTAB          0000000000000000 6e8500 00010c 00      0   0  1
  [27] .strtab           STRTAB          0000000000000000 6e860c 0f04e7 00      0   0  1
Key to Flags:
  W (write), A (alloc), X (execute), M (merge), S (strings), I (info),
  L (link order), O (extra OS processing required), G (group), T (TLS),
  C (compressed), x (unknown), o (OS specific), E (exclude),
  p (processor specific)

readelf -p .dynstr libart.so 输出字符串表:

String dump of section '.dynstr':
  [     1]  __cxa_atexit
  [     e]  __cxa_finalize
  [    1d]  _ZN3art14AotClassLinkerC2EPNS_11InternTableE
  [    4a]  _ZN3art11ClassLinkerC2EPNS_11InternTableEb
  [    75]  _ZN3art14AotClassLinkerD2Ev
  [    91]  _ZN3art11ClassLinkerD2Ev
  [    aa]  _ZN3art14AotClassLinkerD0Ev
  [    c6]  _ZdlPv
  [    cd]  _ZN3art14AotClassLinker13CanAllocClassEv
  [    f6]  _ZNK3art7Runtime19IsActiveTransactionEv
  [   11e]  _ZN3art7Runtime34AbortTransactionAndThrowAbortErrorEPNS_6ThreadERKNSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEE
  [   1a1]  _ZN3art14AotClassLinker15InitializeClassEPNS_6ThreadENS_6HandleINS_6mirror5ClassEEEbb
  [   1f7]  _ZNK3art7Runtime29IsActiveStrictTransactionModeEv
  [   229]  _ZN3art11ClassLinker15InitializeClassEPNS_6ThreadENS_6HandleINS_6mirror5ClassEEEbb
  [   27c]  _ZNK3art2gc4Heap24ObjectIsInBootImageSpaceENS_6ObjPtrINS_6mirror6ObjectEEE
  [   2c7]  _ZN3art6mirror6Object12PrettyTypeOfEv
  [   2ed]  _ZN3art6mirror5Class16IsThrowableClassEv
  [   316]  _ZN7android4base10LogMessageC1EPKcjNS0_11LogSeverityES3_i

readelf -s libart.so 输出符号表:

Symbol table '.dynsym' contains 5774 entries:
   Num:    Value          Size Type    Bind   Vis      Ndx Name
     0: 0000000000000000     0 NOTYPE  LOCAL  DEFAULT  UND 
     1: 0000000000000000     0 FUNC    GLOBAL DEFAULT  UND __cxa_atexit@LIBC (2)
     2: 0000000000000000     0 FUNC    GLOBAL DEFAULT  UND __cxa_finalize@LIBC (2)
     3: 0000000000000000     0 FUNC    GLOBAL DEFAULT  UND _ZdlPv
     4: 0000000000000000     0 FUNC    GLOBAL DEFAULT  UND _ZN7android4base10LogMess
   ...
   809: 0000000000132340   620 FUNC    LOCAL  HIDDEN    14 art_quick_invoke_stub
   810: 00000000001325b0   640 FUNC    LOCAL  HIDDEN    14 art_quick_invoke_static_s
   811: 000000000013ba10   292 FUNC    LOCAL  HIDDEN    14 art_quick_proxy_invoke_ha
   812: 000000000013c100   256 FUNC    LOCAL  HIDDEN    14 art_quick_instrumentation
   813: 000000000013be40   336 FUNC    LOCAL  HIDDEN    14 art_quick_generic_jni_tra
   814: 000000000013bfa0   248 FUNC    LOCAL  HIDDEN    14 art_quick_to_interpreter_
   815: 000000000013c210   480 FUNC    LOCAL  HIDDEN    14 art_quick_instrumentation
   816: 000000000013bd00   304 FUNC    LOCAL  HIDDEN    14 art_quick_resolution_tram
   817: 000000000013bb40   432 FUNC    LOCAL  HIDDEN    14 art_quick_imt_conflict_tr
   818: 000000000013c0a0    80 FUNC    LOCAL  HIDDEN    14 art_invoke_obsolete_metho
   819: 000000000013c400   164 FUNC    LOCAL  HIDDEN    14 art_quick_deoptimize
   ...

逻辑

通过 ptrace 可以拿到 PC 寄存器的值,它指向正在执行的代码的地址;拿 pc 去 /proc/pid/maps 里找,看 pc 落在哪块 mmap 上,从而得知这段代码在哪个 so 文件里;so 文件是 ELF 结构,解析出它里面的符号表及其偏移,pc - mmap.start 就是这段代码在这块 mmap 上的偏移,再加上 mmap.offset 内存映射的偏移就是这段代码在 so 文件里的偏移,从而得知这段代码在哪个符号/函数里(函数名)

但是怎么从 pc 回溯整个函数调用栈我还没有想明白

打印

打印调用栈,别看代码这么长,其实关键就是这么几个:

Fields Desc
xcd_frame.rel_pc 函数在它所在的内存映射的偏移
xcd_frame.map.name 函数所在 so 文件路径
xcd_frame.func_name 函数名
xcd_frame.func_offset 函数与 pc 的偏移
/*
 * Write the backtrace of one thread to log_fd.
 *
 * self:   thread whose frames are recorded.
 * log_fd: file descriptor of the log being written.
 *
 * Returns 0 without writing anything when the thread's status is not
 * XCD_THREAD_STATUS_OK; otherwise returns whatever
 * xcd_frames_record_backtrace() returns.
 */
int xcd_thread_record_backtrace(xcd_thread_t *self, int log_fd)
{
    if(XCD_THREAD_STATUS_OK == self->status)
        return xcd_frames_record_backtrace(self->frames, log_fd);

    //thread is not in OK status: ignore it
    return 0;
}

/*
 * Write the "backtrace:" section for every frame queued in self->frames.
 *
 * Each frame produces one line:
 *     #NN pc <rel_pc>  <name>[ (offset 0x..)][ (<func>[+<func_offset>])]
 * and a single empty line is written after the last frame.
 *
 * self:   frame list (also supplies the pid and maps used to look up the ELF).
 * log_fd: file descriptor of the log being written.
 *
 * Returns 0 on success, or the first non-zero value returned by one of the
 * xcc_util_write_* helpers.
 */
int xcd_frames_record_backtrace(xcd_frames_t *self, int log_fd)
{
    xcd_frame_t *frame;
    xcd_elf_t   *elf;
    char        *so_name;
    char        *name;
    char        *offset;
    char        *func;
    char         name_buf[512];
    char         offset_buf[64];
    char         func_buf[512];
    int          r;

    r = xcc_util_write_str(log_fd, "backtrace:\n");
    if(0 != r) return r;

    TAILQ_FOREACH(frame, &(self->frames), link)
    {
        //defaults: no name resolved yet, no offset suffix, no function suffix
        name   = NULL;
        offset = "";
        func   = "";

        if(NULL == frame->map)
        {
            //the frame has no mapping attached
            name = "<unknown>";
        }
        else
        {
            if(NULL == frame->map->name || '\0' == frame->map->name[0])
            {
                //mapping without a name: label it with its start address
                snprintf(name_buf, sizeof(name_buf), "<anonymous:%"XCC_UTIL_FMT_ADDR">", frame->map->start);
                name = name_buf;
            }
            else if(0 != frame->map->elf_start_offset)
            {
                //non-zero elf_start_offset: presumably the ELF is embedded inside
                //the mapped file (TODO confirm), so print "<file>!<so name>"
                //when the ELF reports a non-empty so name
                elf     = xcd_map_get_elf(frame->map, self->pid, (void *)self->maps);
                so_name = (NULL != elf) ? xcd_elf_get_so_name(elf) : NULL;
                if(NULL != so_name && '\0' != so_name[0])
                {
                    snprintf(name_buf, sizeof(name_buf), "%s!%s", frame->map->name, so_name);
                    name = name_buf;
                }
            }

            //fall back to the plain mapping name
            if(NULL == name) name = frame->map->name;

            //offset suffix, only printed when elf_start_offset is non-zero
            if(0 != frame->map->elf_start_offset)
            {
                snprintf(offset_buf, sizeof(offset_buf), " (offset 0x%"PRIxPTR")", frame->map->elf_start_offset);
                offset = offset_buf;
            }
        }

        //function suffix: "(name+offset)" or just "(name)" when the offset is 0
        if(NULL != frame->func_name)
        {
            if(frame->func_offset > 0)
                snprintf(func_buf, sizeof(func_buf), " (%s+%zu)", frame->func_name, frame->func_offset);
            else
                snprintf(func_buf, sizeof(func_buf), " (%s)", frame->func_name);
            func = func_buf;
        }

        r = xcc_util_write_format(log_fd, "    #%02zu pc %0"XCC_UTIL_FMT_ADDR"  %s%s%s\n",
                                  frame->num, frame->rel_pc, name, offset, func);
        if(0 != r) return r;
    }

    //blank line terminating the section
    return xcc_util_write_str(log_fd, "\n");
}

输出如下:

backtrace:
    #00 pc 000000000000b884  /data/app/xcrash.sample-WeCpVYjROKKgYtuzbHflHg==/lib/arm64/libxcrash.so (xc_test_call_4+24)
    #01 pc 000000000000b8c8  /data/app/xcrash.sample-WeCpVYjROKKgYtuzbHflHg==/lib/arm64/libxcrash.so (xc_test_call_3+24)
    #02 pc 000000000000b8f8  /data/app/xcrash.sample-WeCpVYjROKKgYtuzbHflHg==/lib/arm64/libxcrash.so (xc_test_call_2+24)
    #03 pc 000000000000b920  /data/app/xcrash.sample-WeCpVYjROKKgYtuzbHflHg==/lib/arm64/libxcrash.so (xc_test_call_1+16)
    #04 pc 000000000000b9b4  /data/app/xcrash.sample-WeCpVYjROKKgYtuzbHflHg==/lib/arm64/libxcrash.so (xc_test_crash+124)
    #05 pc 000000000013f350  /apex/com.android.runtime/lib64/libart.so (art_quick_generic_jni_trampoline+144)
    #06 pc 00000000001365b8  /apex/com.android.runtime/lib64/libart.so (art_quick_invoke_static_stub+568)
    #07 pc 0000000000145084  /apex/com.android.runtime/lib64/libart.so (_ZN3art9ArtMethod6InvokeEPNS_6ThreadEPjjPNS_6JValueEPKc+276)
    #08 pc 00000000002e3bc0  /apex/com.android.runtime/lib64/libart.so (_ZN3art11interpreter34ArtInterpreterToCompiledCodeBridgeEPNS_6ThreadEPNS_9ArtMethodEPNS_11ShadowFrameEtPNS_6JValueE+384)
    #09 pc 00000000002deab8  /apex/com.android.runtime/lib64/libart.so (_ZN3art11interpreter6DoCallILb0ELb0EEEbPNS_9ArtMethodEPNS_6ThreadERNS_11ShadowFrameEPKNS_11InstructionEtPNS_6JValueE+928)
    #10 pc 00000000005a4e3c  /apex/com.android.runtime/lib64/libart.so (MterpInvokeStatic+368)
    #11 pc 0000000000130994  /apex/com.android.runtime/lib64/libart.so (mterp_op_invoke_static+20)
    #12 pc 00000000005a2564  /apex/com.android.runtime/lib64/libart.so (MterpInvokeVirtual+1456)
    #13 pc 0000000000130814  /apex/com.android.runtime/lib64/libart.so (mterp_op_invoke_virtual+20)
    #14 pc 00000000005a5154  /apex/com.android.runtime/lib64/libart.so (MterpInvokeStatic+1160)
    #15 pc 0000000000130994  /apex/com.android.runtime/lib64/libart.so (mterp_op_invoke_static+20)
    ...

Stack (Per Frame)

这里打印的是上个章节 Backtrace 描述的函数调用栈里,每一帧(Frame)对应的栈内存,sp 寄存器指向栈顶

stack:
         0000007fe0ef1ff0  0000000be0ef2260
         0000007fe0ef1ff8  00000075a5107020
         0000007fe0ef2000  0000007fe0ef2001  [stack]
         0000007fe0ef2008  0000007511197000
         0000007fe0ef2010  00000000000fd000
         0000007fe0ef2018  0000007511290018
         0000007fe0ef2020  0000007511197000
         0000007fe0ef2028  0000007511290018
         0000007fe0ef2030  0000007f00000000
         0000007fe0ef2038  0000007511197000
         0000007fe0ef2040  00000000000f8d50
         0000007fe0ef2048  0000000000001000
         0000007fe0ef2050  0000000000000000
         0000007fe0ef2058  0000000000000000
         0000007fe0ef2060  00000075a4ff8000  [anon:libc_malloc]
         0000007fe0ef2068  000000006f5df020  /system/framework/arm64/boot-framework.art
    #00  0000007fe0ef2070  0000000000000000
         0000007fe0ef2078  000000030000ddd5
    #01  0000007fe0ef2080  0000007fe0ef2130  [stack]
         0000007fe0ef2088  0000000200000001
         0000007fe0ef2090  0000007fe0ef20b0  [stack]
         0000007fe0ef2098  00000074b9bcc8fc  /data/app/xcrash.sample-WeCpVYjROKKgYtuzbHflHg==/lib/arm64/libxcrash.so (xc_test_call_2+28)
    #02  0000007fe0ef20a0  0000000000000004
         0000007fe0ef20a8  0000000100000000
         0000007fe0ef20b0  0000007fe0ef20d0  [stack]

Memory Near XX

打印每个寄存器的值所指向的地址附近的内存,寄存器的值可以通过 ptrace 拿到

memory near x2:
    000000751128fd40 0000000000000000 0000000000000000  ................
    000000751128fd50 000000751138cd50 0000000000000000  P.8.u...........
    000000751128fd60 00005015000050c4 0000007f00000000  .P...P..........
    000000751128fd70 0000007511197000 00000000000f8d50  .p..u...P.......
    000000751128fd80 0000000000001000 0000000000000000  ................
    000000751128fd90 0000000000000000 00000075a4ff8000  ............u...
    000000751128fda0 0000000000000003 0000000000000000  ................
    000000751128fdb0 00000074b9bcc9dc 0000000000000000  ....t...........
    000000751128fdc0 0000000000000000 00000075a4e67000  .........p..u...
    000000751128fdd0 00000074c5b3b000 0000000000000001  ....t...........
    000000751128fde0 0000007511197000 00000000000fd000  .p..u...........
    000000751128fdf0 0000000000000000 0000000000000000  ................
    000000751128fe00 0000000000000000 0000000000000000  ................
    000000751128fe10 0000000000000000 0000000000000000  ................
    000000751128fe20 0000000000000000 0000000000000000  ................
    000000751128fe30 0000000000000000 0000000000000000  ................

memory near x3:
    0000007511290000 0000007511290060 0000000000000000  `.).u...........
    0000007511290010 0000000000000000 0000007511290060  ........`.).u...
    0000007511290020 00000075a2606c90 000000751128fd50  .l`.u...P.(.u...
    0000007511290030 0000000000000000 0000000000000000  ................
    0000007511290040 0000000000000000 79fc7e30c0ff4d9e  .........M..0~.y
    0000007511290050 0000000000000000 0000000000000000  ................
    0000007511290060 0000000000000000 0000000000000000  ................
    0000007511290070 0000000000000000 0000000000000000  ................
    0000007511290080 0000000000000000 0000000000000000  ................
    0000007511290090 0000000000000000 0000000000000000  ................
    00000075112900a0 0000000000000000 0000000000000000  ................
    00000075112900b0 0000000000000000 0000000000000000  ................
    00000075112900c0 0000000000000000 0000000000000000  ................
    00000075112900d0 0000000000000000 0000000000000000  ................
    00000075112900e0 0000000000000000 0000000000000000  ................
    00000075112900f0 0000000000000000 0000000000000000  ................

Memory Map

也就是 /proc/pid/maps 内存映射

memory map:
    0000000012c00000-00000000133c0000 rw-        0   7c0000 [anon:dalvik-main space (region space)]
    00000000133c0000-0000000013dc0000 ---        0   a00000 >
    0000000013dc0000-0000000013f80000 ---        0   1c0000 >
    0000000013f80000-0000000013fc0000 rw-        0    40000 >
    0000000013fc0000-0000000014100000 ---        0   140000 >
    0000000014100000-0000000014140000 rw-        0    40000 >
    0000000014140000-0000000014200000 ---        0    c0000 >
    0000000014200000-0000000014280000 ---        0    80000 >
    0000000014280000-00000000163c0000 ---        0  2140000 >
    00000000163c0000-0000000032c00000 rw-        0 1c840000 >
    000000006f1a9000-000000006f430000 rw-        0   287000 /system/framework/arm64/boot.art
    000000006f430000-000000006f51f000 rw-        0    ef000 /system/framework/arm64/boot-core-libart.art
    000000006f51f000-000000006f555000 rw-        0    36000 /system/framework/arm64/boot-okhttp.art
    000000006f555000-000000006f596000 rw-        0    41000 /system/framework/arm64/boot-bouncycastle.art
    000000006f596000-000000006f5a6000 rw-        0    10000 /system/framework/arm64/boot-apache-xml.art
    000000006f5a6000-000000006fe62000 rw-        0   8bc000 /system/framework/arm64/boot-framework.art
    000000006fe62000-000000006fe95000 rw-        0    33000 /system/framework/arm64/boot-ext.art
    000000006fe95000-000000006ff8c000 rw-        0    f7000 /system/framework/arm64/boot-telephony-common.art
    000000006ff8c000-000000006ff9a000 rw-        0     e000 /system/framework/arm64/boot-voip-common.art
    000000006ff9a000-000000006ffaf000 rw-        0    15000 /system/framework/arm64/boot-ims-common.art
    000000006ffaf000-000000006ffb2000 rw-        0     3000 /system/framework/arm64/boot-android.test.base.art
    000000006ffb2000-000000007006b000 r--        0    b9000 /system/framework/arm64/boot.oat
    000000007006b000-0000000070300000 r-x    b9000   295000 >
    0000000070300000-0000000070301000 rw-        0     1000 [anon:.bss]
    0000000070301000-0000000070303000 r--        0     2000 /system/framework/boot.vdex
    0000000070303000-0000000070304000 r--   34e000     1000 /system/framework/arm64/boot.oat
    0000000070304000-0000000070305000 rw-   34f000     1000 >
    0000000070305000-000000007034e000 r--        0    49000 /system/framework/arm64/boot-core-libart.oat
    000000007034e000-0000000070453000 r-x    49000   105000 >
    0000000070453000-0000000070454000 rw-        0     1000 [anon:.bss]
    0000000070454000-0000000070455000 r--        0     1000 /system/framework/boot-core-libart.vdex
    0000000070455000-0000000070456000 r--   14e000     1000 /system/framework/arm64/boot-core-libart.oat
    0000000070456000-0000000070457000 rw-   14f000     1000 >
    0000000070457000-0000000070466000 r--        0     f000 /system/framework/arm64/boot-okhttp.oat

ANR Trace

  1. 给主线程注册 SIGQUIT 的信号处理器 xc_trace_handler,当主线程收到 SIGQUIT 信号时,唤醒 xc_trace_dumper(dumper 线程),也就是说发生 ANR 时主线程是被 SIGQUIT 中断的而不是 SIGKILL (?)
  2. 启动 xc_trace_dumper(dumper 线程),挂起等待被主线程唤醒
  3. 在内存里找到生成 ANR 报告的函数符号:_ZN3art7Runtime14DumpForSigQuitERNSt3__113basic_ostreamIcNS1_11char_traitsIcEEEE
  4. 用 dup2 把 STDERR_FILENO 重定向到日志文件,再调用 ANR 报告函数(它会把 ANR 日志写入 STDERR_FILENO),这样就捕获了 ANR 日志;dump 结束后再把 STDERR_FILENO 指回 xc_common_fd_null
// XCrash.init
// NativeHandler.initialize
// NativeHandler.nativeInit
// xc_jni_init

/*
 * Start the dumper thread and register the SIGQUIT handler.
 *
 * env:                  JNI environment, passed to xc_trace_init_callback().
 * rethrow:              stored in xc_trace_rethrow.
 * logcat_*_lines:       stored in the matching xc_trace_logcat_*_lines globals.
 * dump_fds:             stored in xc_trace_dump_fds.
 * dump_network_info:    stored in xc_trace_dump_network_info.
 *
 * Returns 0 on success and also when the API level is below 21 (nothing is
 * set up in that case), XCC_ERRNO_SYS when eventfd() fails, or the non-zero
 * result of xcc_signal_trace_register() / pthread_create().
 */
int xc_trace_init(JNIEnv *env,
                  int rethrow,
                  unsigned int logcat_system_lines,
                  unsigned int logcat_events_lines,
                  unsigned int logcat_main_lines,
                  int dump_fds,
                  int dump_network_info)
{
    pthread_t dumper_thread;
    int       ret;

    //capture SIGQUIT only for ART
    if(xc_common_api_level < 21) return 0;

    //remember whether this is Android Lollipop (5.x, API 21 or 22)
    xc_trace_is_lollipop = (21 == xc_common_api_level || 22 == xc_common_api_level) ? 1 : 0;

    //save the configuration
    xc_trace_dump_status         = XC_TRACE_DUMP_NOT_START;
    xc_trace_rethrow             = rethrow;
    xc_trace_logcat_system_lines = logcat_system_lines;
    xc_trace_logcat_events_lines = logcat_events_lines;
    xc_trace_logcat_main_lines   = logcat_main_lines;
    xc_trace_dump_fds            = dump_fds;
    xc_trace_dump_network_info   = dump_network_info;

    //init for JNI callback
    xc_trace_init_callback(env);

    //event FD the dumper thread blocks on
    xc_trace_notifier = eventfd(0, EFD_CLOEXEC);
    if(xc_trace_notifier < 0) return XCC_ERRNO_SYS;

    //register the signal handler, then create the dump thread
    ret = xcc_signal_trace_register(xc_trace_handler);
    if(0 == ret)
    {
        ret = pthread_create(&dumper_thread, NULL, xc_trace_dumper, NULL);
        if(0 == ret) return 0;

        //thread creation failed: undo the handler registration
        xcc_signal_trace_unregister();
    }

    //either step failed: release the event FD
    close(xc_trace_notifier);
    xc_trace_notifier = -1;

    return ret;
}

/*
 * Body of the dumper thread created by xc_trace_init().
 *
 * The thread attaches itself to the JVM and then loops: it blocks reading
 * xc_trace_notifier, and on every wake-up writes one trace log (header, the
 * ART runtime dump redirected through STDERR_FILENO, logcat, optionally FDs
 * and network info, memory info), optionally re-sends SIGQUIT, and finally
 * invokes the Java callback with the log path.
 *
 * The loop ends when the process has already crashed (native or Java) or
 * when gettimeofday() fails. On exit the event FD is closed.
 *
 * arg: unused.
 * Returns NULL always.
 */
static void *xc_trace_dumper(void *arg)
{
    JNIEnv         *env = NULL;
    uint64_t        data;
    uint64_t        trace_time;
    int             fd;
    int             notifier;
    struct timeval  tv;
    char            pathname[1024];
    jstring         j_pathname;
    
    (void)arg;
    
    pthread_detach(pthread_self());

    JavaVMAttachArgs attach_args = {
        .version = XC_JNI_VERSION,
        .name    = "xcrash_trace_dp",
        .group   = NULL
    };
    if(JNI_OK != (*xc_common_vm)->AttachCurrentThread(xc_common_vm, &env, &attach_args)) goto exit;

    while(1)
    {
        //block here, waiting for sigquit
        XCC_UTIL_TEMP_FAILURE_RETRY(read(xc_trace_notifier, &data, sizeof(data)));
        
        //check if process already crashed
        if(xc_common_native_crashed || xc_common_java_crashed) break;

        //trace time, in microseconds
        if(0 != gettimeofday(&tv, NULL)) break;
        trace_time = (uint64_t)(tv.tv_sec) * 1000 * 1000 + (uint64_t)tv.tv_usec;

        //Keep only one current trace.
        if(0 != xc_trace_logs_clean()) continue;

        //create and open log file
        if((fd = xc_common_open_trace_log(pathname, sizeof(pathname), trace_time)) < 0) continue;

        //write header info
        if(0 != xc_trace_write_header(fd, trace_time)) goto end;

        //write trace info from ART runtime
        if(0 != xcc_util_write_format(fd, XCC_UTIL_THREAD_SEP"Cmd line: %s\n", xc_common_process_name)) goto end;
        if(0 != xcc_util_write_str(fd, "Mode: ART DumpForSigQuit\n")) goto end;
        if(0 != xc_trace_load_symbols())
        {
            if(0 != xcc_util_write_str(fd, "Failed to load symbols.\n")) goto end;
            goto skip;
        }
        if(0 != xc_trace_check_address_valid())
        {
            if(0 != xcc_util_write_str(fd, "Failed to check runtime address.\n")) goto end;
            goto skip;
        }
        //redirect stderr to the log file so the runtime dump lands in it
        if(dup2(fd, STDERR_FILENO) < 0)
        {
            if(0 != xcc_util_write_str(fd, "Failed to duplicate FD.\n")) goto end;
            goto skip;
        }

        xc_trace_dump_status = XC_TRACE_DUMP_ON_GOING;
        //NOTE(review): the else branch is reached through a longjmp on jmpenv,
        //presumably issued by a signal handler when the dump itself crashes - confirm
        if(sigsetjmp(jmpenv, 1) == 0) 
        {
            if(xc_trace_is_lollipop)
                xc_trace_libart_dbg_suspend();
            xc_trace_libart_runtime_dump(*xc_trace_libart_runtime_instance, xc_trace_libcpp_cerr);
            if(xc_trace_is_lollipop)
                xc_trace_libart_dbg_resume();
        } 
        else 
        {
            fflush(NULL);
            XCD_LOG_WARN("longjmp to skip dumping trace\n");
        }

        //point stderr away from the log file again
        dup2(xc_common_fd_null, STDERR_FILENO);
                            
    skip:
        if(0 != xcc_util_write_str(fd, "\n"XCC_UTIL_THREAD_END"\n")) goto end;

        //write other info
        if(0 != xcc_util_record_logcat(fd, xc_common_process_id, xc_common_api_level, xc_trace_logcat_system_lines, xc_trace_logcat_events_lines, xc_trace_logcat_main_lines)) goto end;
        if(xc_trace_dump_fds)
            if(0 != xcc_util_record_fds(fd, xc_common_process_id)) goto end;
        if(xc_trace_dump_network_info)
            if(0 != xcc_util_record_network_info(fd, xc_common_process_id, xc_common_api_level)) goto end;
        if(0 != xcc_meminfo_record(fd, xc_common_process_id)) goto end;

    end:
        //close log file
        xc_common_close_trace_log(fd);

        //rethrow SIGQUIT to ART Signal Catcher
        if(xc_trace_rethrow && (XC_TRACE_DUMP_ART_CRASH != xc_trace_dump_status)) xc_trace_send_sigquit();
        xc_trace_dump_status = XC_TRACE_DUMP_END;

        //JNI callback
        //Do we need to implement an emergency buffer for disk exhausted?
        if(NULL == xc_trace_cb_method) continue;
        if(NULL == (j_pathname = (*env)->NewStringUTF(env, pathname))) continue;
        (*env)->CallStaticVoidMethod(env, xc_common_cb_class, xc_trace_cb_method, j_pathname, NULL);
        XC_JNI_IGNORE_PENDING_EXCEPTION();
        (*env)->DeleteLocalRef(env, j_pathname);
    }
    
    (*xc_common_vm)->DetachCurrentThread(xc_common_vm);

 exit:
    //Publish -1 first, then close the saved descriptor. Assigning -1 before
    //calling close(xc_trace_notifier) would close -1 and leak the event FD.
    notifier = xc_trace_notifier;
    xc_trace_notifier = -1;
    close(notifier);
    return NULL;
}