KOOM - 利用子进程 dump hprof
不冻结 APP 的 dump hprof
在 LeakCanary 浅析 和 Matrix - ResourcesCanary 浅析 这两篇文章里,介绍了检测内存泄漏的两种相似的思路:
WeakReference
+ReferenceQueue
+ 延迟 5s 检查是否被 GCWeakReference
+ConcurrentLinkedQueue
,子线程轮询的方式每隔 1m 检查队列里的对象是否被 GC
但无论 LeakCanary
还是 Matrix.ResourcesCanary
都没有解决 dump hprof 时整个 APP 被「冻结」的问题,而一次 dump hprof 往往要持续 10s 甚至更多,这在线上环境下是不可接受的,所以在发生内存泄漏后只能上报类信息,开发者收到后人工检查涉及此类的相关代码找出泄漏点。但是这种方式是极其低效和不准确的,如果能在端侧找出泄漏对象的 GC ROOT PATH,就能极大地减少人工量,并且提高后续修复的准确度。
如果能够在不冻结 APP 的情况下 dump hprof,不仅仅能够上报内存泄漏问题,还能对 OOM 进行预警:比如监控 JVM 的内存使用率,当达到 90% 的时候将 hprof 上报分析
KOOM
提出了一个在不冻结 APP 的情况下 dump hprof 的思路:fork
出子进程,总体流程是这样的:
- 父进程 suspend JVM
- 父进程 fork 出子进程
- 父进程 resume JVM 后,线程等待子进程结束从而拿到 hprof
- 子进程调用
Debug.dumpHprofData
生成 hprof 后退出 - 父进程启动
Service
在另一个进程里解析 hprof 并构造 GC ROOT PATH
整个过程 APP 只在 fork
前后冻结了一小会,这么短的时间是可以接受的,由于 fork
采用 Copy-On-Write
机制,子进程能够继承父进程的内存
暂停和恢复 JVM
- <= Android 10,使用函数
SuspendVMEv
和ResumeVMEv
- >= Android 11,使用类
ScopedSuspendAll
(局部变量,构造函数暂停 JVM,析构函数恢复 JVM) - <= Android 4,
KOOM
不支持
但是要从各个 Android 版本的 so 文件里准确找到这些函数(符号)是有难度的
寻找 SuspendVMEv
和 ResumeVMEv
的地址
// 自己实现 dlopen dlsym dlclose 以兼容各 Android 版本
bool initForkVMSymbols() {
void *libHandle = kwai::linker::DlFcn::dlopen("libart.so", RTLD_NOW);
if (libHandle == nullptr) {
return false;
}
suspendVM = (void (*)())kwai::linker::DlFcn::dlsym(libHandle, "_ZN3art3Dbg9SuspendVMEv");
if (suspendVM == nullptr) {
__android_log_print(ANDROID_LOG_WARN, LOG_TAG, "_ZN3art3Dbg9SuspendVMEv unsupported!");
}
resumeVM = (void (*)())kwai::linker::DlFcn::dlsym(libHandle, "_ZN3art3Dbg8ResumeVMEv");
if (resumeVM == nullptr) {
__android_log_print(ANDROID_LOG_WARN, LOG_TAG, "_ZN3art3Dbg8ResumeVMEv unsupported!");
}
kwai::linker::DlFcn::dlclose(libHandle);
return suspendVM != nullptr && resumeVM != nullptr;
}
JNIEXPORT void *DlFcn::dlopen(const char *lib_name, int flags) {
ALOGD("dlopen %s", lib_name);
pthread_once(&once_control, init_api);
if (android_api_ < __ANDROID_API_N__) { // N 以下打开 libart.so 查找符号
return ::dlopen(lib_name, flags);
}
if (android_api_ > __ANDROID_API_N__) {
void *handle = ::dlopen("libdl.so", RTLD_NOW);
CHECKP(handle)
auto __loader_dlopen = reinterpret_cast<__loader_dlopen_fn>(::dlsym(handle, "__loader_dlopen"));
CHECKP(__loader_dlopen)
if (android_api_ < __ANDROID_API_Q__) { // Q 以下用 __loader_dlopen 查找符号
return __loader_dlopen(lib_name, flags, (void *)dlerror);
} else {
handle = __loader_dlopen(lib_name, flags, (void *)dlerror); // 其他情况,从 /proc/self/maps 拿到进程的内存布局,找到 mmap 进内存的 libart.so,根据 elf 文件结构解析里面的符号
if (handle == nullptr) {
// Android Q added "runtime" namespace
dl_iterate_data data{};
data.info_.dlpi_name = lib_name;
dl_iterate_phdr_wrapper(dl_iterate_callback, &data);
CHECKP(data.info_.dlpi_addr > 0)
handle = __loader_dlopen(lib_name, flags, (void *)data.info_.dlpi_addr);
}
return handle;
}
}
// __ANDROID_API_N__
auto *data = new dl_iterate_data();
data->info_.dlpi_name = lib_name;
dl_iterate_phdr_wrapper(dl_iterate_callback, data);
return data;
}
JNIEXPORT void *DlFcn::dlsym(void *handle, const char *name) {
ALOGD("dlsym %s", name);
CHECKP(handle)
if (android_api_ != __ANDROID_API_N__) { // 用 dlopen 打开则用 dlsym 查找符号
return ::dlsym(handle, name);
}
// __ANDROID_API_N__
auto *data = (dl_iterate_data *)handle; // 否则遍历 elf 里的符号列表查找
ElfW(Addr) dlpi_addr = data->info_.dlpi_addr;
const ElfW(Phdr) *dlpi_phdr = data->info_.dlpi_phdr;
ElfW(Half) dlpi_phnum = data->info_.dlpi_phnum;
// preserved for parse .symtab
ElfW(Addr) elf_base_addr;
for (int i = 0; i < dlpi_phnum; ++i) {
if (dlpi_phdr[i].p_type == PT_LOAD && dlpi_phdr[i].p_offset == 0) {
elf_base_addr = dlpi_addr + dlpi_phdr[i].p_vaddr;
ALOGD("PT_LOAD dlpi_addr %p p_vaddr %p elf_base_addr %p", dlpi_addr, dlpi_phdr[i].p_vaddr,
elf_base_addr);
}
if (dlpi_phdr[i].p_type == PT_DYNAMIC) {
ElfW(Dyn) *dyn = (ElfW(Dyn) *)(dlpi_addr + dlpi_phdr[i].p_vaddr);
ElfW(Dyn) *dyn_end = dyn + (dlpi_phdr[i].p_memsz / sizeof(ElfW(Dyn)));
const char *strtab;
ElfW(Sym) * symtab;
bool is_use_gnu_hash = false;
// for ELF hash
size_t nbucket_;
size_t nchain_;
uint32_t *bucket_;
uint32_t *chain_;
// for GNU hash
size_t gnu_nbucket_;
uint32_t *gnu_bucket_;
uint32_t *gnu_chain_;
uint32_t gnu_maskwords_;
uint32_t gnu_shift2_;
ElfW(Addr) * gnu_bloom_filter_;
// ELF parse
for (; dyn < dyn_end; dyn++) {
switch (dyn->d_tag) {
case DT_NULL:
// the end of the dynamic-section
dyn = dyn_end;
break;
case DT_STRTAB: {
ElfW(Addr) strtab_addr = dlpi_addr + dyn->d_un.d_ptr;
strtab = reinterpret_cast<const char *>(strtab_addr);
CHECKP(strtab_addr >= dlpi_addr)
break;
}
case DT_SYMTAB: {
ElfW(Addr) symtab_addr = dlpi_addr + dyn->d_un.d_ptr;
symtab = reinterpret_cast<ElfW(Sym) *>(symtab_addr);
CHECKP(symtab_addr >= dlpi_addr)
break;
}
case DT_HASH: {
// ignore DT_HASH when ELF contains DT_GNU_HASH hash table
if (is_use_gnu_hash) {
continue;
}
nbucket_ = reinterpret_cast<uint32_t *>(dlpi_addr + dyn->d_un.d_ptr)[0];
nchain_ = reinterpret_cast<uint32_t *>(dlpi_addr + dyn->d_un.d_ptr)[1];
bucket_ = reinterpret_cast<uint32_t *>(dlpi_addr + dyn->d_un.d_ptr + 8);
chain_ = reinterpret_cast<uint32_t *>(dlpi_addr + dyn->d_un.d_ptr + 8 + nbucket_ * 4);
break;
}
case DT_GNU_HASH: {
gnu_nbucket_ = reinterpret_cast<uint32_t *>(dlpi_addr + dyn->d_un.d_ptr)[0];
// skip symndx
gnu_maskwords_ = reinterpret_cast<uint32_t *>(dlpi_addr + dyn->d_un.d_ptr)[2];
gnu_shift2_ = reinterpret_cast<uint32_t *>(dlpi_addr + dyn->d_un.d_ptr)[3];
gnu_bloom_filter_ = reinterpret_cast<ElfW(Addr) *>(dlpi_addr + dyn->d_un.d_ptr + 16);
gnu_bucket_ = reinterpret_cast<uint32_t *>(gnu_bloom_filter_ + gnu_maskwords_);
// amend chain for symndx = header[1]
gnu_chain_ = gnu_bucket_ + gnu_nbucket_ -
reinterpret_cast<uint32_t *>(dlpi_addr + dyn->d_un.d_ptr)[1];
--gnu_maskwords_;
is_use_gnu_hash = true;
break;
}
default:
break;
}
}
// lookup symbol
if (is_use_gnu_hash) {
ALOGD("lookup use gnu hash");
uint32_t hash = elf_gnu_hash((uint8_t *)name);
constexpr uint32_t kBloomMaskBits = sizeof(ElfW(Addr)) * 8;
const uint32_t word_num = (hash / kBloomMaskBits) & gnu_maskwords_;
const ElfW(Addr) bloom_word = gnu_bloom_filter_[word_num];
const uint32_t h1 = hash % kBloomMaskBits;
const uint32_t h2 = (hash >> gnu_shift2_) % kBloomMaskBits;
// test against bloom filter
CHECKP((1 & (bloom_word >> h1) & (bloom_word >> h2)) != 0)
// bloom test says "probably yes"...
uint32_t n = gnu_bucket_[hash % gnu_nbucket_];
do {
ElfW(Sym) *s = symtab + n;
if (((gnu_chain_[n] ^ hash) >> 1) == 0 && strcmp(strtab + s->st_name, name) == 0) {
ALOGD("find %s %p", name, dlpi_addr + s->st_value);
return reinterpret_cast<void *>(dlpi_addr + s->st_value);
}
} while ((gnu_chain_[n++] & 1) == 0);
} else {
ALOGD("lookup use elf hash");
uint32_t hash = elf_hash((uint8_t *)name);
for (uint32_t n = bucket_[hash % nbucket_]; n != 0; n = chain_[n]) {
ElfW(Sym) *s = symtab + n;
if (strcmp(strtab + s->st_name, name) == 0) {
ALOGD("find %s %p", name, dlpi_addr + s->st_value);
return reinterpret_cast<void *>(dlpi_addr + s->st_value);
}
}
}
}
}
return nullptr;
}
查找 ScopedSuspendAll
的构造函数和析构函数
JNIEXPORT jboolean JNICALL Java_com_kwai_koom_javaoom_dump_ForkJvmHeapDumper_dumpHprofDataNative(
JNIEnv *env, jclass clazz, jstring file_name) {
pthread_once(&once_control, initDumpHprofSymbols);
// ...
}
// For above android 11
static void initDumpHprofSymbols() {
// Parse .dynsym(GLOBAL)
void *libHandle = kwai::linker::DlFcn::dlopen("libart.so", RTLD_NOW);
if (libHandle == nullptr) {
return;
}
ScopedSuspendAllConstructor = (void (*)(void *, const char *, bool))kwai::linker::DlFcn::dlsym(
libHandle, "_ZN3art16ScopedSuspendAllC1EPKcb");
if (ScopedSuspendAllConstructor == nullptr) {
__android_log_print(ANDROID_LOG_WARN, LOG_TAG, "_ZN3art16ScopedSuspendAllC1EPKcb unsupported!");
}
ScopedSuspendAllDestructor =
(void (*)(void *))kwai::linker::DlFcn::dlsym(libHandle, "_ZN3art16ScopedSuspendAllD1Ev");
if (ScopedSuspendAllDestructor == nullptr) {
__android_log_print(ANDROID_LOG_WARN, LOG_TAG, "_ZN3art16ScopedSuspendAllD1Ev unsupported!");
}
kwai::linker::DlFcn::dlclose(libHandle);
// ...
}
fork
使用系统调用 fork
创建子进程
- 子进程返回 0,dump hprof 后退出进程
- 父进程返回子进程 pid,首先恢复 JVM,然后利用
waitpid
阻塞直到子进程完成 dump hprof,然后处理 hprof
public class ForkJvmHeapDumper {
@Override
public boolean dump(String path) {
// ...
boolean dumpRes = false;
try {
int pid = trySuspendVMThenFork();
if (pid == 0) { // 子进程
Debug.dumpHprofData(path);
KLog.i(TAG, "notifyDumped:" + dumpRes);
//System.exit(0);
exitProcess();
} else { // 父进程
resumeVM();
dumpRes = waitDumping(pid);
KLog.i(TAG, "hprof pid:" + pid + " dumped: " + path);
}
} catch (IOException e) {
e.printStackTrace();
KLog.e(TAG, "dump failed caused by IOException!");
}
return dumpRes;
}
}
JNIEXPORT jint JNICALL Java_com_kwai_koom_javaoom_dump_ForkJvmHeapDumper_trySuspendVMThenFork(
JNIEnv *env, jobject jObject) {
if (suspendVM == nullptr) {
initForkVMSymbols();
}
if (suspendVM != nullptr) {
suspendVM();
}
return fork(); // 用系统调用 fork 创建子进程
}
Android 11 的情况下也是利用系统调用 fork
出子进程
JNIEXPORT jboolean JNICALL Java_com_kwai_koom_javaoom_dump_ForkJvmHeapDumper_dumpHprofDataNative(
JNIEnv *env, jclass clazz, jstring file_name) {
pthread_once(&once_control, initDumpHprofSymbols);
if (ScopedSuspendAllConstructor == nullptr || ScopedSuspendAllDestructor == nullptr ||
HprofConstructor == nullptr || HprofDestructor == nullptr || Dump == nullptr) {
return JNI_FALSE;
}
ScopedSuspendAllConstructor(gSSAHandle, LOG_TAG, true);
pid_t pid = fork();
if (pid == -1) {
// Fork error.
__android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "failed to fork!");
return JNI_FALSE;
}
if (pid != 0) {
// Parent
ScopedSuspendAllDestructor(gSSAHandle);
int stat_loc;
for (;;) {
if (waitpid(pid, &stat_loc, 0) != -1 || errno != EINTR) {
break;
}
}
return JNI_TRUE;
}
// ...
}
处理 hprof
子进程用 Debug.dumpHprofData
生成 hprof 文件后结束,父进程拿到 hprof 后开个新进程,在 Service
里用 LeakCanary.Shark
解析
public class ForkJvmHeapDumper {
public boolean dump(String path) {
//...
boolean dumpRes = false;
try {
int pid = trySuspendVMThenFork();
if (pid == 0) { // 子进程 dump hprof 后退出
Debug.dumpHprofData(path);
KLog.i(TAG, "notifyDumped:" + dumpRes);
exitProcess();
} else { // 父进程等待 dump hprof 完成
resumeVM();
dumpRes = waitDumping(pid);
KLog.i(TAG, "hprof pid:" + pid + " dumped: " + path);
}
} catch (IOException e) {
e.printStackTrace();
KLog.e(TAG, "dump failed caused by IOException!");
}
return dumpRes;
}
}
public class HeapAnalyzeService extends IntentService { // 在 HeapAnalyzeService 里解析 hprof
public static void runAnalysis(Application application,
HeapAnalysisListener heapAnalysisListener) {
KLog.i(TAG, "runAnalysis startService");
Intent intent = new Intent(application, HeapAnalyzeService.class);
IPCReceiver ipcReceiver = buildAnalysisReceiver(heapAnalysisListener);
intent.putExtra(KConstants.ServiceIntent.RECEIVER, ipcReceiver);
KHeapFile heapFile = KHeapFile.getKHeapFile();
intent.putExtra(KConstants.ServiceIntent.HEAP_FILE, heapFile);
application.startService(intent);
}
}
// HeapAnalyzeService 在另一个进程,防止解析 hprof 时 OOM 影响 APP
// <service
// android:name=".analysis.HeapAnalyzeService"
// android:process=":heap_analysis" />
Android 11 则是查找类 Hprof
的构造/析构函数和 dump
函数,在 native 层进行 dump hprof,后续依然是在 java 层开个新进程解析
JNIEXPORT jboolean JNICALL Java_com_kwai_koom_javaoom_dump_ForkJvmHeapDumper_dumpHprofDataNative(
JNIEnv *env, jclass clazz, jstring file_name) {
// ... 直接在 native 层 dump hprof
const char *filename = env->GetStringUTFChars(file_name, nullptr);
HprofConstructor(gHprofHandle, filename, -1, false);
Dump(gHprofHandle);
HprofDestructor(gHprofHandle);
env->ReleaseStringUTFChars(file_name, filename);
_exit(0);
}
static void initDumpHprofSymbols() { // 查找 Hprof 类的相关函数
//...
libHandle = kwai::linker::DlFcn::dlopen_elf("libart.so", RTLD_NOW);
if (libHandle == nullptr) {
return;
}
HprofConstructor = (void (*)(void *, const char *, int, bool))kwai::linker::DlFcn::dlsym_elf(
libHandle, "_ZN3art5hprof5HprofC2EPKcib");
if (HprofConstructor == nullptr) {
__android_log_print(ANDROID_LOG_WARN, LOG_TAG, "_ZN3art5hprof5HprofC2EPKcib unsupported!");
}
HprofDestructor =
(void (*)(void *))kwai::linker::DlFcn::dlsym_elf(libHandle, "_ZN3art5hprof5HprofD0Ev");
if (HprofDestructor == nullptr) {
__android_log_print(ANDROID_LOG_WARN, LOG_TAG, "_ZN3art5hprof5HprofD0Ev unsupported!");
}
Dump = (void (*)(void *))kwai::linker::DlFcn::dlsym_elf(libHandle, "_ZN3art5hprof5Hprof4DumpEv");
if (Dump == nullptr) {
__android_log_print(ANDROID_LOG_WARN, LOG_TAG, "_ZN3art5hprof5Hprof4DumpEv unsupported!");
}
kwai::linker::DlFcn::dlclose_elf(libHandle);
}