[alibaba/tengine]tengine 运行一段时间后产生一些不死进程

2024-01-03 574 views
2

www 3656 0.6 1.9 511944 77124 ? S Aug04 105:05 nginx: worker process is shutting down
www 3657 0.6 2.6 532424 103528 ? S Aug04 104:13 nginx: worker process is shutting down
www 3658 0.6 2.6 511944 105176 ? S Aug04 104:06 nginx: worker process is shutting down
www 3659 0.6 2.3 512072 93664 ? S Aug04 104:24 nginx: worker process is shutting down
root 5767 0.0 1.9 419300 75904 ? Ss Aug02 0:17 nginx: master process /usr/sbin/nginx -c /etc/nginx/nginx.conf
www 5769 0.4 4.9 1355720 196068 ? S Aug02 90:22 nginx: worker process is shutting down
www 5770 0.4 4.7 1277896 185392 ? S Aug02 89:35 nginx: worker process is shutting down
www 5771 0.4 3.7 1339336 145584 ? S Aug02 88:50 nginx: worker process is shutting down
www 5772 0.4 3.8 1155016 149624 ? S Aug02 90:07 nginx: worker process is shutting down
www 7507 0.0 2.2 421640 86784 ? S Aug15 0:29 nginx: worker process
www 7508 0.0 2.2 421640 86784 ? S Aug15 0:29 nginx: worker process
www 7509 0.0 2.2 421704 87064 ? S Aug15 0:30 nginx: worker process
www 7510 0.0 2.2 421640 86784 ? S Aug15 0:29 nginx: worker process
work 13214 0.0 0.0 103308 900 pts/0 S+ 10:03 0:00 grep nginx
www 15375 0.2 3.0 507848 120400 ? S Aug03 36:51 nginx: worker process is shutting down
www 15377 0.2 2.3 491464 93392 ? S Aug03 36:49 nginx: worker process is shutting down
www 15378 0.2 2.3 499656 93708 ? S Aug03 37:11 nginx: worker process is shutting down
www 16912 1.2 3.0 1605704 121612 ? S Aug10 106:00 nginx: worker process is shutting down
www 23439 0.6 1.8 462920 72324 ? S Aug09 68:58 nginx: worker process is shutting down
www 23441 0.6 1.7 458824 67068 ? S Aug09 67:19 nginx: worker process is shutting down
www 25351 0.2 2.9 511944 116016 ? S Aug08 23:28 nginx: worker process is shutting down
www 26387 0.8 1.9 442440 78336 ? S Aug05 130:55 nginx: worker process is shutting down
www 26388 0.8 1.8 450504 73968 ? S Aug05 130:17 nginx: worker process is shutting down
www 26389 0.8 1.8 446408 74016 ? S Aug05 131:33 nginx: worker process is shutting down
www 26390 0.8 1.8 442312 74160 ? S Aug05 129:24 nginx: worker process is shutting down

可以看到有一些进程进入了 nginx: worker process is shutting down 状态,而且持续很长时间;如果不重启 nginx,内存消耗十分严重。strace 其中一个进程,输出如下:

strace -s 1024 -p 11533
Process 11533 attached - interrupt to quit
gettimeofday({1471313386, 538559}, NULL) = 0
epoll_wait(6, {}, 512, 30430) = 0
gettimeofday({1471313416, 999254}, NULL) = 0
epoll_wait(6, ^C <unfinished ...>
Process 11533 detached

回答

8

shutting down状态的worker主要是因为还有计时器没有被清理完。可能的情况有:

  • shutting down过程中仍有老的请求没有处理完,等待处理完请求即可(比如用户的请求在老worker下载超大文件)(一般计时器都会对应到1个连接)
  • 其他模块触发的timer没有处理完(可能为bug,比较少见)
  • lua-nginx-module模块中调用ngx.sleep()

如果想查询有什么timer遗留,可以通过如下gdb脚本,查看nginx worker中的timer rbtree:

# dump_timer: dump active timers.
# Entry point: starts the recursive walk (dump_timer_iter) at the root node of
# the global timer red-black tree, ngx_event_timer_rbtree.
# Usage, from a gdb session on an nginx worker:  (gdb) dump_timer
define dump_timer
    dump_timer_iter ngx_event_timer_rbtree.root
end

# dump_timer_iter NODE: print the event that owns timer-tree node NODE, then
# recurse into NODE's left and right children (pre-order walk). Recursion stops
# when NODE is the tree's sentinel node.
define dump_timer_iter
    # NOTE: do not copy $arg0 into a convenience variable (e.g. set $node = $arg0);
    # $node would be overwritten by the nested dump_timer_iter calls below.
    if $arg0 != ngx_event_timer_rbtree.sentinel
        # Convert the timer node ($arg0) to its owning event ($ev) by subtracting
        # the offset of the `timer` member inside ngx_event_t from the node address.
        set $ev = (ngx_event_t *) ((char *) $arg0 - (int)&((ngx_event_t *) 0x0)->timer)
        # Print a ready-to-paste gdb command that re-selects this event, then the
        # full contents of the event structure.
        printf "set $ev = (ngx_event_t *) %p\n", $ev
        p *$ev
        printf "\n"

        # Descend into both subtrees.
        dump_timer_iter $arg0->left
        dump_timer_iter $arg0->right
    end
end
  • 示例: 通过event->handler可以推测timer创建者(比如 请求下载等情况,示例中为ngx.sleep()触发)
(gdb) dump_timer
set $ev = (ngx_event_t *) 0x62100003b3d0
$2 = {
    data = 0x62100003b380,
    write = 0,
    accept = 0,
    instance = 0,
    active = 0,
    disabled = 0,
    ready = 0,
    oneshot = 0,
    complete = 0,
    eof = 0,
    error = 0,
    timedout = 0,
    timer_set = 1,
    delayed = 0,
    read_discarded = 0,
    unexpected_eof = 0,
    deferred_accept = 0,
    pending_eof = 0,
    posted_ready = 0,
    available = 0,
    handler = 0x10791f0 <ngx_http_lua_sleep_handler>,         <<< ngx.sleep()触发的timer
    index = 0,
    log = 0x61200000b630,
    timer = {
        key = 1472011644596,
        left = 0x32d2a20 <ngx_event_timer_sentinel>,
        right = 0x32d2a20 <ngx_event_timer_sentinel>,
        parent = 0x6330000009c8,
        color = 1 '\001',
        data = 0 '\000'
    },
    closed = 0,
    channel = 0,
    resolver = 0,
    next = 0x0,
    prev = 0x0
}
2

我们内部tengine有个功能,可以设置shutting down worker的最大持续时间。比如把这个时间设置为60s:如果老worker超过1分钟没有退出,就强制其退出,不过这会导致老worker上遗留的请求全部被中断处理(连接被关闭)。后续我们也会把这个功能开源出来:)

7

@chobits 亲,shutting down worker好久开源出来呀。

4

force_exit not working. nginx version:

 Tengine version: Tengine/2.2.1 (nginx/1.8.1)
built by gcc 4.4.7 20120313 (Red Hat 4.4.7-17) (GCC)
TLS SNI support enabled
configure arguments: --prefix=/home/netopsdev/nginx --with-http_stub_status_module --with-http_ssl_module --with-http_gzip_static_module --with-ipv6 --with-pcre --with-http_concat_module --with-http_v2_module --with-openssl=/usr/local/openssl-1.0.2h/ --with-force-exit

Tengine configuration: force_exit 600s; I ran strace on a worker that is shutting down, and it shows:

 # strace -s 1024 -p  13102
Process 13102 attached
gettimeofday({1509360833, 697103}, NULL) = 0
epoll_wait(14, {{EPOLLIN|EPOLLOUT, {u32=1829681969, u64=139717115820849}}}, 512, 43195070) = 1
gettimeofday({1509360833, 768518}, NULL) = 0
recvfrom(45, "2\r\n0\n\r\n", 131072, 0, NULL, NULL) = 7
writev(44, [{"2\r\n", 3}, {"0\n", 2}, {"\r\n", 2}], 3) = 7
recvfrom(45, 0x2278510, 131072, 0, 0, 0) = -1 EAGAIN (Resource temporarily unavailable)
epoll_wait(14, {{EPOLLOUT, {u32=1829681969, u64=139717115820849}}}, 512, 43200000) = 1
gettimeofday({1509360833, 769953}, NULL) = 0
epoll_wait(14, {{EPOLLIN|EPOLLOUT, {u32=1829681969, u64=139717115820849}}}, 512, 43199999) = 1
gettimeofday({1509360838, 769024}, NULL) = 0
recvfrom(45, "2\r\n0\n\r\n", 131072, 0, NULL, NULL) = 7
writev(44, [{"2\r\n", 3}, {"0\n", 2}, {"\r\n", 2}], 3) = 7
recvfrom(45, 0x2278510, 131072, 0, 0, 0) = -1 EAGAIN (Resource temporarily unavailable)
epoll_wait(14, {{EPOLLOUT, {u32=1829681969, u64=139717115820849}}}, 512, 43200000) = 1
gettimeofday({1509360838, 770277}, NULL) = 0
epoll_wait(14, {{EPOLLIN|EPOLLOUT, {u32=1829681969, u64=139717115820849}}}, 512, 43199999) = 1
7

这个进程运行了多久了?

3

cc @aholic 一起看下

7

@chobits 这个gdb的调试脚本适用于nginx吗?而不只是tengine

3

hi @JoyChou93 可以用,tengine对timer的红黑树的数据结构没有动