httpd-38227

Bug report link:

https://issues.apache.org/bugzilla/show_bug.cgi?id=38227

Version:

2.2.0

How is this diagnosed:

We could not reproduce this failure, so the diagnosis relies on source-code understanding.

Symptom:

Hang caused by resource leak.

Apache is used in the following configuration:

tomcat (backend app server) <=> apache (proxypass) <=> client
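
For concreteness, a minimal httpd.conf sketch of such a setup (the exact directives, ports, and paths are our assumptions, not taken from the report; a balancer is shown because the eventual patch lands in mod_proxy_balancer):

<Proxy balancer://mycluster>
    BalancerMember ajp://127.0.0.1:8009
</Proxy>
ProxyPass /app balancer://mycluster/app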

Apache failed to reuse existing backend connections, eventually causing the Tomcat server to exhaust its thread pool: every time a new request arrived, Apache created a new backend connection to handle it, and the old connections were never closed, so each one kept pinning a Tomcat thread.

At the same time, the Apache server would print the following:

[Wed Jan 11 11:20:33 2006] [error] (70007)The timeout specified has expired:
ajp_ilink_receive() can't receive header
[Wed Jan 11 11:20:33 2006] [error] ajp_read_header: ajp_ilink_receive failed
[Wed Jan 11 11:20:33 2006] [error] (120006)APR does not understand this error
code: proxy: read response failed from (null) (127.0.0.1)

NOTE:

Since Apache does not have any resource management mechanism, the users and developers were at first misled by the symptom:

U: Reports that Apache cannot connect to Tomcat.

D: Proposes workaround 1: change Apache’s configuration.

U: Still has the problem...

… Discussion …

U: Suspects Tomcat is misconfigured; proposes workaround 2.

… …

U: Rejects workaround 2: it does not solve the problem.

… …

U: Finally realizes the problem is caused by Apache leaking sockets.

Also, it seems Tomcat does not handle resource exhaustion well either: it printed no error message about the exhaustion!

Root cause:

'worker->cp->conn' was never initialized, so every request creates a new connection via 'connection_constructor'. This eventually results in thread exhaustion on the Tomcat server.
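
For reference, the fields involved (a simplified sketch of proxy_conn_pool from 2.2's mod_proxy.h; comments ours):

typedef struct {
    apr_pool_t     *pool;  /* pool used by the constructor/destructor */
    apr_sockaddr_t *addr;  /* preparsed backend address */
    apr_reslist_t  *res;   /* connection resource list (when hmax is set) */
    proxy_conn_rec *conn;  /* single cached connection (otherwise) */
} proxy_conn_pool;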

Let’s first see where the messages are printed from:

apr_status_t ajp_ilink_receive(apr_socket_t *sock, ajp_msg_t *msg)
{
    … …
    /* The error message is printed because ilink_read returns a failure
       status. The HANG also occurred inside ilink_read. */
    status = ilink_read(sock, msg->buf, hlen);
    if (status != APR_SUCCESS) {
        ap_log_error(APLOG_MARK, APLOG_ERR, status, NULL,
                     "ajp_ilink_receive() can't receive header");
        return AJP_ENO_HEADER;
    }
    … …
}

ajp_read_header(... …)
{
    … …
    rc = ajp_ilink_receive(sock, *msg);
    if (rc != APR_SUCCESS) {
        ap_log_error(APLOG_MARK, APLOG_ERR, 0, r->server,
                     "ajp_read_header: ajp_ilink_receive failed");
        return rc;
    }
    … …
}

ap_proxy_ajp_request(... …)
{
    … …
    /* read the response */
    conn->data = NULL;
    status = ajp_read_header(conn->sock, r, (ajp_msg_t **)&(conn->data));
    if (status != APR_SUCCESS) {
        /* We had a failure: Close connection to backend */
        conn->close++;
        apr_brigade_destroy(input_brigade);
        ap_log_error(APLOG_MARK, APLOG_ERR, status, r->server,
                     "proxy: read response failed from %pI (%s)",
                     conn->worker->cp->addr, conn->worker->hostname);
        return HTTP_SERVICE_UNAVAILABLE;
    }
    … …
}

----------------------- Now, where it hangs ----------------

The hang sits at the bottom of the call chain: ap_proxy_ajp_request → ajp_read_header → ajp_ilink_receive → ilink_read → apr_socket_recv → apr_wait_for_io_or_timeout → apr_poll.

apr_status_t ilink_read(... …)
{
    /* Hangs in apr_socket_recv. */
    status = apr_socket_recv(sock, (char *)(buf + rdlen), &length);
    if (status != APR_SUCCESS)
        return status;   /* any error -- in the failing run, it returned here */

    return APR_SUCCESS;
}

/* The hang is in apr_socket_recv. It uses the same timeout-based I/O implementation as described in Apache bug 22030. */

apr_status_t apr_socket_recv(apr_socket_t *sock, char *buf, apr_size_t *len)
{
    do {
        /* Non-blocking read: it returns immediately when nothing can be read. */
        rv = read(sock->socketdes, buf, (*len));
    } while (rv == -1 && errno == EINTR);

    while ((rv == -1) && (errno == EAGAIN || errno == EWOULDBLOCK)
                      && (sock->timeout > 0)) {
do_select:
        /* The read failed, and Apache uses apr_wait_for_io_or_timeout to poll
           the socket. We could automatically place a log message here, since
           the ‘read’ system call returned an error result. Indeed, when
           apr_wait_for_io_or_timeout is called below, it hangs. */
        arv = apr_wait_for_io_or_timeout(NULL, sock, 1);
        if (arv != APR_SUCCESS) {
            *len = 0;
            return arv;
        }
        else {
            do {
                rv = read(sock->socketdes, buf, (*len));
            } while (rv == -1 && errno == EINTR);
        }
    }

    if (rv == -1) {
        (*len) = 0;
        return errno;
    }
    … …
}

apr_status_t apr_wait_for_io_or_timeout(apr_file_t *f, apr_socket_t *s,
                                        int for_read)
{
    …
    do {
        /* Hangs within apr_poll. */
        srv = apr_poll(&pollset, 1, &n, timeout);
        if (n == 1 && pollset.rtnevents & type) {
            return APR_SUCCESS;
        }
    } while (APR_STATUS_IS_EINTR(srv));

    return srv;
}

APR_DECLARE(apr_status_t) apr_poll(apr_pollfd_t *aprset, apr_int32_t num,
                                   apr_int32_t *nsds, apr_interval_time_t timeout)
{
    ...
    if (timeout > 0) {
        timeout /= 1000; /* convert microseconds to milliseconds */
    }

    /* Where it hangs! */
    rv = poll(pollset->pollset, pollset->nelts, timeout);
    (*num) = rv;

    if (rv < 0) {
        return apr_get_netos_error();
    }
    if (rv == 0) {
        /* We can also put a message here -- indeed, it timed out!!! */
        return APR_TIMEUP;
    }
}
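
Putting the chain together: the pattern is a non-blocking read() followed by poll() with a timeout. As a minimal standalone sketch of the same pattern in plain POSIX (the function name and error handling here are ours, not APR's):

#include <errno.h>
#include <poll.h>
#include <unistd.h>

/* Read from a non-blocking socket, waiting at most timeout_ms for data. */
static ssize_t read_with_timeout(int fd, void *buf, size_t len, int timeout_ms)
{
    ssize_t rv;
    do {
        rv = read(fd, buf, len);            /* non-blocking read */
    } while (rv == -1 && errno == EINTR);

    while (rv == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
        struct pollfd pfd = { fd, POLLIN, 0 };
        int n = poll(&pfd, 1, timeout_ms);  /* where httpd sat blocked */
        if (n == 0) {                       /* timed out */
            errno = ETIMEDOUT;
            return -1;
        }
        if (n == -1) {
            if (errno == EINTR)
                continue;                   /* interrupted: poll again */
            return -1;
        }
        do {
            rv = read(fd, buf, len);        /* data should be ready now */
        } while (rv == -1 && errno == EINTR);
    }
    return rv;
}

When the backend is alive but out of threads, the connection is presumably accepted by the OS yet never serviced, so no response bytes arrive and the caller sits in poll() until the timeout fires -- exactly the "timeout specified has expired" error seen in the log above.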

However, knowing the failure symptom alone is not sufficient to diagnose the failure. The developers further instrumented Apache with additional debug log messages and asked the user to reproduce the failure and collect the log output:

--- proxy_util-old.c        2005-11-10 07:20:05.000000000 -0800
+++ proxy_util.c        2006-04-05 21:09:13.000000000 -0700
@@ -1729,18 +1730,22 @@

ap_proxy_acquire_connection(... …) {
    … …
    /* Instrumented log message: examine the fields of the ‘worker’ data
       structure. */
+   ap_log_error(APLOG_MARK, APLOG_ERR, 0, s, "1: worker->hmax: %d, worker->cp->res: %s",
+                worker->hmax, (worker->cp->res ? "not null" : "null"));
    if (worker->hmax && worker->cp->res) {
        /* worker->cp->res -- ‘cp’ is the proxy_conn_pool structure; ‘res’ is
           the connection resource list.
           worker->hmax -- hard maximum on the total number of connections. */
        rv = apr_reslist_acquire(worker->cp->res, (void **)conn);
+       ap_log_error(APLOG_MARK, APLOG_ERR, 0, s, "2: calling apr_res_list_acquire conn");
    }
    else {
        /* create the new connection if the previous was destroyed */
        if (!worker->cp->conn) {
            connection_constructor((void **)conn, worker, worker->cp->pool);
            /* In the buggy execution, the message below is printed, indicating
               worker->cp->conn == NULL! This causes every request to create a
               new connection via connection_constructor above (and, in turn, a
               new worker thread on the Tomcat side)! */
+           ap_log_error(APLOG_MARK, APLOG_ERR, 0, s, "2: called connection_constructor");
        } else {
            *conn = worker->cp->conn;
            worker->cp->conn = NULL;
+           ap_log_error(APLOG_MARK, APLOG_ERR, 0, s, "2: used conn = worker->cp->conn");
        }
        rv = APR_SUCCESS;
    }

ap_proxy_initialize_worker(... …) {
    … …
    if (worker->hmax) {
        rv = apr_reslist_create(&(worker->cp->res),
                                worker->min, worker->smax,
                                worker->hmax, worker->ttl,
                                connection_constructor, connection_destructor,
                                worker, worker->cp->pool);
+       ap_log_error(APLOG_MARK, APLOG_ERR, 0, s, "0: worker->hmax: %d, worker->cp->res: %s",
+                    worker->hmax, (worker->cp->res ? "not null" : "null"));
        apr_pool_cleanup_register(worker->cp->pool, (void *)worker,
                                  conn_pool_cleanup, apr_pool_cleanup_null);
        … …
    }
}

In the buggy run, the ‘2: called connection_constructor’ message is printed on every request, while the ‘1:’ message reports worker->cp->res as null (the reslist is only ever created in ap_proxy_initialize_worker), confirming that the connection pool was never initialized. OK, now, what is the patch?

--- httpd/httpd/trunk/modules/proxy/mod_proxy_balancer.c 394445
+++ httpd/httpd/trunk/modules/proxy/mod_proxy_balancer.c 394446
@@ -88,6 +88,7 @@ static int init_balancer_members(proxy_s

init_balancer_members(... …) {
    … …
    for (i = 0; i < balancer->workers->nelts; i++) {
        ap_proxy_initialize_worker_share(conf, workers, s);
+       ap_proxy_initialize_worker(workers, s);
        ++workers;
    }
}

----------------------------------------------------------------------------------------------------

Here is the analysis:

1) The error messages are printed because the Tomcat server is dead, which is caused by thread exhaustion.

2) The root cause, as the patch shows: ‘init_balancer_members’ did not call ‘ap_proxy_initialize_worker’, which would have set up ‘worker->cp->conn’ (and created ‘worker->cp->res’).

3) The error: ‘ap_proxy_acquire_connection’ therefore calls ‘connection_constructor’, which creates a new connection, instead of reusing an existing connection in the ‘else’ branch (see the sketch below).
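
For intuition: with the reslist in place, connections follow the standard apr_reslist acquire/release lifecycle, so they are bounded by ‘hmax’ and reused instead of leaked. A generic sketch of the APR API (not the exact mod_proxy code):

/* Acquire reuses an idle resource when one is available, constructing new
   ones only up to hmax; release returns the resource for later reuse. */
rv = apr_reslist_acquire(worker->cp->res, (void **)&conn);
/* ... send the request to the backend and read the response ... */
rv = apr_reslist_release(worker->cp->res, conn);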

Can Errlog automatically handle it?

Yes. Besides logging where it hangs (at the error return value of ‘read’), we can also use resource-allocation sampling to diagnose the root cause: adaptively sample every resource allocation (in this case, ‘connection_constructor’, where a new connection is allocated) and print a log message.
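
A hedged sketch of what such allocation sampling could look like (our illustration of the idea, not Errlog's actual implementation): log the 1st, 2nd, 4th, 8th, ... call at each allocation site, so a runaway site keeps reporting while the logging overhead stays logarithmic.

#include <stdio.h>

/* Log only the power-of-two hits of an allocation site. */
#define SAMPLED_ALLOC_LOG(counter, site)                                   \
    do {                                                                   \
        unsigned long n_ = ++(counter);                                    \
        if ((n_ & (n_ - 1)) == 0)  /* n_ is a power of two */              \
            fprintf(stderr, "[alloc-sample] %s: call #%lu\n", (site), n_); \
    } while (0)

static unsigned long conn_ctor_calls;  /* per-site counter (hypothetical) */

/* e.g., at the top of connection_constructor:
       SAMPLED_ALLOC_LOG(conn_ctor_calls, "connection_constructor");
   In this bug the sample count would keep doubling instead of plateauing,
   pointing straight at the leaked connections. */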

Is there an Error Message?

Yes