[ipxe-devel] [PATCH] [tcp] Send keepalive packets to prevent TCP stalls

Ladi Prosek lprosek at redhat.com
Thu Jun 9 07:58:59 UTC 2016


When running on less than perfect networks, TCP stalls or freezes have
been observed with the imgfetch/boot family of commands. There is nothing
wrong with either the server or the iPXE implementation of TCP; rather,
the network breaks in a way that makes it impossible for the connection
to recover. For example, the connection may be evicted from a NAT table
somewhere along the way, or a stray frame with a conflicting MAC address
may confuse local Ethernet switches and effectively cut the client off
from the network.

Here's an example user report and analysis of such a condition (link loss
during image download):
http://lists.ipxe.org/pipermail/ipxe-devel/2014-October/003829.html

The --timeout option helps detect the problem but could be wasteful in
terms of time and network bandwidth. There is no guarantee that the next
download attempt will succeed.

This patch implements an alternative approach. When the connection is
believed to be stalled, a TCP keepalive packet is sent to the server.
The resulting ACK replies (if the server is actually alive) are ignored
or processed as usual, i.e. no actual liveness detection is being added.
The point of the packet is to reestablish the L2/L3 path from the server
back to the client so the download can resume.

This keepalive implementation has proven very effective in dealing with
freezes. Real-world customer deployments have shown the rate of failed
boots drop from 5% to virtually zero.

There is no doubt that this use of TCP keepalive is unorthodox, to say
the least. What sets iPXE apart from a typical OS environment is the
absence of other network traffic, which would normally help, especially
with L2 breaks. The one TCP connection tends to be the only thing running
on the client. When it stalls, there's nothing to make the network aware
of the host's existence.

The functionality is controlled by a new "tcp-keepalive" setting, which
defaults to off for full backward compatibility.

Signed-off-by: Ladi Prosek <lprosek at redhat.com>
---
 src/net/tcp.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)

diff --git a/src/net/tcp.c b/src/net/tcp.c
index 68128e8..a5ab296 100644
--- a/src/net/tcp.c
+++ b/src/net/tcp.c
@@ -115,6 +115,8 @@ struct tcp_connection {
 	struct retry_timer timer;
 	/** Shutdown (TIME_WAIT) timer */
 	struct retry_timer wait;
+	/** Keepalive timer */
+	struct retry_timer keepalive;
 
 	/** Pending operations for SYN and FIN */
 	struct pending_operation pending_flags;
@@ -132,6 +134,8 @@ enum tcp_flags {
 	TCP_ACK_PENDING = 0x0004,
 	/** TCP selective acknowledgement is enabled */
 	TCP_SACK_ENABLED = 0x0008,
+	/** TCP keepalive setting is enabled */
+	TCP_KEEPALIVE_ENABLED = 0x0010,
 };
 
 /** TCP internal header
@@ -178,10 +182,18 @@ static struct process_descriptor tcp_process_desc;
 static struct interface_descriptor tcp_xfer_desc;
 static void tcp_expired ( struct retry_timer *timer, int over );
 static void tcp_wait_expired ( struct retry_timer *timer, int over );
+static void tcp_keepalive_expired ( struct retry_timer *timer, int over );
 static struct tcp_connection * tcp_demux ( unsigned int local_port );
 static int tcp_rx_ack ( struct tcp_connection *tcp, uint32_t ack,
 			uint32_t win );
 
+/** The "tcp-keepalive" setting */
+const struct setting tcp_keepalive_setting __setting ( SETTING_MISC, tcp-keepalive ) = {
+	.name = "tcp-keepalive",
+	.description = "Use keepalive packets to prevent TCP stalls",
+	.type = &setting_type_int8,
+};
+
 /**
  * Name TCP state
  *
@@ -285,6 +297,7 @@ static int tcp_open ( struct interface *xfer, struct sockaddr *peer,
 	process_init_stopped ( &tcp->process, &tcp_process_desc, &tcp->refcnt );
 	timer_init ( &tcp->timer, tcp_expired, &tcp->refcnt );
 	timer_init ( &tcp->wait, tcp_wait_expired, &tcp->refcnt );
+	timer_init ( &tcp->keepalive, tcp_keepalive_expired, &tcp->refcnt );
 	tcp->prev_tcp_state = TCP_CLOSED;
 	tcp->tcp_state = TCP_STATE_SENT ( TCP_SYN );
 	tcp_dump_state ( tcp );
@@ -314,6 +327,11 @@ static int tcp_open ( struct interface *xfer, struct sockaddr *peer,
 	tcp->local_port = port;
 	DBGC ( tcp, "TCP %p bound to port %d\n", tcp, tcp->local_port );
 
+	if ( fetch_intz_setting ( NULL, &tcp_keepalive_setting ) ) {
+		DBGC ( tcp, "TCP %p keepalive is enabled\n", tcp );
+		tcp->flags |= TCP_KEEPALIVE_ENABLED;
+	}
+
 	/* Start timer to initiate SYN */
 	start_timer_nodelay ( &tcp->timer );
 
@@ -381,6 +399,7 @@ static void tcp_close ( struct tcp_connection *tcp, int rc ) {
 		process_del ( &tcp->process );
 		stop_timer ( &tcp->timer );
 		stop_timer ( &tcp->wait );
+		stop_timer ( &tcp->keepalive );
 		list_del ( &tcp->list );
 		ref_put ( &tcp->refcnt );
 		DBGC ( tcp, "TCP %p connection deleted\n", tcp );
@@ -823,6 +842,71 @@ static void tcp_wait_expired ( struct retry_timer *timer, int over __unused ) {
 }
 
 /**
+ * Keepalive timer expired
+ *
+ * @v timer		Keepalive timer
+ * @v over		Failure indicator
+ */
+static void tcp_keepalive_expired ( struct retry_timer *timer, int over __unused ) {
+	struct tcp_connection *tcp =
+		container_of ( timer, struct tcp_connection, keepalive );
+
+	struct io_buffer *iobuf;
+	struct tcp_header *tcphdr;
+	int rc;
+
+	/* If the connection is not established, do nothing */
+	if ( tcp->tcp_state != TCP_ESTABLISHED ) {
+		return;
+	}
+
+	/* If retransmission timer is running, there's no need to send keepalive */
+	if ( timer_running ( &tcp->timer ) ) {
+		start_timer ( &tcp->keepalive );
+		return;
+	}
+
+	DBGC ( tcp, "TCP %p sending keepalive packet\n", tcp );
+
+	/* Allocate I/O buffer */
+	iobuf = alloc_iob ( TCP_MAX_HEADER_LEN );
+	if ( ! iobuf ) {
+		DBGC ( tcp, "TCP %p could not allocate iobuf for %08x..%08x "
+		       "%08x\n", tcp, tcp->snd_seq, tcp->snd_seq,
+		       tcp->rcv_ack );
+		return;
+	}
+	iob_reserve ( iobuf, TCP_MAX_HEADER_LEN );
+
+	/* Fill up the TCP header */
+	tcphdr = iob_push ( iobuf, sizeof ( *tcphdr ) );
+	memset ( tcphdr, 0, sizeof ( *tcphdr ) );
+	tcphdr->src = htons ( tcp->local_port );
+	tcphdr->dest = tcp->peer.st_port;
+
+	/* Sequence number one less than the current one indicates keepalive */
+	tcphdr->seq = htonl ( tcp->snd_seq - 1 );
+	tcphdr->ack = htonl ( tcp->rcv_ack );
+	tcphdr->hlen = ( ( sizeof ( *tcphdr ) / 4 ) << 4 );
+	tcphdr->flags = TCP_ACK;
+	tcphdr->win = htons ( tcp->rcv_win >> tcp->rcv_win_scale );
+	tcphdr->csum = tcpip_chksum ( iobuf->data, iob_len ( iobuf ) );
+
+	/* Transmit packet */
+	if ( ( rc = tcpip_tx ( iobuf, &tcp_protocol, NULL, &tcp->peer, NULL,
+			       &tcphdr->csum ) ) != 0 ) {
+		DBGC ( tcp, "TCP %p could not transmit %08x..%08x %08x: %s\n",
+		       tcp, tcp->snd_seq, ( tcp->snd_seq + tcp->snd_sent ),
+		       tcp->rcv_ack, strerror ( rc ) );
+	}
+
+	/* Reschedule the timer. We may need to send multiple keepalive
+	 * packets before the server side starts responding again.
+	 */
+	start_timer ( &tcp->keepalive );
+}
+
+/**
  * Send RST response to incoming packet
  *
  * @v in_tcphdr		TCP header of incoming packet
@@ -1434,6 +1518,12 @@ static int tcp_rx ( struct io_buffer *iobuf,
 		goto discard;
 	}
 
+	if ( tcp->flags & TCP_KEEPALIVE_ENABLED ) {
+		/* The connection is alive, restart the keepalive timer */
+		stop_timer ( &tcp->keepalive );
+		start_timer ( &tcp->keepalive );
+	}
+
 	/* Record old data-transfer window */
 	old_xfer_window = tcp_xfer_window ( tcp );
 
-- 
2.5.5




More information about the ipxe-devel mailing list