rfc8684xml2.original.xml   rfc8684.xml 
<?xml version="1.0" encoding="US-ASCII"?> <?xml version='1.0' encoding='utf-8'?>
<!-- Convert to HTML and Text with xml2rfc: http://xml2rfc.ietf.org. -->
<!DOCTYPE rfc SYSTEM "rfc2629.dtd" [ <!DOCTYPE rfc SYSTEM "rfc2629-xhtml.ent">
<!ENTITY RFC5533 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. <rfc xmlns:xi="http://www.w3.org/2001/XInclude" submissionType="IETF"
RFC.5533.xml"> category="std" consensus="true" docName="draft-ietf-mptcp-rfc6824bis-18" nu
<!ENTITY RFC5062 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. mber="8684" ipr="trust200902" obsoletes="6824" updates="" xml:lang="en" tocInclu
RFC.5062.xml"> de="true" symRefs="true" sortRefs="true" version="3">
<!ENTITY RFC5061 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.
RFC.5061.xml"> <!-- xml2rfc v2v3 conversion 2.27.0 -->
<!ENTITY RFC4960 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.
RFC.4960.xml">
<!ENTITY RFC4987 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.
RFC.4987.xml">
<!ENTITY RFC6234 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.
RFC.6234.xml">
<!ENTITY RFC4086 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.
RFC.4086.xml">
<!ENTITY RFC5681 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.
RFC.5681.xml">
<!ENTITY RFC2119 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.
RFC.2119.xml">
<!ENTITY RFC2992 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.
RFC.2992.xml">
<!ENTITY RFC2979 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.
RFC.2979.xml">
<!ENTITY RFC2104 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.
RFC.2104.xml">
<!ENTITY RFC2018 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.
RFC.2018.xml">
<!ENTITY RFC1918 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.
RFC.1918.xml">
<!ENTITY RFC0793 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.
RFC.0793.xml">
<!ENTITY RFC7323 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.
RFC.7323.xml">
<!ENTITY RFC1122 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.
RFC.1122.xml">
<!ENTITY RFC3135 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.
RFC.3135.xml">
<!ENTITY RFC3022 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.
RFC.3022.xml">
<!ENTITY RFC6181 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.
RFC.6181.xml">
<!ENTITY RFC6182 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.
RFC.6182.xml">
<!ENTITY RFC6356 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.
RFC.6356.xml">
<!ENTITY RFC6555 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.
RFC.6555.xml">
<!ENTITY RFC8126 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.
RFC.8126.xml">
<!ENTITY RFC6897 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.
RFC.6897.xml">
<!ENTITY RFC6528 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.
RFC.6528.xml">
<!ENTITY RFC5961 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.
RFC.5961.xml">
<!ENTITY RFC7413 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.
RFC.7413.xml">
<!ENTITY RFC7430 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.
RFC.7430.xml">
<!ENTITY RFC8174 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.
RFC.8174.xml">
<!ENTITY RFC8041 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.
RFC.8041.xml">
]>
<?xml-stylesheet type='text/xsl' href='rfc2629.xslt' ?>
<?rfc strict="no" ?>
<?rfc toc="yes"?>
<?rfc tocdepth="4"?>
<?rfc symrefs="yes"?>
<?rfc sortrefs="yes" ?>
<?rfc compact="yes" ?>
<?rfc subcompact="no" ?>
<?rfc rfcedstyle="yes"?>
<rfc category="std" docName="draft-ietf-mptcp-rfc6824bis-18" ipr="trust200902" obsoletes="6824">
<front> <front>
<title abbrev="Multipath TCP">TCP Extensions for Multipath Operation with Multiple Addresses</title> <title abbrev="Multipath TCP">TCP Extensions for Multipath Operation with Multiple Addresses</title>
<seriesInfo name="RFC" value="8684"/>
<author fullname="Alan Ford" initials="A." surname="Ford"> <author fullname="Alan Ford" initials="A." surname="Ford">
<organization>Pexip</organization> <organization>Pexip</organization>
<address> <address>
<!-- <postal>
<street>Beech Court</street>
<city>Hurst</city>
<region>Berkshire</region>
<code>RG10 0RQ</code>
<country>UK</country>
</postal> -->
<email>alan.ford@gmail.com</email> <email>alan.ford@gmail.com</email>
</address> </address>
</author> </author>
<author fullname="Costin Raiciu" initials="C." surname="Raiciu"> <author fullname="Costin Raiciu" initials="C." surname="Raiciu">
<organization abbrev="U. Politechnica of Bucharest">University Politehnica of Bucharest</organization> <organization abbrev="U. Politehnica of Bucharest">University Politehnica of Bucharest</organization>
<address> <address>
<postal> <postal>
<street>Splaiul Independentei 313</street> <street>Splaiul Independentei 313</street>
<city>Bucharest</city> <city>Bucharest</city>
<country>Romania</country> <country>Romania</country>
</postal> </postal>
<email>costin.raiciu@cs.pub.ro</email> <email>costin.raiciu@cs.pub.ro</email>
</address> </address>
</author> </author>
<author fullname="Mark Handley" initials="M." surname="Handley"> <author fullname="Mark Handley" initials="M." surname="Handley">
<organization abbrev="U. College London">University College London</organization> <organization abbrev="U. College London">University College London</organization>
<address> <address>
<postal> <postal>
<street>Gower Street</street> <street>Gower Street</street>
<city>London</city> <city>London</city>
<code>WC1E 6BT</code> <code>WC1E 6BT</code>
<country>UK</country> <country>United Kingdom</country>
</postal> </postal>
<email>m.handley@cs.ucl.ac.uk</email> <email>m.handley@cs.ucl.ac.uk</email>
</address> </address>
</author> </author>
<author fullname="Olivier Bonaventure" initials="O." surname="Bonaventure"> <author fullname="Olivier Bonaventure" initials="O." surname="Bonaventure">
<organization abbrev="U. catholique de Louvain">Universit&eacute; catholiq <organization abbrev="U. catholique de Louvain" ascii="Universite catholique
ue de Louvain</organization> de Louvain">Université catholique de Louvain</organization>
<address> <address>
<postal> <postal>
<street>Pl. Ste Barbe, 2</street> <street>Pl. Ste Barbe, 2</street>
<code>1348</code> <code>1348</code>
<city>Louvain-la-Neuve</city> <city>Louvain-la-Neuve</city>
<country>Belgium</country> <country>Belgium</country>
</postal> </postal>
<email>olivier.bonaventure@uclouvain.be</email> <email>olivier.bonaventure@uclouvain.be</email>
</address> </address>
</author> </author>
skipping to change at line 101 skipping to change at line 55
<address> <address>
<postal> <postal>
<street>Pl. Ste Barbe, 2</street> <street>Pl. Ste Barbe, 2</street>
<code>1348</code> <code>1348</code>
<city>Louvain-la-Neuve</city> <city>Louvain-la-Neuve</city>
<country>Belgium</country> <country>Belgium</country>
</postal> </postal>
<email>olivier.bonaventure@uclouvain.be</email> <email>olivier.bonaventure@uclouvain.be</email>
</address> </address>
</author> </author>
<author fullname="Christoph Paasch" initials="C." surname="Paasch"> <author fullname="Christoph Paasch" initials="C." surname="Paasch">
<organization abbrev="Apple, Inc.">Apple, Inc.</organization> <organization abbrev="Apple, Inc.">Apple, Inc.</organization>
<address> <address>
<postal> <postal>
<street></street> <street/>
<city>Cupertino</city> <city>Cupertino</city>
<country>US</country> <region>CA</region>
<country>United States of America</country>
</postal> </postal>
<email>cpaasch@apple.com</email> <email>cpaasch@apple.com</email>
</address> </address>
</author> </author>
<date year="2020" month="January"/>
<date year="2019" />
<area>General</area>
<workgroup>Internet Engineering Task Force</workgroup>
<keyword>tcp extensions multipath multihomed subflow</keyword> <keyword>tcp extensions multipath multihomed subflow</keyword>
<abstract> <abstract>
<t>TCP/IP communication is currently restricted to a single path per conne <t>TCP/IP communication is currently restricted to a single path per conne
ction, yet multiple paths often exist between peers. The simultaneous use of the ction, yet multiple paths often exist between peers. The simultaneous use of the
se multiple paths for a TCP/IP session would improve resource usage within the n se multiple paths for a TCP/IP session would improve resource usage within the n
etwork and, thus, improve user experience through higher throughput and improved etwork and thus improve user experience through higher throughput and improved r
resilience to network failure.</t> esilience to network failure.</t>
<t>Multipath TCP provides the ability to simultaneously use multiple
<t>Multipath TCP provides the ability to simultaneously use multiple paths paths between peers. This document presents a set of extensions to
between peers. This document presents a set of extensions to traditional TCP to traditional TCP to support multipath operation. The protocol offers the
support multipath operation. The protocol offers the same type of service to ap same type of service to applications as TCP (i.e., a reliable bytestream),
plications as TCP (i.e., reliable bytestream), and it provides the components ne and it provides the components necessary to establish and use multiple TCP flow
cessary to establish and use multiple TCP flows across potentially disjoint path s across potentially disjoint paths.</t>
s.</t> <t>This document specifies v1 of Multipath TCP, obsoleting v0 as
specified in RFC 6824, through clarifications and modifications primarily
<t>This document specifies v1 of Multipath TCP, obsoleting v0 as specified driven by deployment experience.</t>
in RFC6824, through clarifications and modifications primarily driven by deploy
ment experience.</t>
</abstract> </abstract>
</front> </front>
<middle> <middle>
<section title="Introduction" anchor="sec_intro"> <section anchor="sec_intro" numbered="true" toc="default">
<t>Multipath TCP (MPTCP) is a set of extensions to regular TCP <xref targe <name>Introduction</name>
t="RFC0793"/> to provide a Multipath TCP <xref target="RFC6182"/> service, which <t>Multipath TCP (MPTCP) is a set of extensions to regular TCP <xref
enables a transport connection to operate across multiple paths target="RFC0793" format="default"/> to provide a Multipath TCP service <xr
simultaneously. This document presents the protocol changes required to add mult ef target="RFC6182" format="default"/>, which enables a transport connection to
ipath capability to TCP; specifically, those for signaling and setting up multip operate across multiple paths
le paths ("subflows"), managing these subflows, reassembly of data, and terminat simultaneously. This document presents the protocol changes required to add
ion of sessions. multipath capability to TCP -- specifically, those for signaling and setting
This is not the only information required to create a Multipath TCP implem up multiple paths ("subflows"), managing these subflows, reassembly of data,
entation, however. This document is complemented by three others: and termination of sessions. This is not the only information required to create
<list style="symbols"> a Multipath TCP implementation, however. This document is complemented by three
<t>Architecture <xref target="RFC6182"/>, which explains the motivatio others:
ns behind Multipath TCP, contains a discussion of high-level design decisions on
which this design is based, and an explanation of a functional separation throu
gh which an extensible MPTCP implementation can be developed.</t>
<t>Congestion control <xref target="RFC6356"/> presents a safe congest
ion control algorithm for coupling the behavior of the multiple paths in order t
o "do no harm" to other network users.</t>
<t>Application considerations <xref target="RFC6897"/> discusses what
impact MPTCP will have on applications, what applications will want to do with M
PTCP, and as a consequence of these factors, what API extensions an MPTCP implem
entation should present.</t>
</list>
This document is an update to, and obsoletes, the v0 specification of Mult
ipath TCP (RFC6824). This document specifies MPTCP v1, which is not backward com
patible with MPTCP v0. This document additionally defines version negotiation pr
ocedures for implementations that support both versions.
</t> </t>
<ul spacing="normal">
<section title="Design Assumptions" anchor="sec_assum"> <li><xref target="RFC6182" format="default"/> (MPTCP architecture), whic
<t>In order to limit the potentially huge design space, the mptcp workin h
g group imposed two key constraints on the Multipath TCP design presented in thi explains the motivations behind Multipath TCP, contains a discussion
s document: of high-level design decisions on which this design is based, and provid
<list style="symbols"> es an explanation of a functional separation through which an extensible MPTCP i
<t>It must be backwards-compatible with current, regular TCP, to inc mplementation can be developed.</li>
rease its chances of deployment.</t> <li><xref target="RFC6356" format="default"/> (congestion control), whic
<t>It can be assumed that one or both hosts are multihomed and multi h presents a safe congestion control algorithm for coupling the behavior of the
addressed.</t> multiple paths in order to "do no harm" to other network users.</li>
</list> <li><xref target="RFC6897"
format="default"/> (application considerations), which discusses what im
pact MPTCP will have on applications, what applications will want to do with MPT
CP, and as a consequence of these factors, what API extensions an MPTCP implemen
tation should present.</li>
</ul>
<t>
This document obsoletes the v0 specification of
Multipath TCP <xref target="RFC6824"/>. This document specifies MPTCP v1,
which is not backward compatible with MPTCP v0. This document additionally defin
es version negotiation procedures for implementations that support both versions
.
</t>
<section anchor="sec_assum" numbered="true" toc="default">
<name>Design Assumptions</name>
<t>In order to limit the potentially huge design space, the
MPTCP Working Group imposed two key constraints on the Multipath TCP des
ign presented in this document:
</t> </t>
<t>To simplify the design, we assume that the presence of multiple addre <ul spacing="normal">
sses at a host is sufficient to indicate the existence of multiple paths. These <li>It must be backward compatible with current, regular TCP, to incre
paths need not be entirely disjoint: they may share one or many routers between ase its chances of deployment.</li>
them. Even in such a situation, making use of multiple paths is beneficial, impr <li>It can be assumed that one or both hosts are multihomed and multia
oving resource utilization and resilience to a subset of node failures. The cong ddressed.</li>
estion control algorithms defined in <xref target="RFC6356"/> ensure this does n </ul>
ot act detrimentally. Furthermore, there may be some scenarios where different T <t>To simplify the design, we assume that the presence of multiple
CP ports on a single host can provide disjoint paths (such as through certain Eq addresses at a host is sufficient to indicate the existence of
ual-Cost Multipath (ECMP) implementations <xref target="RFC2992"/>), and so the multiple paths. These paths need not be entirely disjoint: they may
MPTCP design also supports the use of ports in path identifiers.</t> share one or many routers between them. Even in such a situation,
<t>There are three aspects to the backwards-compatibility listed above ( making use of multiple paths is beneficial, improving resource
discussed in more detail in <xref target="RFC6182"/>): utilization and resilience to a subset of node failures. The
<list style="hanging"> congestion control algorithm defined in <xref target="RFC6356"
<t hangText="External Constraints:"> The protocol must function thro format="default"/> ensures that the use of multiple paths does not act d
ugh the vast majority of existing etrimentally.
middleboxes such as NATs, firewalls, and proxies, and as such must resemble exis Furthermore, there may be some scenarios where different TCP ports on a
ting TCP as far as possible on the single host can provide disjoint paths (such as through certain
wire. Furthermore, the protocol must not assume the segments it sends on the wir Equal-Cost Multipath (ECMP) implementations <xref target="RFC2992"
e arrive unmodified at the destination: format="default"/>), and so the MPTCP design also supports the use of
they may be split or coalesced; TCP options may be removed or duplicated. </t> ports in path identifiers.</t>
<t hangText="Application Constraints:"> The protocol must be usable <t>There are three aspects to the backward compatibility listed above (d
with no change to existing applications that use the common TCP API (although it iscussed in more detail in <xref target="RFC6182" format="default"/>):
is reasonable that not all features would be available to such legacy applicati
ons). Furthermore, the protocol must provide the same service model as regular T
CP to the application.</t>
<t hangText="Fallback:"> The protocol should be able to fall back to
standard TCP with no interference from the user, to be able to communicate with
legacy hosts.</t>
</list>
</t> </t>
<t>The complementary application considerations document <xref target="R <dl newline="false" spacing="normal" indent="3">
FC6897"/> discusses the necessary features of an API to provide backwards-compat <dt>External Constraints:</dt>
ibility, as well as API extensions to convey the behavior of MPTCP at a level of <dd> The protocol must function through the vast majority of existing
control and information equivalent to that available with regular, single-path middleboxes such as NATs, firewalls, and proxies, and as such must resemble exis
TCP.</t> ting TCP as far as possible on the
<t>Further discussion of the design constraints and associated design de wire. Furthermore, the protocol must not assume that the segments it sends on th
cisions are given in the MPTCP Architecture document <xref target="RFC6182"/> an e wire arrive unmodified at the destination:
d in <xref target="howhard"/>.</t> they may be split or coalesced; TCP options may be removed or duplicated. </dd>
<dt>Application Constraints:</dt>
<dd> The protocol must be usable with no change to existing applicatio
ns that use the common TCP API (although it is reasonable that not all features
would be available to such legacy applications). Furthermore, the protocol must
provide the same service model as regular TCP to the application.</dd>
<dt>Fallback:</dt>
<dd> The protocol should be able to fall back to standard TCP with no
interference from the user, to be able to communicate with legacy hosts.</dd>
</dl>
<t>The complementary application considerations document <xref
target="RFC6897" format="default"/> discusses the necessary features
of an API to provide backward compatibility, as well as API extensions t
o convey the behavior of MPTCP at a level of control and information equivalent
to that available with regular, single-path TCP.</t>
<t>Further discussion of the design constraints and associated design de
cisions is given in the MPTCP architecture document <xref target="RFC6182" forma
t="default"/> and in <xref target="howhard" format="default"/>.</t>
</section> </section>
<section anchor="sec_layers" numbered="true" toc="default">
<section title="Multipath TCP in the Networking Stack" anchor="sec_layers" <name>Multipath TCP in the Networking Stack</name>
>
<t>MPTCP operates at the transport layer and aims to be transparent to both higher and lower <t>MPTCP operates at the transport layer and aims to be transparent to both higher and lower
layers. It is a set of additional features on top of standard TCP; <xref target="fig_arch" /> illustrates layers. It is a set of additional features on top of standard TCP; <xref target="fig_arch" format="default"/> illustrates
this layering. MPTCP is designed to be usable by legacy applications with no changes; detailed discussion this layering. MPTCP is designed to be usable by legacy applications with no changes; detailed discussion
of its interactions with applications is given in <xref target="RFC6897"/>.</t> of its interactions with applications is given in <xref target="RFC6897" format=
"default"/>.</t>
<figure align="center" anchor="fig_arch" title="Comparison of Standard T <figure anchor="fig_arch">
CP and MPTCP Protocol Stacks"> <name>Comparison of Standard TCP and MPTCP Protocol Stacks</name>
<artwork align="left"><![CDATA[ <artwork align="left" name="" type="" alt=""><![CDATA[
+-------------------------------+ +-------------------------------+
| Application | | Application |
+---------------+ +-------------------------------+ +---------------+ +-------------------------------+
| Application | | MPTCP | | Application | | MPTCP |
+---------------+ + - - - - - - - + - - - - - - - + +---------------+ + - - - - - - - + - - - - - - - +
| TCP | | Subflow (TCP) | Subflow (TCP) | | TCP | | Subflow (TCP) | Subflow (TCP) |
+---------------+ +-------------------------------+ +---------------+ +-------------------------------+
| IP | | IP | IP | | IP | | IP | IP |
+---------------+ +-------------------------------+ +---------------+ +-------------------------------+ ]]></artwork>
]]></artwork>
</figure> </figure>
</section> </section>
<section numbered="true" toc="default">
<section title="Terminology"> <name>Terminology</name>
<t>This document makes use of a number of terms that are either MPTCP-sp <t>This document makes use of a number of terms that are either MPTCP sp
ecific or have defined meaning in the context of MPTCP, as follows: ecific or have defined meaning in the context of MPTCP, as follows:
<list style="hanging"> </t>
<t hangText="Path:"> A sequence of links between a sender and a receiv <dl newline="false" spacing="normal" indent="3">
er, defined in this context by a 4-tuple of source and destination address/port <dt>Path:</dt>
pairs.</t> <dd> A sequence of links between a sender and a receiver, defined in t
<t hangText="Subflow:"> A flow of TCP segments operating over an indiv his context by a 4-tuple of source and destination address&wj;/port pairs.</dd>
idual path, which forms part of a larger MPTCP connection. A subflow is started <dt>Subflow:</dt>
and terminated similar to a regular TCP connection.</t> <dd> A flow of TCP segments operating over an individual path, which f
<t hangText="(MPTCP) Connection:"> A set of one or more subflows, over orms part of a larger MPTCP connection. A subflow is started and terminated simi
which an application can communicate between two hosts. There is a one-to-one m larly to a regular TCP connection.</dd>
apping between a connection and an application socket.</t> <dt>(MPTCP) Connection:</dt>
<t hangText="Data-level:"> The payload data is nominally transferred o <dd> A set of one or more subflows, over which an application can comm
ver a connection, which in turn is transported over subflows. Thus, the term "d unicate between two hosts. There is a one&#8209;to&#8209;one mapping between a c
ata-level" is synonymous with "connection level", in contrast to "subflow-level" onnection and an application socket.</dd>
, which refers to properties of an individual subflow.</t> <dt>Data-level:</dt>
<t hangText="Token:"> A locally unique identifier given to a multipath <dd> The payload data is nominally transferred over a connection, whic
connection by a host. May also be referred to as a "Connection ID".</t> h in turn is transported over subflows. Thus, the term "data-level" is synonymo
<t hangText="Host:"> An end host operating an MPTCP implementation, an us with "connection-level", in contrast to "subflow-level", which refers to prop
d either initiating or accepting an MPTCP connection.</t> erties of an individual subflow.</dd>
</list> <dt>Token:</dt>
In addition to these terms, note that MPTCP's interpretation of, and eff <dd> A locally unique identifier given to a multipath connection by a
ect on, regular single-path TCP semantics are discussed in <xref target="sec_sem host. May also be referred to as a "Connection ID".</dd>
antics"/>.</t> <dt>Host:</dt>
<dd> An end host operating an MPTCP implementation, and either initiat
ing or accepting an MPTCP connection.</dd>
</dl>
<t>
In addition to these terms, note that MPTCP's interpretation of, and eff
ect on, regular single-path TCP semantics are discussed in <xref target="sec_sem
antics" format="default"/>.</t>
</section> </section>
<section anchor="sec_operation" numbered="true" toc="default">
<section title="MPTCP Concept" anchor="sec_operation"> <name>MPTCP Concept</name>
<t>This section provides a high-level summary of normal <t>This section provides a high-level summary of normal
operation of MPTCP, and is illustrated by the scenario shown in operation of MPTCP; this type of scenario is illustrated in
<xref target="fig_scenario"/>. A detailed description of operation is given in < <xref target="fig_scenario" format="default"/>. A detailed description of how
xref target="sec_protocol"/>. MPTCP operates is given in <xref target="sec_protocol" format="default"/>.
<list style="symbols">
<t>To a non-MPTCP-aware application, MPTCP will behave the same as n
ormal TCP. Extended APIs could provide
additional control to MPTCP-aware applications <xref target="RFC6897"/>.
An application begins by opening a TCP socket in the normal way.
MPTCP signaling and operation are handled by the MPTCP implementation.
</t>
<t>An MPTCP connection begins similarly to a regular TCP connection.
This is
illustrated in <xref target="fig_scenario"/> where an MPTCP connection is establ
ished between
addresses A1 and B1 on Hosts A and B, respectively.</t>
<t>If extra paths are available, additional TCP sessions (termed MPT
CP "subflows")
are created on these paths, and are combined with the existing session, which co
ntinues
to appear as a single connection to the applications at both ends. The creation
of the
additional TCP session is illustrated between Address A2 on Host A and Address B
1 on
Host B.</t>
<t>MPTCP identifies multiple paths by the presence of multiple addre
sses
at hosts. Combinations of these multiple addresses equate to the additional path
s.
In the example, other potential paths that could be set up are A1&lt;-&gt;B2 and
A2&lt;-&gt;B2.
Although this additional session is shown as being initiated from A2, it could e
qually have
been initiated from B1 or B2.</t>
<t>The discovery and setup of additional subflows
will be achieved through a path management method; this document describes a mec
hanism
by which a host can initiate new subflows by using its own additional addresses,
or by
signaling its available addresses to the other host.</t>
<t>MPTCP adds connection-level sequence numbers to allow the reassem
bly of
segments arriving on multiple subflows with differing network delays. </t>
<t>Subflows are terminated as regular TCP connections, with a four-w
ay FIN
handshake. The MPTCP connection is terminated by a connection-level FIN.</t>
</list>
</t> </t>
<?rfc needLines='17'?> <figure anchor="fig_scenario">
<figure align="center" anchor="fig_scenario" title="Example MPTCP Usag <name>Example MPTCP Usage Scenario</name>
e Scenario"> <artwork align="left" name="" type="" alt=""><![CDATA[
<artwork align="left"><![CDATA[
Host A Host B Host A Host B
------------------------ ------------------------ ------------------------ ------------------------
Address A1 Address A2 Address B1 Address B2 Address A1 Address A2 Address B1 Address B2
---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------
| | | | | | | |
| (initial connection setup) | | | (initial connection setup) | |
|----------------------------------->| | |----------------------------------->| |
|<-----------------------------------| | |<-----------------------------------| |
| | | | | | | |
| (additional subflow setup) | | (additional subflow setup) |
| |--------------------->| | | |--------------------->| |
| |<---------------------| | | |<---------------------| |
| | | | | | | |
| | | | | | | | ]]></artwork>
]]></artwork> </figure>
</figure> <ul spacing="normal">
<li>To a non-MPTCP-aware application, MPTCP will behave the same as no
rmal TCP. Extended APIs could provide
additional control to MPTCP-aware applications <xref target="RFC6897" format="de
fault"/>.
An application begins by opening a TCP socket in the normal way.
MPTCP signaling and operation are handled by the MPTCP implementation.
</li>
<li>An MPTCP connection begins similarly to a regular TCP connection.
This is
illustrated in <xref target="fig_scenario" format="default"/>, where an MPTCP co
nnection is established between
addresses A1 and B1 on Hosts A and B, respectively.</li>
<li>If extra paths are available, additional TCP sessions (termed MPTC
P "subflows")
are created on these paths and are combined with the existing session, which con
tinues
to appear as a single connection to the applications at both ends. The creation
of the
additional TCP session is illustrated between Address A2 on Host A and Address B
1 on
Host B.</li>
<li>MPTCP identifies multiple paths by the presence of multiple addres
ses
at hosts. Combinations of these multiple addresses equate to the additional path
s.
In the example, other potential paths that could be set up are A1&lt;-&gt;B2 and
A2&lt;-&gt;B2.
Although this additional session is shown as being initiated from A2, it could e
qually have
been initiated from B1 or B2.</li>
<li>The discovery and setup of additional subflows
will be achieved through a path management method; this document describes a mec
hanism
by which a host can initiate new subflows by using its own additional addresses
or by
signaling its available addresses to the other host.</li>
<li>MPTCP adds connection-level sequence numbers to allow the reassemb
ly of
segments arriving on multiple subflows with differing network delays. </li>
<li>Subflows are terminated as regular TCP connections, with a four&#8
209;way FIN
handshake. The MPTCP connection is terminated by a connection-level FIN.</li>
</ul>
</section> </section>
<section numbered="true" toc="default">
<name>Requirements Language</name>
<section title="Requirements Language"> <t>
<t>The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL The key words "<bcp14>MUST</bcp14>", "<bcp14>MUST NOT</bcp14>",
NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "<bcp14>REQUIRED</bcp14>", "<bcp14>SHALL</bcp14>", "<bcp14>SHALL NOT</bcp14>
"MAY", and "OPTIONAL" in this document are to be interpreted as ",
described in BCP&nbsp;14 <xref target="RFC2119"/> <xref target="RFC8174" "<bcp14>SHOULD</bcp14>", "<bcp14>SHOULD NOT</bcp14>",
/> "<bcp14>RECOMMENDED</bcp14>", "<bcp14>NOT RECOMMENDED</bcp14>",
when, and only when, they appear in all capitals, as shown here.</t> "<bcp14>MAY</bcp14>", and "<bcp14>OPTIONAL</bcp14>" in this document are to
be
interpreted as described in BCP&nbsp;14 <xref target="RFC2119"/> <xref
target="RFC8174"/> when, and only when, they appear in all capitals, as
shown here.
</t>
</section> </section>
</section> </section>
<section anchor="sec_overview" numbered="true" toc="default">
<section title="Operation Overview" anchor="sec_overview"> <name>Operation Overview</name>
<t>This section presents a single description of common MPTCP operation, w <t>This section presents a single description of common MPTCP operation, w
ith reference to the protocol operation. This is a high-level overview of the ke ith reference to the protocol operation. This is a high-level overview of the ke
y functions; the full specification follows in <xref target="sec_protocol"/>. Ex y functions; the full specification follows in <xref target="sec_protocol" forma
tensibility and negotiated features are not discussed here. Considerable referen t="default"/>. Extensibility and negotiated features are not discussed here. Con
ce is made to symbolic names of MPTCP options throughout this section -- these a siderable reference is made to symbolic names of MPTCP options throughout this s
re subtypes of the IANA-assigned MPTCP option (see <xref target="IANA"/>), and t ection -- these are subtypes of the IANA&#8209;assigned MPTCP option (see <xref
heir formats are defined in the detailed protocol specification that follows in target="IANA" format="default"/>), and their formats are defined in the detailed
<xref target="sec_protocol"/>.</t> protocol specification provided in <xref target="sec_protocol" format="default"
/>.</t>
<t>A Multipath TCP connection provides a bidirectional bytestream between two ho <t>A Multipath TCP connection provides a bidirectional bytestream between
sts communicating like normal TCP and, thus, does not require any change to the two hosts communicating like normal TCP and thus does not require any change to
applications. However, Multipath TCP enables the hosts to use different paths wi the applications. However, Multipath TCP enables the hosts to use different path
th different IP addresses to exchange packets belonging to the MPTCP connection. s with different IP addresses to exchange packets belonging to the MPTCP connect
A Multipath TCP connection appears like a normal TCP connection to an applicati ion. A Multipath TCP connection appears like a normal TCP connection to an appli
on. However, to the network layer, each MPTCP subflow looks like a regular TCP f cation. However, to the network layer, each MPTCP subflow looks like a regular T
low whose segments carry a new TCP option type. Multipath TCP manages the creati CP flow whose segments carry a new TCP option type. Multipath TCP manages the cr
on, removal, and utilization of these subflows to send data. The number of subfl eation, removal, and utilization of these subflows to send data. The number of s
ows that are managed within a Multipath TCP connection is not fixed and it can f ubflows that are managed within a Multipath TCP connection is not fixed, and it
luctuate during the lifetime of the Multipath TCP connection.</t> can fluctuate during the lifetime of the Multipath TCP connection.</t>
<t>All MPTCP operations are signaled with a TCP option -- a single numeric
<t>All MPTCP operations are signaled with a TCP option -- a single numerical typ al type for MPTCP, with "subtypes" for each MPTCP message. What follows is a sum
e for MPTCP, with "sub-types" for each MPTCP message. What follows is a summary mary of the purpose and rationale of these messages.</t>
of the purpose and rationale of these messages.</t> <section numbered="true" toc="default">
<section title="Initiating an MPTCP Connection"> <name>Initiating an MPTCP Connection</name>
<t>This is the same signaling as for initiating a normal TCP connection, but the <t>This is the same signaling as for initiating a normal TCP connection,
SYN, SYN/ACK, and initial ACK (and data) packets also carry the MP_CAPABLE opti but the SYN, SYN/ACK, and initial ACK (and data) packets also carry the MP_CAPA
on. This option has a variable length and serves multiple purposes. Firstly, it BLE option. This option has a variable length and serves multiple purposes. Firs
verifies whether the remote host supports Multipath TCP; secondly, this option a tly, it verifies whether the remote host supports Multipath TCP; secondly, this
llows the hosts to exchange some information to authenticate the establishment o option allows the hosts to exchange some information to authenticate the establi
f additional subflows. Further details are given in <xref target="sec_init"/>.</ shment of additional subflows. Further details are given in <xref target="sec_in
t> it" format="default"/>.</t>
<artwork align="left" name="" type="" alt=""><![CDATA[
<figure><artwork align="left"><![CDATA[
Host A Host B Host A Host B
------ ------ ------ ------
MP_CAPABLE -> MP_CAPABLE ->
[flags] [flags]
<- MP_CAPABLE <- MP_CAPABLE
[B's key, flags] [B's key, flags]
ACK + MP_CAPABLE (+ data) -> ACK + MP_CAPABLE (+ data) ->
[A's key, B's key, flags, (data-level details)] [A's key, B's key, flags, (data-level details)] ]]></artwork>
]]></artwork></figure> <t>Retransmission of the ACK + MP_CAPABLE can occur if it is not known i
f it has been received. The following diagrams show all possible exchanges for t
<t>Retransmission of the ACK + MP_CAPABLE can occur if it is not known if it has he initial subflow setup to ensure this reliability.</t>
been received. The following diagrams show all possible exchanges for the initi <artwork align="left" name="" type="" alt=""><![CDATA[
al subflow setup to ensure this reliability.</t>
<figure><artwork align="left"><![CDATA[
Host A (with data to send immediately) Host B Host A (with data to send immediately) Host B
------ ------ ------ ------
MP_CAPABLE -> MP_CAPABLE ->
[flags] [flags]
<- MP_CAPABLE <- MP_CAPABLE
[B's key, flags] [B's key, flags]
ACK + MP_CAPABLE + data -> ACK + MP_CAPABLE + data ->
[A's key, B's key, flags, data-level details] [A's key, B's key, flags, data-level details]
Host A (with data to send later) Host B Host A (with data to send later) Host B
skipping to change at line 316 skipping to change at line 304
Host A Host B (sending first) Host A Host B (sending first)
------ ------ ------ ------
MP_CAPABLE -> MP_CAPABLE ->
[flags] [flags]
<- MP_CAPABLE <- MP_CAPABLE
[B's key, flags] [B's key, flags]
ACK + MP_CAPABLE -> ACK + MP_CAPABLE ->
[A's key, B's key, flags] [A's key, B's key, flags]
<- ACK + DSS + data <- ACK + DSS + data
[data-level details] [data-level details] ]]></artwork>
]]></artwork></figure> </section>
</section> <section numbered="true" toc="default">
<name>Associating a New Subflow with an Existing MPTCP Connection</name>
<section title="Associating a New Subflow with an Existing MPTCP Connection"> <t>The exchange of keys in the MP_CAPABLE handshake provides material th
<t>The exchange of keys in the MP_CAPABLE handshake provides material that can b at can be used to authenticate the endpoints when new subflows will be set up.
e used to authenticate the endpoints when new subflows will be set up.
Additional subflows begin in the same way as initiating a normal TCP connection, but the SYN, SYN/ACK, and ACK packets also carry the MP_JOIN option. </t> Additional subflows begin in the same way as initiating a normal TCP connection, but the SYN, SYN/ACK, and ACK packets also carry the MP_JOIN option. </t>
<t>Host A initiates a new subflow between one of its addresses and one
<t>Host A initiates a new subflow between one of its addresses and one of Host B of Host B's addresses. The token -- generated from the key -- is used
's addresses. The token -- generated from the key -- is used to identify which M to identify which MPTCP connection it is joining, and the Hash&#8209;bas
PTCP connection it is joining, and the HMAC is used for authentication. The Hash ed
-based Message Authentication Code (HMAC) uses the keys exchanged in the MP_CAPA Message Authentication Code (HMAC) is used for authentication. The HMAC
BLE handshake, and the random numbers (nonces) exchanged in these MP_JOIN option uses the keys exchanged in the MP_CAPABLE handshake and the random numbers (nonc
s. MP_JOIN also contains flags and an Address ID that can be used to refer to th es) exchanged in these MP_JOIN options. MP_JOIN also contains flags and an Addre
e source address without the sender needing to know if it has been changed by a ss ID that can be used to refer to the source address without the sender needing
NAT. Further details are in <xref target="sec_join"/>.</t> to know if it has been changed by a NAT. Further details are given in <xref tar
get="sec_join" format="default"/>.</t>
<figure><artwork align="left"><![CDATA[ <artwork align="left" name="" type="" alt=""><![CDATA[
Host A Host B Host A Host B
------ ------ ------ ------
MP_JOIN -> MP_JOIN ->
[B's token, A's nonce, [B's token, A's nonce,
A's Address ID, flags] A's Address ID, flags]
<- MP_JOIN <- MP_JOIN
[B's HMAC, B's nonce, [B's HMAC, B's nonce,
B's Address ID, flags] B's Address ID, flags]
ACK + MP_JOIN -> ACK + MP_JOIN ->
[A's HMAC] [A's HMAC]
<- ACK <- ACK ]]></artwork>
]]></artwork></figure> </section>
</section> <section numbered="true" toc="default">
<name>Informing the Other Host about Another Potential Address</name>
<section title="Informing the Other Host about Another Potential Address"> <t>The set of IP addresses associated with a multihomed host may change du
<t>The set of IP addresses associated with a multihomed host may change during the ring the lifetime of an MPTCP connection. MPTCP supports the addition and remova
lifetime of an MPTCP connection. MPTCP supports the addition and removal of add l of addresses on a host both implicitly and explicitly. If Host A has establish
resses on a host both implicitly and explicitly. If Host A has established a sub ed a subflow starting at address&wj;/port pair IP#-A1 and wants to open a second
flow starting at address/port pair IP#-A1 and wants to open a second subflow sta subflow starting at address&wj;/port pair IP#-A2, it simply initiates the estab
rting at address/port pair IP#-A2, it simply initiates the establishment of the lishment of the subflow as explained above. The remote host will then be implici
subflow as explained above. The remote host will then be implicitly informed abo tly informed about the new address.</t>
ut the new address.</t> <t>In some circumstances, a host may want to advertise to the remote
host the availability of an address without establishing a new subflow
<t>In some circumstances, a host may want to advertise to the remote host the av -- for example, when a NAT prevents setup in one direction. In&nbsp;the exampl
ailability of an address without establishing a new subflow, for example, when a e below, Host A informs Host B about its alternative IP&nbsp;address&wj;/port pa
NAT prevents setup in one direction. In the example below, Host A informs Host ir (IP#-A2). Host B may later send an MP_JOIN to this new address. The ADD_ADDR
B about its alternative IP address/port pair (IP#-A2). Host B may later send an option contains an HMAC to authenticate the address as having been sent from the
MP_JOIN to this new address. The ADD_ADDR option contains a HMAC to authenticat originator of the connection. The receiver of this option echoes it back to the
e the address as having been sent from the originator of the connection. The rec client to indicate successful receipt. Further details are given in <xref targe
eiver of this option echoes it back to the client to indicate successful receipt t="sec_add_address" format="default"/>.</t>
. Further details are in <xref target="sec_add_address"/>.</t> <artwork align="left" name="" type="" alt=""><![CDATA[
<figure><artwork align="left"><![CDATA[
Host A Host B Host A Host B
------ ------ ------ ------
ADD_ADDR -> ADD_ADDR ->
[Echo-flag=0, [Echo-flag=0,
IP#-A2, IP#-A2,
IP#-A2's Address ID, IP#-A2's Address ID,
HMAC of IP#-A2] HMAC of IP#-A2]
<- ADD_ADDR <- ADD_ADDR
[Echo-flag=1, [Echo-flag=1,
IP#-A2, IP#-A2,
IP#-A2's Address ID, IP#-A2's Address ID,
HMAC of IP#-A2] HMAC of IP#-A2] ]]></artwork>
]]></artwork></figure> <t>There is a corresponding signal for address removal, making use of
the Address ID that is signaled in the ADD_ADDR handshake.
<t>There is a corresponding signal for address removal, making use of the Addres
s ID that is signaled in the add address handshake. Further details in <xref tar
get="sec_remove_addr"/>.</t>
<figure><artwork align="left"><![CDATA[ Further details are given in <xref target="sec_remove_addr" format="default"/>.
</t>
<artwork align="left" name="" type="" alt=""><![CDATA[
Host A Host B Host A Host B
------ ------ ------ ------
REMOVE_ADDR -> REMOVE_ADDR ->
[IP#-A2's Address ID] [IP#-A2's Address ID] ]]></artwork>
]]></artwork></figure> </section>
</section> <section numbered="true" toc="default">
<name>Data Transfer Using MPTCP</name>
<section title="Data Transfer Using MPTCP"> <t>To ensure reliable, in-order delivery of data over subflows that may
<t>To ensure reliable, in-order delivery of data over subflows that may appear a appear and disappear at any time, MPTCP uses a 64-bit Data Sequence Number (DSN)
nd disappear at any time, MPTCP uses a 64-bit data sequence number (DSN) to numb to number all data sent over the MPTCP connection. Each subflow has its own 32-
er all data sent over the MPTCP connection. Each subflow has its own 32-bit sequ bit sequence number space, utilizing the regular TCP sequence number header, and
ence number space, utilising the regular TCP sequence number header, and an MPTC an MPTCP option maps the subflow sequence space to the data sequence space. In
P option maps the subflow sequence space to the data sequence space. In this way this way, data can be retransmitted on different subflows (mapped to the same DS
, data can be retransmitted on different subflows (mapped to the same DSN) in th N) in the event of failure.</t>
e event of failure.</t> <t>The Data Sequence Signal (DSS) carries the Data Sequence Mapping. The
Data Sequence Mapping consists of the subflow sequence number, data sequence nu
<t>The Data Sequence Signal (DSS) carries the Data Sequence Mapping. The Data Se mber, and length for which this mapping is valid. This option can also carry a c
quence Mapping consists of the subflow sequence number, data sequence number, an onnection-level acknowledgment (the "Data ACK") for the received DSN.</t>
d length for which this mapping is valid. This option can also carry a connectio <t>With MPTCP, all subflows share the same receive buffer and advertise
n-level acknowledgment (the "Data ACK") for the received DSN.</t> the same receive window. There are two levels of acknowledgment in MPTCP. Regula
r TCP acknowledgments are used on each subflow to acknowledge the reception of t
<t>With MPTCP, all subflows share the same receive buffer and advertise the same he segments sent over the subflow independently of their DSN. In addition, there
receive window. There are two levels of acknowledgment in MPTCP. Regular TCP ac are connection-level acknowledgments for the data sequence space. These acknowl
knowledgments are used on each subflow to acknowledge the reception of the segme edgments track the advancement of the bytestream and slide the receive window.</
nts sent over the subflow independently of their DSN. In addition, there are con t>
nection-level acknowledgments for the data sequence space. These acknowledgments <t>Further details are given in <xref target="sec_generalop" format="def
track the advancement of the bytestream and slide the receiving window.</t> ault"/>.</t>
<artwork align="left" name="" type="" alt=""><![CDATA[
<t>Further details are in <xref target="sec_generalop"/>.</t>
<figure><artwork align="left"><![CDATA[
Host A Host B Host A Host B
------ ------ ------ ------
DSS -> DSS ->
[Data Sequence Mapping] [Data Sequence Mapping]
[Data ACK] [Data ACK]
[Checksum] [Checksum] ]]></artwork>
]]></artwork></figure> </section>
</section> <section numbered="true" toc="default">
<name>Requesting a Change in a Path's Priority</name>
<section title="Requesting a Change in a Path's Priority"> <t>Hosts can indicate at initial subflow setup whether they wish the sub
<t>Hosts can indicate at initial subflow setup whether they wish the subflow to flow to be used as a regular or backup path -- a backup path only being used if
be used as a regular or backup path -- a backup path only being used if there ar there are no regular paths available. During a connection, Host A can request a
e no regular paths available. During a connection, Host A can request a change i change in the priority of a subflow through the MP_PRIO signal to Host B. Furthe
n the priority of a subflow through the MP_PRIO signal to Host B. Further detail r details are given in <xref target="sec_policy" format="default"/>.</t>
s are in <xref target="sec_policy"/>.</t> <artwork align="left" name="" type="" alt=""><![CDATA[
<figure><artwork align="left"><![CDATA[
Host A Host B Host A Host B
------ ------ ------ ------
MP_PRIO -> MP_PRIO -> ]]></artwork>
]]></artwork></figure> </section>
</section> <section numbered="true" toc="default">
<name>Closing an MPTCP Connection</name>
<section title="Closing an MPTCP Connection"> <t>When a host wants to close an existing subflow but not the whole conn
<t>When a host wants to close an existing subflow, but not the whole connection, ection, it can initiate a regular TCP FIN/ACK exchange.</t>
it can initiate a regular TCP FIN/ACK exchange.</t> <t>When Host A wants to inform Host B that it has no more data to send,
it signals this "Data FIN" as part of the DSS (see above). It has the same seman
<t>When Host A wants to inform Host B that it has no more data to send, it signa tics and behavior as a regular TCP FIN, but at the connection level. Once all th
ls this "Data FIN" as part of the Data Sequence Signal (see above). It has the s e data on the MPTCP connection has been successfully received, this message is a
ame semantics and behavior as a regular TCP FIN, but at the connection level. On cknowledged at the connection level with a Data ACK. Further details are given i
ce all the data on the MPTCP connection has been successfully received, then thi n <xref target="sec_close" format="default"/>.</t>
s message is acknowledged at the connection level with a Data ACK. Further detai <artwork align="left" name="" type="" alt=""><![CDATA[
ls are in <xref target="sec_close"/>.</t>
<figure><artwork align="left"><![CDATA[
Host A Host B Host A Host B
------ ------ ------ ------
DSS -> DSS ->
[Data FIN] [Data FIN]
<- DSS <- DSS
[Data ACK] [Data ACK] ]]></artwork>
]]></artwork></figure> <t>There is an additional method of connection closure, referred to as
"Fast Close", which is analogous to closing a single-path TCP
<t>There is an additional method of connection closure, referred to as "Fast Clo connection with a RST signal. The MP_FASTCLOSE signal is used to
se", which is analogous to closing a single-path TCP connection with a RST signa indicate to the peer that the connection will be abruptly closed and
l. The MP_FASTCLOSE signal is used to indicate to the peer that the connection w no data will be accepted anymore. This can be used on an ACK (which
ill be abruptly closed and no data will be accepted anymore. This can be used on ensures reliability of the signal) or a RST (which does not).
an ACK (ensuring reliability of the signal), or a RST (which is not). Both exam Both examples are shown in the following diagrams. Further details are given in
ples are shown in the following diagrams. Further details are in <xref target="s <xref target="sec_fastclose" format="default"/>.</t>
ec_fastclose"/>.</t> <artwork align="left" name="" type="" alt=""><![CDATA[
<figure><artwork align="left"><![CDATA[
Host A Host B Host A Host B
------ ------ ------ ------
ACK + MP_FASTCLOSE -> ACK + MP_FASTCLOSE ->
[B's key] [B's key]
[RST on all other subflows] -> [RST on all other subflows] ->
<- [RST on all subflows] <- [RST on all subflows]
Host A Host B Host A Host B
------ ------ ------ ------
RST + MP_FASTCLOSE -> RST + MP_FASTCLOSE ->
[B's key] [on all subflows] [B's key] [on all subflows]
<- [RST on all subflows] <- [RST on all subflows] ]]></artwork>
]]></artwork></figure> </section>
</section> <section numbered="true" toc="default">
<name>Notable Features</name>
<section title="Notable Features"> <t>It is worth highlighting that MPTCP's signaling has been designed wit
<t>It is worth highlighting that MPTCP's signaling has been designed with severa h several key requirements in mind:
l key requirements in mind:
<list style="symbols"> </t>
<t>To cope with NATs on the path, addresses are referred to by Address IDs, in c <ul spacing="normal">
ase the IP packet's source <li>To cope with NATs on the path, addresses are referred to by Addres
s IDs, in case the IP packet's source
address gets changed by a NAT. Setting up a new TCP flow is not possible if the receiver of the SYN is behind a NAT; address gets changed by a NAT. Setting up a new TCP flow is not possible if the receiver of the SYN is behind a NAT;
to allow subflows to be created when either end is behind a NAT, MPTCP uses the to allow subflows to be created when either end is behind a NAT, MPTCP uses the
ADD_ADDR message. </t> ADD_ADDR message. </li>
<li>MPTCP falls back to ordinary TCP if MPTCP operation is not
<t>MPTCP falls back to ordinary TCP if MPTCP operation is not possible, for exam possible -- for example, if one host is not MPTCP capable or if a middlebox alt
ple, if one host is not MPTCP capable or if a middlebox alters the payload. This ers the payload. This is discussed in <xref target="sec_fallback" format="defaul
is discussed in <xref target="sec_fallback"/>.</t> t"/>.</li>
<li>To address the threats identified in <xref target="RFC6181"
<t>To address the threats identified in <xref target="RFC6181"/>, the following format="default"/>, the following steps are taken: keys are sent in
steps are taken: keys are sent in the clear in the MP_CAPABLE messages; MP_JOIN the clear in the MP_CAPABLE messages; MP_JOIN messages are secured
messages are secured with HMAC-SHA256 (<xref target="RFC2104"/>, <xref target="R with HMAC-SHA256 (<xref target="RFC2104" format="default"/> using
FC6234"/>) using those keys; and standard TCP validity checks are made on the ot the algorithm in <xref target="RFC6234" format="default"/>) using thos
her messages (ensuring sequence numbers are in-window <xref target="RFC5961"/>). e keys; and standard
Residual threats to MPTCP v0 were identified in <xref target="RFC7430"/>, and t TCP validity checks are made on the other messages (ensuring that
hose affecting the protocol (i.e. modification to ADD_ADDR) have been incorporat sequence numbers are in&#8209;window <xref target="RFC5961"
ed in this document. Further discussion of security can be found in <xref target format="default"/>).
="sec_security"/>.</t> Residual threats to MPTCP v0 were identified in <xref target="RFC7430"
</list></t> format="default"/>, and those affecting the protocol (i.e., modifications to
</section> ADD_ADDR) have been incorporated in this document.
Further discussion of security can be found in <xref target="sec_security" form
at="default"/>.</li>
</ul>
</section>
</section> </section>
<section anchor="sec_protocol" numbered="true" toc="default">
<section title="MPTCP Protocol" anchor="sec_protocol"> <name>MPTCP Operations: An Overview</name>
<t>This section describes the operation of the MPTCP protocol, and is subd <t>This section describes the operation of MPTCP. The
ivided into sections for each key part of the protocol operation.</t> subsections below discuss each key part of the protocol operation.</t>
<t>All MPTCP operations are signaled using optional TCP header fields. A s <t>All MPTCP operations are signaled using optional TCP header fields. A s
ingle TCP option number ("Kind") has been assigned by IANA for MPTCP (see <xref ingle TCP option number ("Kind") has been assigned by IANA for MPTCP (see <xref
target="IANA"/>), and then individual messages will be determined by a "subtype" target="IANA" format="default"/>), and then individual messages will be determin
, the values of which are also stored in an IANA registry (and are also listed i ed by a "subtype", the values of which are also stored in an IANA registry (and
n <xref target="IANA"/>). As with all TCP options, the Length field is specified are also listed in <xref target="IANA" format="default"/>). As with all TCP opti
in bytes, and includes the 2 bytes of Kind and Length.</t> ons, the Length field is specified in bytes and includes the 2&nbsp;bytes of Kin
<t>Throughout this document, when reference is made to an MPTCP option by d and Length.</t>
symbolic name, such as "MP_CAPABLE", this refers to a TCP option with the single <t>Throughout this document, when reference is made to an MPTCP option by
MPTCP option type, and with the subtype value of the symbolic name as defined i symbolic name, such as "MP_CAPABLE", this refers to a TCP option with the single
n <xref target="IANA"/>. This subtype is a 4-bit field -- the first 4 bits of th MPTCP option type, and with the subtype value of the symbolic name as defined i
e option payload, as shown in <xref target="fig_option"/>. The MPTCP messages ar n <xref target="IANA" format="default"/>. This subtype is a 4-bit field -- the f
e defined in the following sections.</t> irst 4 bits of the option payload, as shown in <xref target="fig_option" format=
"default"/>. The MPTCP messages are defined in the following sections.</t>
<?rfc needLines='8'?> <figure anchor="fig_option">
<figure align="center" anchor="fig_option" title="MPTCP Option Format"> <name>MPTCP Option Format</name>
<artwork align="left"><![CDATA[ <artwork align="left" name="" type="" alt=""><![CDATA[
1 2 3 1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+---------------+---------------+-------+-----------------------+ +---------------+---------------+-------+-----------------------+
| Kind | Length |Subtype| | | Kind | Length |Subtype| |
+---------------+---------------+-------+ | +---------------+---------------+-------+ |
| Subtype-specific data | | Subtype-specific data |
| (variable length) | | (variable length) |
+---------------------------------------------------------------+ +---------------------------------------------------------------+ ]]></artwork
]]></artwork> >
</figure> </figure>
<t>Those MPTCP options associated with subflow initiation are used on
<t>Those MPTCP options associated with subflow initiation are used on pack packets with the SYN flag set. Additionally, there is one MPTCP option
ets with the SYN flag set. Additionally, there is one MPTCP option for signaling for signaling metadata to ensure that segmented data can be recombined for
metadata to ensure segmented data can be recombined for delivery to the applica delivery to the application.</t>
tion.</t> <t>The remaining options, however, are signals that do not need to be on
<t>The remaining options, however, are signals that do not need to be on a a specific packet, such as those for signaling additional
specific packet, such as those for signaling additional addresses. Whilst an im addresses. While an implementation may desire to send MPTCP options as
plementation may desire to send MPTCP options as soon as possible, it may not be soon as possible, it may not be possible to combine all desired options
possible to combine all desired options (both those for MPTCP and for regular T (both those for MPTCP and for regular TCP, such as SACK (selective
CP, such as SACK (selective acknowledgment) <xref target="RFC2018"/>) on a singl acknowledgment) <xref target="RFC2018" format="default"/>) on a single
e packet. Therefore, an implementation may choose to send duplicate ACKs contain packet. Therefore, an implementation may choose to send duplicate ACKs
ing the additional signaling information. This changes the semantics of a duplic containing the additional signaling information. This changes the
ate ACK; these are usually only sent as a signal of a lost segment <xref target= semantics of a duplicate ACK; these are usually only sent as a signal of
"RFC5681"/> in regular TCP. Therefore, an MPTCP implementation receiving a dupli a lost segment <xref target="RFC5681" format="default"/> in regular
cate ACK that contains an MPTCP option MUST NOT treat it as a signal of congesti TCP. Therefore, an MPTCP implementation receiving a duplicate ACK that
on. Additionally, an MPTCP implementation SHOULD NOT send more than two duplicat contains an MPTCP option <bcp14>MUST NOT</bcp14> treat it as a signal of
e ACKs in a row for the purposes of sending MPTCP options alone, in order to ens congestion. Additionally, an MPTCP implementation <bcp14>SHOULD
ure no middleboxes misinterpret this as a sign of congestion.</t> NOT</bcp14> send more than two duplicate ACKs in a row for the purposes
<t>Furthermore, standard TCP validity checks (such as ensuring the sequenc of sending MPTCP options alone, in order to ensure that no middleboxes mis
e number and acknowledgment number are within window) MUST be undertaken before interpret this as a sign of congestion.</t>
processing any MPTCP signals, as described in <xref target="RFC5961"/>, and init <t>Furthermore, standard TCP validity checks (such as ensuring that the
ial subflow sequence numbers SHOULD be generated according to the recommendation sequence number and acknowledgment number are within the window) <bcp14>MU
s in <xref target="RFC6528"/>.</t> ST</bcp14> be undertaken before processing any MPTCP signals, as described in <x
ref target="RFC5961" format="default"/>, and initial subflow sequence numbers <b
<section title="Connection Initiation" anchor="sec_init"> cp14>SHOULD</bcp14> be generated according to the recommendations in <xref targe
t="RFC6528" format="default"/>.</t>
<section anchor="sec_init" numbered="true" toc="default">
<name>Connection Initiation</name>
<t>Connection initiation begins with a SYN, SYN/ACK, ACK exchange <t>Connection initiation begins with a SYN, SYN/ACK, ACK exchange
on a single path. Each packet on a single path. Each packet
contains the Multipath Capable (MP_CAPABLE) MPTCP option contains the Multipath Capable (MP_CAPABLE) MPTCP option
(<xref target="tcpm_capable"/>). This option declares its (<xref target="tcpm_capable" format="default"/>). This option declares i
sender is capable of performing Multipath TCP and wishes to do ts
sender capable of performing Multipath TCP and wishes to do
so on this particular connection.</t> so on this particular connection.</t>
<figure anchor="tcpm_capable">
<t>The MP_CAPABLE exchange in this specification (v1) is different to <name>Multipath Capable (MP_CAPABLE) Option</name>
<artwork align="left" name="" type="" alt=""><![CDATA[
1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+---------------+---------------+-------+-------+---------------+
| Kind | Length |Subtype|Version|A|B|C|D|E|F|G|H|
+---------------+---------------+-------+-------+---------------+
| Option Sender's Key (64 bits) |
| (if option Length > 4) |
| |
+---------------------------------------------------------------+
| Option Receiver's Key (64 bits) |
| (if option Length > 12) |
| |
+-------------------------------+-------------------------------+
| Data-Level Length (16 bits) | Checksum (16 bits, optional) |
+-------------------------------+-------------------------------+ ]]></artwork
>
</figure>
<t>The MP_CAPABLE exchange in this specification (v1) is different than
that specified in v0. If a host supports multiple versions that specified in v0. If a host supports multiple versions
of MPTCP, the sender of the MP_CAPABLE option SHOULD signal the of MPTCP, the sender of the MP_CAPABLE option <bcp14>SHOULD</bcp14> sign al the
highest version number it supports. In return, in its MP_CAPABLE option , highest version number it supports. In return, in its MP_CAPABLE option ,
the receiver will signal the version number it wishes to use, which MUST the receiver will signal the version number it wishes to use, which <bcp 14>MUST</bcp14>
be equal to or lower than the version number indicated in the initial be equal to or lower than the version number indicated in the initial
MP_CAPABLE. MP_CAPABLE.
There is a caveat though with respect to this version negotiation with There is a caveat, though, with respect to this version negotiation with
old listeners that only support v0. A listener that supports v0 expects that old listeners that only support v0. A listener that supports v0 expects that
the MP_CAPABLE option in the SYN-segment includes the initiator's key. I the MP_CAPABLE option in the SYN segment will include the initiator's
f key. If, however,
the initiator however has already upgraded to v1, it won't include the key i the initiator has already upgraded to v1, it won't include the key in the
n the SYN segment. Thus, the listener will ignore the MP_CAPABLE of this SYN s
SYN-segment. Thus, the listener will ignore the MP_CAPABLE of this SYN-s egment
egment and reply with a SYN/ACK that does not include an MP_CAPABLE. The initia
and reply with a SYN/ACK that does not include an MP_CAPABLE. The initia tor <bcp14>MAY</bcp14>
tor MAY choose to immediately fall back to TCP or <bcp14>MAY</bcp14> choose to a
choose to immediately fall back to TCP or MAY choose to attempt a connec ttempt a connection
tion
using MPTCP v0 (if the initiator supports v0), in order to discover whether the using MPTCP v0 (if the initiator supports v0), in order to discover whether the
listener supports the earlier version of MPTCP. In general a MPTCP v0 co listener supports the earlier version of MPTCP. In general, an MPTCP v0
nnection connection
is likely to be preferred to a TCP one, however in a particular deployme will likely be preferred over a TCP connection; however, in a particular
nt scenario deployment scenario,
it may be known that the listener is unlikely to support MPTCPv0 and so it may be known that the listener is unlikely to support MPTCP v0 and so
the the
initiator may prefer not to attempt a v0 connection. An initiator MAY ca initiator may prefer not to attempt a v0 connection. An initiator <bcp14
che >MAY</bcp14> cache
information for a peer about what version of MPTCP it supports if any, a information for a peer about what version of MPTCP it supports, if any,
nd use and use
this information for future connection attempts.</t> this information for future connection attempts.</t>
<t>The MP_CAPABLE option is of variable length, with different fields
<t>The MP_CAPABLE option is variable-length, with different fields included, depending on which packet the option is used on. The full
included depending on which packet the option is used on. The full MP_CAPABLE option is shown in <xref target="tcpm_capable" format="defaul
MP_CAPABLE option is shown in <xref target="tcpm_capable"/>.</t> t"/>.</t>
<t>The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK packets
<?rfc needLines='10'?> that start the first subflow of an MPTCP connection, as well as the first packe
<figure align="center" anchor="tcpm_capable" title="Multipath Capable (M t that carries data, if the initiator wishes to send first. The data carried by
P_CAPABLE) Option"> each option is as follows, where A&nbsp;=&nbsp;initiator and B = listener.
<artwork align="left"><![CDATA[ </t>
1 2 3 <ul spacing="normal">
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 <li>SYN (A-&gt;B): only the first 4 octets (Length = 4).</li>
+---------------+---------------+-------+-------+---------------+ <li>SYN/ACK (B-&gt;A): B's key for this connection (Length = 12).</li>
| Kind | Length |Subtype|Version|A|B|C|D|E|F|G|H| <li>ACK (no data) (A-&gt;B): A's key followed by B's key (Length = 20)
+---------------+---------------+-------+-------+---------------+ .</li>
| Option Sender's Key (64 bits) | <li>ACK (with first data) (A-&gt;B): A's key followed by B's key follo
| (if option Length > 4) | wed by Data-Level Length, and optional Checksum (Length = 22 or 24).</li>
| | </ul>
+---------------------------------------------------------------+ <t>
| Option Receiver's Key (64 bits) | The contents of the option are determined by the SYN and ACK flags of th
| (if option Length > 12) | e packet, along with the option's Length field. In <xref target="tcpm_capable" f
| | ormat="default"/>, "Sender" and "Receiver" refer to the sender or receiver of th
+-------------------------------+-------------------------------+ e TCP packet (which can be either host).</t>
| Data-Level Length (16 bits) | Checksum (16 bits, optional) |
+-------------------------------+-------------------------------+
]]></artwork>
</figure>
<t>The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK packets
that start the first subflow of an MPTCP connection, as well as the first packe
t that carries data, if the initiator wishes to send first. The data carried by
each option is as follows, where A = initiator and B = listener.
<list style="symbols">
<t>SYN (A-&gt;B): only the first four octets (Length = 4).</t>
<t>SYN/ACK (B-&gt;A): B's Key for this connection (Length = 12).</t>
<t>ACK (no data) (A-&gt;B): A's Key followed by B's Key (Length = 20
).</t>
<t>ACK (with first data) (A-&gt;B): A's Key followed by B's Key foll
owed by Data-Level Length, and optional Checksum (Length = 22 or 24).</t>
</list>
The contents of the option is determined by the SYN and ACK flags of the
packet, along with the option's length field. For the diagram shown in <xref ta
rget="tcpm_capable"/>, "sender" and "receiver" refer to the sender or receiver o
f the TCP packet (which can be either host).</t>
<t>The initial SYN, containing just the MP_CAPABLE header, is used <t>The initial SYN, containing just the MP_CAPABLE header, is used
to define the version of MPTCP being requested, as well as exchanging to define the version of MPTCP being requested and also to exchange
flags to negotiate connection features, described later.</t> flags to negotiate connection features, as described later.</t>
<t>This option is used to declare the 64-bit keys that the end hosts
<t>This option is used to declare the 64-bit keys that the end hosts hav have generated for this MPTCP connection. These keys are used to
e generated for this MPTCP connection. These keys are used to authenticate the a authenticate the addition of future subflows to this connection. This
ddition of future subflows to this connection. This is the only time the key wil is the only time the key will be sent in the clear on the wire (unless "
l be sent in clear on the wire (unless "fast close", <xref target="sec_fastclose Fast Close" (<xref target="sec_fastclose" format="default"/>) is used); all futu
"/>, is used); all future subflows will identify the connection using a 32-bit " re subflows will identify the connection using a 32-bit "token". This token is a
token". This token is a cryptographic hash of this key. The algorithm for this p cryptographic hash of this key. The algorithm for this process is dependent on
rocess is dependent on the authentication algorithm selected; the method of sele the authentication algorithm selected; the method of selection is defined later
ction is defined later in this section.</t> in this section.</t>
<t>Upon reception of the initial SYN segment, a stateful server generate
<t>Upon reception of the initial SYN-segment, a stateful server generate s a random key and replies with a SYN/ACK. The key's method of generation is imp
s a random key and replies with a SYN/ACK. The key's method of generation is imp lementation specific. The key <bcp14>MUST</bcp14> be hard to guess, and it <bcp1
lementation specific. The key MUST be hard to guess, and it MUST be unique for t 4>MUST</bcp14> be unique for the sending host across all its current MPTCP conne
he sending host across all its current MPTCP connections. Recommendations for ge ctions. Recommendations for generating random numbers for use in keys are given
nerating random numbers for use in keys are given in <xref target="RFC4086"/>. C in <xref target="RFC4086" format="default"/>. Connections will be indexed at eac
onnections will be indexed at each host by the token (a one-way hash of the key) h host by the token (a one-way hash of the key). Therefore, an implementation wi
. Therefore, an implementation will require a mapping from each token to the cor ll require a mapping from each token to the corresponding connection, and in tur
responding connection, and in turn to the keys for the connection.</t> n to the keys for the connection.</t>
<t>There is a risk that two different keys will hash to the same
<t>There is a risk that two different keys will hash to the same token. token. The risk of hash collisions is usually small, unless the host
The risk of hash collisions is usually small, unless the host is handling many t is handling many tens of thousands of connections. Therefore, an
ens of thousands of connections. Therefore, an implementation SHOULD check its l implementation <bcp14>SHOULD</bcp14> check its list of connection
ist of connection tokens to ensure there is no collision before sending its key, tokens to ensure that there is no collision before sending its key,
and if there is, then it should generate a new key. This would, however, be cos and if there is, then it should generate a new key. This would,
tly for a server with thousands of connections. The subflow handshake mechanism however, be costly for a server with thousands of connections. The
(<xref target="sec_join"/>) will ensure that new subflows only join the correct subflow handshake mechanism (<xref target="sec_join"
connection, however, through the cryptographic handshake, as well as checking th format="default"/>) will ensure that new subflows only join the
e connection tokens in both directions, and ensuring sequence numbers are in-win correct connection, however, through the cryptographic handshake, as
dow. So in the worst case if there was a token collision, the new subflow would well as checking the connection tokens in both directions, and
not succeed, but the MPTCP connection would continue to provide a regular TCP se ensuring that sequence numbers are in-window. So, in the worst case, if
rvice.</t> there was a token collision, the new subflow would not succeed, but the MPTCP co
nnection would continue to provide a regular TCP service.</t>
<t>Since key generation is implementation-specific, there is no r <t>Since key generation is implementation specific, there is no
equirement that they be simply random numbers. An implementation is free to exch requirement that they simply be random numbers. An implementation is
ange cryptographic material out-of-band and generate these keys from this, in or free to exchange cryptographic material out of band and generate these
der to provide additional mechanisms by which to verify the identity of the comm keys from this material, in order to provide additional mechanisms by wh
unicating entities. For example, an implementation could choose to link its MPTC ich to verify the identity of the communicating entities. For example, an implem
P keys to those used in higher-layer TLS or SSH connections.</t> entation could choose to link its MPTCP keys to those used in higher-layer TLS o
r SSH connections.</t>
<t>If the server behaves in a <t>If the server behaves in a
stateless manner, it has to generate its own key in a verifiable stateless manner, it has to generate its own key in a verifiable
fashion. This verifiable way of generating the key can be done by fashion. This verifiable way of generating the key can be done by
using a hash of the 4-tuple, sequence number and a local secret using a hash of the 4-tuple, sequence number, and a local secret
(similar to what is done for the TCP-sequence number <xref target="RFC49 (similar to what is done for the TCP sequence number <xref target="RFC49
87"/>). 87" format="default"/>).
It will thus be able to verify whether it is indeed the originator of It will thus be able to verify whether it is indeed the originator of
the key echoed back in the later MP_CAPABLE option. the key echoed back in the subsequent MP_CAPABLE option.
As for a stateful server, the tokens SHOULD be checked for uniqueness, h As for a stateful server, the tokens <bcp14>SHOULD</bcp14> be checked fo
owever r uniqueness; however,
if uniqueness is not met, and there is no way to generate an alternative if uniqueness is not met and there is no way to generate an alternative
verifiable verifiable
key, then the connection MUST fall back to using regular TCP by not send key, then the connection <bcp14>MUST</bcp14> fall back to using regular
ing a TCP by not sending an
MP_CAPABLE in the SYN/ACK.</t> MP_CAPABLE in the SYN&wj;/ACK.</t>
<t>The ACK carries both A's key and B's key. This is the first time that A's key is seen on the wire, although it is expected that A will have generated a key locally before the initial SYN. The echoing of B's key allows B to operate statelessly, as described above. Therefore, A's key must be delivered reliably to B, and in order to do this, the transmission of this packet must be made reliable.</t> <t>The ACK carries both A's key and B's key. This is the first time that A's key is seen on the wire, although it is expected that A will have generated a key locally before the initial SYN. The echoing of B's key allows B to operate statelessly, as described above. Therefore, A's key must be delivered reliably to B, and in order to do this, the transmission of this packet must be made reliable.</t>
<t>If B has data to send first, then the reliable delivery of the
<t>If B has data to send first, then the reliable delivery of the ACK+MP ACK&nbsp;+&nbsp;MP_CAPABLE can be inferred by the receipt of this data w
_CAPABLE can be inferred by the receipt of this data with a MPTCP Data Sequence ith an
Signal (DSS) option (<xref target="sec_generalop"/>). If, however, A wishes to s MPTCP Data Sequence Signal (DSS) option (<xref target="sec_generalop"
end data first, it has two options to ensure the reliable delivery of the ACK+MP format="default"/>). If, however, A wishes to send data first, it has
_CAPABLE. If it immediately has data to send, then the third ACK (with data) wou two options to ensure the reliable delivery of the ACK + MP_CAPABLE. If
ld also contain an MP_CAPABLE option with additional data parameters (the Data-L it immediately has data to send, then the first ACK (with data) would
evel Length and optional Checksum as shown in <xref target="tcpm_capable"/>). If also contain an MP_CAPABLE option with additional data parameters (the
A does not immediately have data to send, it MUST include the MP_CAPABLE on the Data-Level Length and optional Checksum as shown in <xref
third ACK, but without the additional data parameters. When A does have data to target="tcpm_capable" format="default"/>). If A does not immediately
send, it must repeat the sending of the MP_CAPABLE option from the third ACK, w have data to send, it <bcp14>MUST</bcp14> include the MP_CAPABLE on
ith additional data parameters. This MP_CAPABLE option is in place of the DSS, a the first ACK, but without the additional data parameters. When A does
nd simply specifies the data-level length of the payload, and the checksum (if t have data to send, it must repeat the sending of the MP_CAPABLE option
he use of checksums is negotiated). This is the minimal data required to establi from the first ACK, with additional data parameters. This MP_CAPABLE
sh a MPTCP connection - it allows validation of the payload, and given it is the option is used in place of the DSS and simply specifies (1)&nbsp;the Dat
first data, the Initial Data Sequence Number (IDSN) is also known (as it is gen a-Level
erated from the key, as described below). Conveying the keys on the first data p Length of the payload and (2)&nbsp;the checksum (if the use of checksums
acket allows the TCP reliability mechanisms to ensure the packet is successfully is
delivered. The receiver will acknowledge this data at the connection level with negotiated). This is the minimal data required to establish an MPTCP
a Data ACK, as if a DSS option has been received.</t> connection -- it allows validation of the payload, and given that it is
the
<t>There could be situations where both A and B attempt to transmit init first data, the Initial Data Sequence Number (IDSN) is also known (as
ial data at the same time. For example, if A did not initially have data to send it is generated from the key, as described below). Conveying the keys
, but then needed to transmit data before it had received anything from B, it wo on the first data packet allows the TCP reliability mechanisms to
uld use a MP_CAPABLE option with data parameters (since it would not know if the ensure that the packet is successfully delivered. The receiver will ackn
MP_CAPABLE on the ACK was received). In such a situation, B may also have trans owledge this data at the connection level with a Data ACK, as if a DSS option ha
mitted data with a DSS option, but it had not yet been received at A. Therefore, s been received.</t>
B has received data with a MP_CAPABLE mapping after it has sent data with a DSS <t>There could be situations where both A and B attempt to transmit
option. To ensure these situations can be handled, it follows that the data par initial data at the same time. For example, if A did not initially
ameters in a MP_CAPABLE are semantically equivalent to those in a DSS option and have data to send but then needed to transmit data before it had
can be used interchangeably. Similar situations could occur when the MP_CAPABLE received anything from B, it would use an MP_CAPABLE option with data
with data is lost and retransmitted. Furthermore, in the case of TCP Segmentati parameters (since it would not know if the MP_CAPABLE on the ACK was
on Offloading, the MP_CAPABLE with data parameters may be duplicated across mult received). In such a situation, B may also have transmitted data with
iple packets, and implementations must also be able to cope with duplicate MP_CA a DSS option, but it had not yet been received at A. Therefore, B has
PABLE mappings as well as duplicate DSS mappings.</t> received data with an MP_CAPABLE mapping after it has sent data with a
DSS option. To ensure that these situations can be handled, it follows t
<t>Additionally, the MP_CAPABLE exchange allows the safe passage of MPTC hat the data parameters in an MP_CAPABLE are semantically equivalent to those in
P options on SYN packets to be determined. If any of these options are dropped, a DSS option and can be used interchangeably. Similar situations could occur wh
MPTCP will gracefully fall back to regular single-path TCP, as documented in <xr en the MP_CAPABLE with data is lost and retransmitted. Furthermore, in the case
ef target="sec_fallback"/>. If at any point in the handshake either party think of TCP segmentation offloading, the MP_CAPABLE with data parameters may be dupli
s the MPTCP negotiation is compromised, for example by a middlebox corrupting th cated across multiple packets, and implementations must also be able to cope wit
e TCP options, or unexpected ACK numbers being present, the host MUST stop using h duplicate MP_CAPABLE mappings as well as duplicate DSS mappings.</t>
MPTCP and no longer include MPTCP options in future TCP packets. The other host <t>Additionally, the MP_CAPABLE exchange allows the safe passage of
will then also fall back to regular TCP using the fall back mechanism. Note th MPTCP options on SYN packets to be determined. If any of these options
at new subflows MUST NOT be established (using the process documented in <xref t are dropped, MPTCP will gracefully fall back to regular single-path
arget="sec_join"/>) until a Data Sequence Signal (DSS) option has been successfu TCP, as documented in <xref target="sec_fallback" format="default"/>.
lly received across the path (as documented in <xref target="sec_generalop"/>).< If at any point in the handshake either party thinks the MPTCP
/t> negotiation is compromised -- for example, by a middlebox corrupting
the TCP options or by unexpected ACK numbers being present -- the host <
<t>Like all MPTCP options, the MP_CAPABLE option starts with the Kind an bcp14>MUST</bcp14> stop using MPTCP and no longer include MPTCP options in futur
d Length to specify the TCP-option kind and its length. Followed by that is the e TCP packets. The other host will then also fall back to regular TCP using the
MP_CAPABLE option. The first 4 bits of the first octet in the MP_CAPABLE option fallback mechanism. Note that new subflows <bcp14>MUST NOT</bcp14> be establish
(<xref target="tcpm_capable"/>) define the MPTCP option subtype (see <xref targe ed (using the process documented in <xref target="sec_join" format="default"/>)
t="IANA"/>; for MP_CAPABLE, this is 0x0), and the remaining 4 bits of this octet until a DSS option has been successfully received across the path (as documented
specify the MPTCP version in use (for this specification, this is 1).</t> in <xref target="sec_generalop" format="default"/>).</t>
<t>Like all MPTCP options, the MP_CAPABLE option starts with the Kind
and Length to specify the TCP option's kind and length. This
information is followed by the MP_CAPABLE option. The first 4 bits of
the first octet in the MP_CAPABLE option (<xref target="tcpm_capable"
format="default"/>) define the MPTCP Option Subtype (see <xref
target="IANA" format="default"/>; for MP_CAPABLE, this value is
0x0), and the remaining 4&nbsp;bits of this octet specify the MPTCP
version in use (for this specification, this value is&nbsp;1).</t>
<t>The second octet is reserved for flags, allocated as follows: <t>The second octet is reserved for flags, allocated as follows:
<list style="hanging"> </t>
<t hangText="A:"> The leftmost bit, labeled "A", SHOULD be set to 1 to <dl newline="false" spacing="normal" indent="14">
indicate "Checksum Required", unless the system administrator has decided that <dt>A:</dt>
checksums are not required (for example, if the environment is controlled and no <dd> The leftmost bit, labeled "A", <bcp14>SHOULD</bcp14> be set to 1
middleboxes exist that might adjust the payload).</t> to indicate "Checksum required", unless the system administrator has decided tha
<t hangText="B:"> The second bit, labeled "B", is an extensibility fla t checksums are not required (for example, if the environment is controlled and
g, and MUST be set to 0 for current implementations. This will be used for an ex no middleboxes exist that might adjust the payload).</dd>
tensibility mechanism in a future specification, and the impact of this flag wil <dt>B:</dt>
l be defined at a later date. It is expected, but not mandated, that this flag w <dd> The second bit, labeled "B", is an extensibility flag. It
ould be used as part of an alternative security mechanism that does not require <bcp14>MUST</bcp14> be set to 0 for current implementations. This
a full version upgrade of the protocol, but does require redefining some element flag will be used for an extensibility mechanism in a future specifica
s of the handshake. If receiving a message with the 'B' flag set to 1, and this tion, and the impact of this flag will be defined at a later date. It is expecte
is not understood, then the MP_CAPABLE in this SYN MUST be silently ignored, whi d, but not mandated, that this flag would be used as part of an alternative secu
ch triggers a fallback to regular TCP; the sender is expected to retry with a fo rity mechanism that does not require a full version upgrade of the protocol but
rmat compatible with this legacy specification. Note that the length of the MP_C does require redefining some elements of the handshake. If receiving a message w
APABLE option, and the meanings of bits "D" through "H", may be altered by setti ith the "B" flag set to 1 and this is not understood, then the MP_CAPABLE in thi
ng B=1.</t> s SYN <bcp14>MUST</bcp14> be silently ignored, which triggers a fallback to regu
<t hangText="C:"> The third bit, labeled "C", is set to "1" to indicat lar TCP; the sender is expected to retry with a format compatible with this lega
e that the sender of this option will not accept additional MPTCP subflows to th cy specification. Note that the length of the MP_CAPABLE option, and the meaning
e source address and port, and therefore the receiver MUST NOT try to open any a s of bits "D" through "H", may be altered by setting B=1.</dd>
dditional subflows towards this address and port. This is an efficiency improvem <dt>C:</dt>
ent for situations where the sender knows a restriction is in place, for example <dd> The third bit, labeled "C", is set to 1 to indicate that the
if the sender is behind a strict NAT, or operating behind a legacy Layer 4 load sender of this option will not accept additional MPTCP subflows to
balancer.</t> the source address and port, and therefore the receiver <bcp14>MUST
<t hangText="D through H:"> The remaining bits, labeled "D" through "H NOT</bcp14> try to open any additional subflows toward this address
", are used for crypto algorithm negotiation. In this specification only the ri and port. This improves efficiency in situations where the
ghtmost bit, labeled "H", is assigned. Bit "H" indicates the use of HMAC-SHA256 sender knows a restriction is in place -- for example, if the sender i
(as defined in <xref target="sec_join"/>). An implementation that only support s behind a strict NAT or operating behind a legacy Layer 4 load balancer.</dd>
s this method MUST set bit "H" to 1, and bits "D" through "G" to 0.</t> <dt>D through H:</dt>
</list> <dd> The remaining bits, labeled "D" through "H", are used for
crypto algorithm negotiation. In this specification, only the
A crypto algorithm MUST be specified. If flag bits D through H are all rightmost bit, labeled "H", is assigned. Bit "H" indicates the use
0, the MP_CAPABLE option MUST be treated as invalid and ignored (that is, it mus of HMAC-SHA256 (as defined in <xref target="sec_join"
t be treated as a regular TCP handshake).</t> format="default"/>). An implementation that only supports this
method <bcp14>MUST</bcp14> set bit "H" to 1 and bits "D"
<t>The selection of the authentication algorithm also impacts the algori through "G" to 0.</dd>
thm used to generate the token and the Initial Data Sequence Number (IDSN). In t
his specification, with only the SHA-256 algorithm (bit "H") specified and selec
ted, the token MUST be a truncated (most significant 32 bits) SHA-256 hash (<xre
f target="RFC6234"/>) of the key. A different, 64-bit truncation (the least sign
ificant 64 bits) of the SHA-256 hash of the key MUST be used as the IDSN. Note t
hat the key MUST be hashed in network byte order. Also note that the "least sign
ificant" bits MUST be the rightmost bits of the SHA-256 digest, as per <xref tar
get="RFC6234"/>. Future specifications of the use of the crypto bits may choose
to specify different algorithms for token and IDSN generation.</t>
<t>Both the crypto and checksum bits negotiate capabilities in similar w
ays. For the Checksum Required bit (labeled "A"), if either host requires the us
e of checksums, checksums MUST be used. In other words, the only way for checksu
ms not to be used is if both hosts in their SYNs set A=0. This decision is confi
rmed by the setting of the "A" bit in the third packet (the ACK) of the handshak
e. For example, if the initiator sets A=0 in the SYN, but the responder sets A=1
in the SYN/ACK, checksums MUST be used in both directions, and the initiator wi
ll set A=1 in the ACK. The decision whether to use checksums will be stored by a
n implementation in a per-connection binary state variable. If A=1 is received b
y a host that does not want to use checksums, it MUST fall back to regular TCP b
y ignoring the MP_CAPABLE option as if it was invalid.</t>
<t>For crypto negotiation, the responder has the choice. The initiator c
reates a proposal setting a bit for each algorithm it supports to 1 (in this ver
sion of the specification, there is only one proposal, so bit "H" will be always
set to 1). The responder responds with only 1 bit set -- this is the chosen alg
orithm. The rationale for this behavior is that the responder will typically be
a server with potentially many thousands of connections, so it may wish to choos
e an algorithm with minimal computational complexity, depending on the load. If
a responder does not support (or does not want to support) any of the initiator'
s proposals, it MUST respond without an MP_CAPABLE option, thus forcing a fallba
ck to regular TCP.</t>
<t>The MP_CAPABLE option is only used in the first subflow of a connecti </dl>
on, in order to identify the connection; all following subflows will use the "Jo <t>A crypto algorithm <bcp14>MUST</bcp14> be specified. If flag bits "D
in" option (see <xref target="sec_join"/>) to join the existing connection.</t> " through "H" are all 0, the MP_CAPABLE option <bcp14>MUST</bcp14> be treated as
invalid and ignored (that is, it must be treated as a regular TCP handshake).</
t>
<t>The selection of the authentication algorithm also impacts the algori
thm used to generate the token and the IDSN. In this specification, with only th
e SHA-256 algorithm (bit "H") specified and selected, the token <bcp14>MUST</bcp
14> be a truncated (most significant 32&nbsp;bits) SHA-256 hash <xref target="RF
C6234" format="default"/> of the key. A different, 64-bit truncation (the least
significant 64 bits) of the SHA-256 hash of the key <bcp14>MUST</bcp14> be used
as the IDSN. Note that the key <bcp14>MUST</bcp14> be hashed in network byte ord
er. Also note that the "least significant" bits <bcp14>MUST</bcp14> be the right
most bits of the SHA-256 digest, as per <xref target="RFC6234" format="default"/
>. Future specifications of the use of the crypto bits may choose to specify dif
ferent algorithms for token and IDSN generation.</t>
<t>Both the crypto and checksum bits negotiate capabilities in similar
ways. For the "Checksum required" bit (labeled "A"), if either host
requires the use of checksums, checksums <bcp14>MUST</bcp14> be
used. In other words, the only way for checksums not to be used is if
both hosts in their SYNs set A=0. This decision is confirmed by the
setting of the "A" bit in the third packet (the ACK) of the
handshake. For example, if the initiator sets A=0 in the SYN but the
responder sets A=1 in the SYN/ACK, checksums <bcp14>MUST</bcp14> be
used in both directions, and the initiator will set A=1 in the
ACK. The decision regarding whether to use checksums will be stored by a
n implementation in a per-connection binary state variable. If A=1 is received b
y a host that does not want to use checksums, it <bcp14>MUST</bcp14> fall back t
o regular TCP by ignoring the MP_CAPABLE option as if it was invalid.</t>
<t>For crypto negotiation, the responder has the choice. The initiator
creates a proposal setting a bit for each algorithm it supports to 1
(in this version of the specification, there is only one proposal, so
bit "H" will always be set to 1). The responder responds with only 1&nbs
p;bit set -- this is the chosen algorithm. The rationale for this behavior is th
at the responder will typically be a server with potentially many thousands of c
onnections, so it may wish to choose an algorithm with minimal computational com
plexity, depending on the load. If a responder does not support (or does not wan
t to support) any of the initiator's proposals, it <bcp14>MUST</bcp14> respond w
ithout an MP_CAPABLE option, thus forcing a fallback to regular TCP.</t>
<t>The MP_CAPABLE option is only used in the first subflow of a
connection, in order to identify the connection; all subsequent
subflows will use the MP_JOIN option (see <xref target="sec_join"
format="default"/>) to join the existing connection.</t>
<t>If a SYN contains an MP_CAPABLE option but the <t>If a SYN contains an MP_CAPABLE option but the
SYN/ACK does not, it is assumed that sender of the SYN/ACK is not SYN/ACK does not, it is assumed that the sender of the SYN/ACK is not
multipath capable; thus, the MPTCP session MUST operate as multipath capable; thus, the MPTCP session <bcp14>MUST</bcp14> operate as
a regular, single-path TCP. If a SYN does not contain a
MP_CAPABLE option, the SYN/ACK MUST NOT contain one a regular, single-path TCP session. If a SYN does not contain an
MP_CAPABLE option, the SYN/ACK <bcp14>MUST NOT</bcp14> contain one
in response. If the third packet (the ACK) does not contain in response. If the third packet (the ACK) does not contain
the MP_CAPABLE option, then the session MUST fall back to the MP_CAPABLE option, then the session <bcp14>MUST</bcp14> fall back to
operating as a regular, single-path TCP. This is to maintain operating as a regular, single-path TCP session. This is done to maintai
n
compatibility with middleboxes on the path that drop some compatibility with middleboxes on the path that drop some
or all TCP options. Note that an implementation MAY choose or all TCP options. Note that an implementation <bcp14>MAY</bcp14> choose
to attempt sending MPTCP options more than one time before to attempt sending MPTCP options more than one time before
making this decision to operate as regular TCP (see making this decision to operate as regular TCP (see
<xref target="heuristics"/>).</t> <xref target="heuristics" format="default"/>).</t>
<t>If the SYN packets are unacknowledged, it is up to local <t>If the SYN packets are unacknowledged, it is up to local
policy to decide how to respond. It is expected that a sender policy to decide how to respond. It is expected that a sender
will eventually fall back to single-path TCP (i.e., without the will eventually fall back to single-path TCP (i.e., without the
MP_CAPABLE option) in order to work around middleboxes that MP_CAPABLE option) in order to work around middleboxes that
may drop packets with unknown options; however, the number of may drop packets with unknown options; however, the number of
multipath-capable attempts that are made first will be up to multipath-capable attempts that are made first will be up to
local policy. local policy.
It is possible that MPTCP and non-MPTCP SYNs could get reordered It is possible that MPTCP and non-MPTCP SYNs could get reordered
in the network. Therefore, the final state is inferred from the in the network. Therefore, the final state is inferred from the
presence or absence of the MP_CAPABLE option in the third packet presence or absence of the MP_CAPABLE option in the third packet
of the TCP handshake. If this option is not present, the of the TCP handshake. If this option is not present, the
connection SHOULD fall back to regular TCP, as documented in connection <bcp14>SHOULD</bcp14> fall back to regular TCP, as documented
<xref target="sec_fallback"/>.</t> in
<xref target="sec_fallback" format="default"/>.</t>
<t>The initial data sequence number on an MPTCP connection <t>The IDSN on an MPTCP connection
is generated from the key. The algorithm for IDSN generation is is generated from the key. The algorithm for IDSN generation is
also determined from the negotiated authentication algorithm. also determined from the negotiated authentication algorithm.
In this specification, with only the SHA-256 algorithm specified and In this specification, with only the SHA-256 algorithm specified and
selected, the IDSN of a host MUST be the least significant 64 bits of the selected, the IDSN of a host <bcp14>MUST</bcp14> be the least significant 64&nbsp;bits of the
SHA-256 hash of its key, i.e., IDSN-A = Hash(Key-A) and IDSN-B = Hash(Key-B). SHA-256 hash of its key, i.e., IDSN-A = Hash(Key-A) and IDSN-B = Hash(Key-B).
This deterministic generation of the IDSN allows a receiver to ensure This deterministic generation of the IDSN allows a receiver to ensure
that there are no gaps in sequence space at the start of the connection. that there are no gaps in sequence space at the start of the connection.
The SYN with MP_CAPABLE occupies the first octet of data sequence space, The SYN with MP_CAPABLE occupies the first octet of data sequence space,
although this does not need to be acknowledged at the connection level although this does not need to be acknowledged at the connection level
until the first data is sent (see <xref target="sec_generalop"/>).</t> until the first data is sent (see <xref target="sec_generalop" format="default"/>).</t>
</section> </section>
<section anchor="sec_join" numbered="true" toc="default">
<section title="Starting a New Subflow" anchor="sec_join"> <name>Starting a New Subflow</name>
<t>Once an MPTCP connection has begun with the MP_CAPABLE <t>Once an MPTCP connection has begun with the MP_CAPABLE
exchange, further subflows can be added to the connection. exchange, further subflows can be added to the connection.
Hosts have knowledge of their own address(es), and can Hosts have knowledge of their own address(es) and can
become aware of the other host's addresses through become aware of the other host's addresses through
signaling exchanges as described in signaling exchanges as described in
<xref target="sec_pm"/>. Using this knowledge, a host <xref target="sec_pm" format="default"/>. Using this knowledge, a host
can initiate a new subflow over a currently unused pair of can initiate a new subflow over a currently unused pair of
addresses. It is permitted for either host in a connection addresses. It is permissible for either host in a connection
to initiate the creation of a new subflow, but it is expected to initiate the creation of a new subflow, but it is expected
that this will normally be the original connection initiator that this will normally be the original connection initiator
(see <xref target="heuristics"/> for heuristics).</t> (see <xref target="heuristics" format="default"/> for heuristics).</t>
<t>A new subflow is started as a normal TCP SYN/ACK <t>A new subflow is started as a normal TCP SYN/ACK
exchange. The Join Connection (MP_JOIN) MPTCP option exchange. The Join Connection (MP_JOIN) MPTCP option
is used to identify the connection to be joined by the new subflow. is used to identify the connection to be joined by the new subflow.
It uses keying material that was exchanged in the initial MP_CAPABLE It uses keying material that was exchanged in the initial MP_CAPABLE
handshake (<xref target="sec_init"/>), and that handshake also handshake (<xref target="sec_init" format="default"/>), and that handshake also
negotiates the crypto algorithm in use for the MP_JOIN handshake.</t> negotiates the crypto algorithm in use for the MP_JOIN handshake.</t>
<t>This section specifies the behavior of MP_JOIN using the HMAC-SHA256 <t>This section specifies the behavior of MP_JOIN using the HMAC-SHA256
algorithm. An MP_JOIN option is present in the SYN, SYN/ACK, algorithm. An MP_JOIN option is present in the SYN, SYN/ACK,
and ACK of the three-way handshake, although in each case with a and ACK of the three-way handshake, although in each case with a
different format.</t> different format.</t>
<t>In the first MP_JOIN on the SYN packet, illustrated in <t>In the first MP_JOIN on the SYN packet, illustrated in
<xref target="tcpm_join"/>, the initiator sends a token, random <xref target="tcpm_join" format="default"/>, the initiator sends a token
number, and address ID.</t> , random
number, and Address ID.</t>
<figure anchor="tcpm_join">
<name>Join Connection (MP_JOIN) Option (for Initial SYN)</name>
<artwork align="left" name="" type="" alt=""><![CDATA[
1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+---------------+---------------+-------+-----+-+---------------+
| Kind | Length = 12 |Subtype|(rsv)|B| Address ID |
+---------------+---------------+-------+-----+-+---------------+
| Receiver's Token (32 bits) |
+---------------------------------------------------------------+
| Sender's Random Number (32 bits) |
+---------------------------------------------------------------+ ]]></artwork
>
</figure>
<t>The token is used to identify the MPTCP connection and is a <t>The token is used to identify the MPTCP connection and is a
cryptographic hash of the receiver's key, as exchanged cryptographic hash of the receiver's key, as exchanged
in the initial MP_CAPABLE handshake (<xref target="sec_init"/>). in the initial MP_CAPABLE handshake (<xref target="sec_init" format="default"/>).
In this specification, the tokens presented in this In this specification, the tokens presented in this
option are generated by the SHA-256 <xref target="RFC6234"/> option are generated by the SHA-256 algorithm <xref target="RFC6234" for
algorithm, truncated to the most significant 32 bits. The token mat="default"/>, truncated to the most significant 32 bits. The token
included in the MP_JOIN option is the token that the receiver included in the MP_JOIN option is the token that the receiver
of the packet uses to identify this connection; i.e., Host A of the packet uses to identify this connection; i.e., Host A
will send Token-B (which is generated from Key-B). Note that the will send Token-B (which is generated from Key-B). Note that the
hash generation algorithm can be overridden by the choice of hash generation algorithm can be overridden by the choice of
cryptographic handshake algorithm, as defined in <xref target="sec_init" cryptographic handshake algorithm, as defined in <xref target="sec_init"
/>.</t> format="default"/>.</t>
<t>The MP_JOIN SYN sends not only the token (which is static for a <t>The MP_JOIN SYN sends not only the token (which is static for a
connection) but also random numbers (nonces) that are used to prevent connection) but also random numbers (nonces) that are used to prevent
replay attacks on the authentication method. Recommendations for the replay attacks on the authentication method. Recommendations for the
generation of random numbers for this purpose are given in <xref target= generation of random numbers for this purpose are given in <xref target=
"RFC4086"/>.</t> "RFC4086" format="default"/>.</t>
<t>The MP_JOIN option includes an "Address ID". This is an identifier <t>The MP_JOIN option includes an "Address ID". This is an identifier
generated by the sender of the option, used to identify the source address generated by the sender of the option, used to identify the source address
of this packet, even if the IP header has been changed in transit by a middlebox. of this packet, even if the IP header has been changed in transit by a middlebox.
The numeric value of this field is generated by the sender and must map uniquely The numeric value of this field is generated by the sender and must map uniquely
to a source IP address for the sending host. to a source IP address for the sending host.
The Address ID allows address removal (<xref target="sec_remove_addr"/>) The Address ID allows address removal (<xref target="sec_remove_addr" format="default"/>)
without needing to know what the source address at the without needing to know what the source address at the
receiver is, thus allowing address removal through NATs. receiver is, thus allowing address removal through NATs.
The Address ID also allows correlation between new subflow setup attempts The Address ID also allows correlation between new subflow setup attempts
and address signaling (<xref target="sec_add_address"/>), and address signaling (<xref target="sec_add_address" format="default"/>),
to prevent setting up duplicate subflows on the same path, if an MP_JOIN to prevent setting up duplicate subflows on the same path, if an MP_JOIN
and ADD_ADDR are sent at the same time.</t> and ADD_ADDR are sent at the same time.</t>
<t>The Address IDs of the subflow used in the initial SYN <t>The Address IDs of the subflow used in the initial SYN
exchange of the first subflow in the connection are implicit, exchange of the first subflow in the connection are implicit
and have the value zero. A host MUST store the mappings between and have the value zero. A host <bcp14>MUST</bcp14> store the mappings b
etween
Address IDs and addresses both for itself and the remote host. Address IDs and addresses both for itself and the remote host.
An implementation will also need to know which local and remote An implementation will also need to know which local and remote
Address IDs are associated with which established subflows, for Address IDs are associated with which established subflows, for
when addresses are removed from a local or remote host.</t> when addresses are removed from a local or remote host.</t>
<t>The MP_JOIN option on packets with the SYN flag set also includes
<t>The MP_JOIN option on packets with the SYN flag set also includes 4 b 4&nbsp;bits of flags, 3 of which are currently reserved and
its of flags, 3 of which are currently reserved and MUST be set to zero by the s <bcp14>MUST</bcp14> be set to 0 by the sender. The final bit, labeled
ender. The final bit, labeled "B", indicates whether the sender of this option w "B", indicates whether the sender of this option (1)&nbsp;wishes this
ishes this subflow to be used as a backup path (B=1) in the event of failure of subflow to be used as a backup path (B=1) in the event of failure of
other paths, or whether it wants it to be used as part of the connection immedia other paths or (2)&nbsp;wants the subflow to be used as part of the
tely. By setting B=1, the sender of the option is requesting the other host to o connection immediately. By setting B=1, the sender of the option is
nly send data on this subflow if there are no available subflows where B=0. Subf requesting that the other host only send data on this subflow if there
low policy is discussed in more detail in <xref target="sec_policy"/>.</t> are no available subflows where B=0. Subflow policy is discussed in more
detail in <xref target="sec_policy" format="default"/>.</t>
<?rfc needLines='10'?>
<figure align="center" anchor="tcpm_join" title="Join Connection (MP_JOI
N) Option (for Initial SYN)">
<artwork align="left"><![CDATA[
1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+---------------+---------------+-------+-----+-+---------------+
| Kind | Length = 12 |Subtype|(rsv)|B| Address ID |
+---------------+---------------+-------+-----+-+---------------+
| Receiver's Token (32 bits) |
+---------------------------------------------------------------+
| Sender's Random Number (32 bits) |
+---------------------------------------------------------------+
]]></artwork>
</figure>
<t>When receiving a SYN with an MP_JOIN option that contains <t>When receiving a SYN with an MP_JOIN option that contains
a valid token for an existing MPTCP connection, the recipient a valid token for an existing MPTCP connection, the recipient
SHOULD respond with a SYN/ACK also containing an MP_JOIN <bcp14>SHOULD</bcp14> respond with a SYN/ACK also containing an MP_JOIN
option containing a random number and a truncated (leftmost 64 option containing a random number and a truncated (leftmost 64&nbsp;bits
bits) Hash-based Message Authentication Code (HMAC). This ) HMAC. This
version of the option is shown in <xref target="tcpm_join2"/>. version of the option is shown in <xref target="tcpm_join2" format="defa
If the token is unknown, or the host wants to refuse subflow ult"/>. If the token is unknown or the host wants to refuse subflow
establishment (for example, due to a limit on the number of establishment (for example, due to a limit on the number of
subflows it will permit), the receiver will send back a reset subflows it will permit), the receiver will send back a reset
(RST) signal, analogous to an unknown port in TCP, containing a (RST) signal, analogous to an unknown port in TCP, containing an
MP_TCPRST option (<xref target="sec_reset"/>) with a "MPTCP MP_TCPRST option (<xref target="sec_reset" format="default"/>) with an "
MPTCP
specific error" reason code. Although calculating an HMAC specific error" reason code. Although calculating an HMAC
requires cryptographic operations, it is believed that the requires cryptographic operations, it is believed that the
32-bit token in the MP_JOIN SYN gives sufficient protection against blind state 32-bit token in the MP_JOIN SYN gives sufficient protection against blind state
exhaustion attacks; therefore, there is no need to provide exhaustion attacks; therefore, there is no need to provide
mechanisms to allow a responder to operate statelessly at the mechanisms to allow a responder to operate statelessly at the
MP_JOIN stage.</t> MP_JOIN stage.</t>
<figure anchor="tcpm_join2">
<name>Join Connection (MP_JOIN) Option (for Responding SYN/ACK)</name>
<artwork align="left" name="" type="" alt=""><![CDATA[
1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+---------------+---------------+-------+-----+-+---------------+
| Kind | Length = 16 |Subtype|(rsv)|B| Address ID |
+---------------+---------------+-------+-----+-+---------------+
| |
| Sender's Truncated HMAC (64 bits) |
| |
+---------------------------------------------------------------+
| Sender's Random Number (32 bits) |
+---------------------------------------------------------------+ ]]></artwork
>
</figure>
<t>An HMAC is sent by both hosts -- by the initiator (Host A) <t>An HMAC is sent by both hosts -- by the initiator (Host A)
in the third packet (the ACK) and by the responder (Host B) in in the third packet (the ACK) and by the responder (Host B) in
the second packet (the SYN/ACK). Doing the HMAC exchange at this the second packet (the SYN/ACK). Doing the HMAC exchange at this
stage allows both hosts to have first exchanged random data (in the stage allows both hosts to have first exchanged random data (in the
first two SYN packets) that is used as the "message". This first two SYN packets) that is used as the "message". This
specification defines that HMAC as defined in <xref target="RFC2104"/> specification defines that HMAC as defined in <xref target="RFC2104" for
is used, along with the SHA-256 hash algorithm <xref target="RFC6234"/>, mat="default"/>
is used, along with the SHA-256 hash algorithm <xref target="RFC6234" fo
rmat="default"/>,
and that the output is truncated to the leftmost 160 bits (20 octets). and that the output is truncated to the leftmost 160 bits (20 octets).
Due to option space limitations, the HMAC included in Due to option space limitations, the HMAC included in
the SYN/ACK is truncated to the leftmost 64 bits, but this is the SYN/ACK is truncated to the leftmost 64 bits, but this is
acceptable since random numbers are used; thus, an attacker acceptable, since random numbers are used; thus, an attacker
only has one chance to correctly guess the HMAC that matches the random only has one chance to correctly guess the HMAC that matches the random
number previously sent by the peer (if the HMAC is number previously sent by the peer (if the HMAC is
incorrect, the TCP connection is closed, so a new MP_JOIN negotiation incorrect, the TCP connection is closed, so a new MP_JOIN negotiation
with a new random number is required).</t> with a new random number is required).</t>
<t>The initiator's authentication information is sent in its <t>The initiator's authentication information is sent in its
first ACK (the third packet of the handshake), as shown in first ACK (the third packet of the handshake), as shown in
<xref target="tcpm_join3"/>. This data needs to be sent reliably, <xref target="tcpm_join3" format="default"/>. This data needs to be sent reliably,
since it is the only time this HMAC is sent; since it is the only time this HMAC is sent;
therefore, receipt of this packet MUST trigger a regular TCP ACK therefore, receipt of this packet <bcp14>MUST</bcp14> trigger a regular
in response, and the packet MUST be retransmitted if this TCP ACK
in response, and the packet <bcp14>MUST</bcp14> be retransmitted if this
ACK is not received. In other words, sending the ACK/MP_JOIN ACK is not received. In other words, sending the ACK/MP_JOIN
packet places the subflow in the PRE_ESTABLISHED state, and it packet places the subflow in the PRE_ESTABLISHED state, and it
moves to the ESTABLISHED state only on receipt of an ACK from moves to the ESTABLISHED state only on receipt of an ACK from
the receiver. It is not permitted to send data while in the the receiver. It is not permissible to send data while in the
PRE_ESTABLISHED state. The reserved bits in this option MUST be set PRE_ESTABLISHED state. The reserved bits in this option <bcp14>MUST</bcp
to zero by the sender.</t> 14> be set
to 0 by the sender.</t>
<t>The key for the HMAC algorithm, in the case of the message transmitte <figure anchor="tcpm_join3">
d by Host A, will be Key-A followed by Key-B, and in the case of Host B, Key-B f <name>Join Connection (MP_JOIN) Option (for&nbsp;Initiator's&nbsp;Firs
ollowed by Key-A. These are the keys that were exchanged in the original MP_CAPA t&nbsp;ACK)</name>
BLE handshake. The "message" for the HMAC algorithm in each case is the concaten <artwork align="left" name="" type="" alt=""><![CDATA[
ations of random number for each host (denoted by R): for Host A, R-A followed b 1 2 3
y R-B; and for Host B, R-B followed by R-A.</t> 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+---------------+---------------+-------+-----------------------+
<?rfc needLines='10'?> | Kind | Length = 24 |Subtype| (reserved) |
<figure align="center" anchor="tcpm_join2" title="Join Connection (MP_JO +---------------+---------------+-------+-----------------------+
IN) Option (for Responding SYN/ACK)"> | |
<artwork align="left"><![CDATA[ | |
1 2 3 | Sender's Truncated HMAC (160 bits) |
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |
+---------------+---------------+-------+-----+-+---------------+ | |
| Kind | Length = 16 |Subtype|(rsv)|B| Address ID | +---------------------------------------------------------------+ ]]></artwork
+---------------+---------------+-------+-----+-+---------------+ >
| |
| Sender's Truncated HMAC (64 bits) |
| |
+---------------------------------------------------------------+
| Sender's Random Number (32 bits) |
+---------------------------------------------------------------+
]]></artwork>
</figure>
<?rfc needLines='12'?>
<figure align="center" anchor="tcpm_join3" title="Join Connection (MP_JO
IN) Option (for Third ACK)">
<artwork align="left"><![CDATA[
1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+---------------+---------------+-------+-----------------------+
| Kind | Length = 24 |Subtype| (reserved) |
+---------------+---------------+-------+-----------------------+
| |
| |
| Sender's Truncated HMAC (160 bits) |
| |
| |
+---------------------------------------------------------------+
]]></artwork>
</figure> </figure>
<t>The key for the HMAC algorithm, in the case of the message
transmitted by Host A, will be Key-A followed by Key-B; and in the
case of Host B, Key-B followed by Key-A. These are the keys that were
exchanged in the original MP_CAPABLE handshake. The "message" for the
HMAC algorithm in each case is the concatenation of the random numbers for
each host (denoted by R): for Host A, R-A followed by R-B; and for
Host B, R-B followed by R-A.</t>
<t>These various MPTCP options fit together to enable authenticated subf
low setup as illustrated in <xref target="fig_tokens" format="default"/>.</t>
<figure anchor="fig_tokens">
<name>Example Use of MPTCP Authentication</name>
<artwork align="left" name="" type="" alt=""><![CDATA[
Host A Host B
------------------------ ----------
Address A1 Address A2 Address B1
---------- ---------- ----------
| | |
| | SYN + MP_CAPABLE |
|--------------------------------------------->|
|<---------------------------------------------|
| SYN/ACK + MP_CAPABLE(Key-B) |
| | |
| ACK + MP_CAPABLE(Key-A, Key-B) |
|--------------------------------------------->|
| | |
| | SYN + MP_JOIN(Token-B, R-A) |
| |------------------------------->|
| |<-------------------------------|
| | SYN/ACK + MP_JOIN(HMAC-B, R-B) |
| | |
| | ACK + MP_JOIN(HMAC-A) |
| |------------------------------->|
| |<-------------------------------|
| | ACK |
<t>These various MPTCP options fit together to enable authenticated subf HMAC-A = HMAC(Key=(Key-A + Key-B), Msg=(R-A + R-B))
low setup as illustrated in <xref target="fig_tokens"/>.</t> HMAC-B = HMAC(Key=(Key-B + Key-A), Msg=(R-B + R-A)) ]]></artwork>
<?rfc needLines='24'?>
<figure align="center" anchor="fig_tokens" title="Example Use of MPTCP A
uthentication">
<artwork align="left"><![CDATA[
Host A Host B
------------------------ ----------
Address A1 Address A2 Address B1
---------- ---------- ----------
| | |
| | SYN + MP_CAPABLE |
|--------------------------------------------->|
|<---------------------------------------------|
| SYN/ACK + MP_CAPABLE(Key-B) |
| | |
| ACK + MP_CAPABLE(Key-A, Key-B) |
|--------------------------------------------->|
| | |
| | SYN + MP_JOIN(Token-B, R-A) |
| |------------------------------->|
| |<-------------------------------|
| | SYN/ACK + MP_JOIN(HMAC-B, R-B) |
| | |
| | ACK + MP_JOIN(HMAC-A) |
| |------------------------------->|
| |<-------------------------------|
| | ACK |
HMAC-A = HMAC(Key=(Key-A+Key-B), Msg=(R-A+R-B))
HMAC-B = HMAC(Key=(Key-B+Key-A), Msg=(R-B+R-A))
]]></artwork>
</figure> </figure>
<t>If the token received at Host B is unknown or local policy <t>If the token received at Host B is unknown or local policy
prohibits the acceptance of the new subflow, the recipient MUST prohibits the acceptance of the new subflow, the recipient <bcp14>MUST</
respond with a TCP RST for the subflow. If appropriate, a MP_TCPRST bcp14>
option with a "Administratively prohibited" reason code respond with a TCP RST for the subflow. If appropriate, an MP_TCPRST
(<xref target="sec_reset"/>) should be included.</t> option with an "Administratively prohibited" reason code
(<xref target="sec_reset" format="default"/>) should be included.</t>
<t>If the token is accepted at Host B, but the HMAC returned to <t>If the token is accepted at Host B but the HMAC returned to
Host A does not match the one expected, Host A MUST close the Host A does not match the one expected, Host A <bcp14>MUST</bcp14> close
subflow with a TCP RST. In this, and all following cases of sending the
a RST in this section, the sender SHOULD send a MP_TCPRST option subflow with a TCP RST. In this and all subsequent cases of sending
(<xref target="sec_reset"/>) on this RST packet with the reason a RST as described in this section, the sender <bcp14>SHOULD</bcp14> sen
code for a "MPTCP specific error".</t> d an MP_TCPRST option
(<xref target="sec_reset" format="default"/>) on this RST packet with th
<t>If Host B does not receive the expected HMAC, or the MP_JOIN e reason
option is missing from the ACK, it MUST close the subflow with a code for an "MPTCP-specific error".</t>
<t>If Host B does not receive the expected HMAC or the MP_JOIN
option is missing from the ACK, it <bcp14>MUST</bcp14> close the subflow
with a
TCP RST.</t> TCP RST.</t>
<t>If the HMACs are verified as correct, then both hosts have <t>If the HMACs are verified as correct, then both hosts have
verified each other as being the same peers as existed at verified each other as being the same peers as those that existed at
the start of the connection, and they have agreed on the the start of the connection, and they have agreed on the
connection of which this subflow will become a part.</t> connection of which this subflow will become a part.</t>
<t>If the SYN/ACK as received at Host A does not have an MP_JOIN <t>If the SYN/ACK as received at Host A does not have an MP_JOIN
option, Host A MUST close the subflow with a TCP RST.</t> option, Host A <bcp14>MUST</bcp14> close the subflow with a TCP RST.</t>
<t>This covers all cases of the loss of an MP_JOIN. In more detail, <t>This covers all cases of the loss of an MP_JOIN. In more detail,
if MP_JOIN is stripped from the SYN on the path from A to if an MP_JOIN is stripped from the SYN on the path from A to
B, and Host B does not have a listener on the relevant B and Host&nbsp;B does not have a listener on the relevant
port, it will respond with a RST in the normal way. If in port, it will respond with a RST in the normal way. If in
response to a SYN with an MP_JOIN option, a SYN/ACK is response to a SYN with an MP_JOIN option a SYN/ACK is
received without the MP_JOIN option (either since it was received without the MP_JOIN option (because it was either
stripped on the return path, or it was stripped on the stripped on the return path, or stripped on the
outgoing path but Host B responded as if outgoing path leading to Host B responding as if
it were a new regular TCP session), then the subflow is it was a new regular TCP session), then the subflow is
unusable and Host A MUST close it with a RST.</t> unusable and Host A <bcp14>MUST</bcp14> close it with a RST.</t>
<t>Note that additional subflows can be created <t>Note that additional subflows can be created
between any pair of ports (but see <xref target="heuristics"/> for between any pair of ports (but see <xref target="heuristics" format="default"/> for
heuristics); no explicit application-level accept calls or heuristics); no explicit application-level accept calls or
bind calls are required to open additional subflows. To bind calls are required to open additional subflows. To
associate a new subflow with an existing connection, the token associate a new subflow with an existing connection, the token
supplied in the subflow's SYN exchange is used for supplied in the subflow's SYN exchange is used for
demultiplexing. This then binds the 5-tuple of the TCP demultiplexing. This then binds the 5-tuple of the TCP
subflow to the local token of the connection. A consequence is subflow to the local token of the connection. One consequence is
that it is possible to allow any port pairs to be used for a that it is possible to allow any port pairs to be used for a
connection. </t> connection. </t>
<t>Demultiplexing subflow SYNs <bcp14>MUST</bcp14> be done using the tok
<t>Demultiplexing subflow SYNs MUST be done using the token; en;
this is unlike traditional TCP, where the destination port is this is unlike traditional TCP, where the destination port is
used for demultiplexing SYN packets. Once a subflow is set up, used for demultiplexing SYN packets. Once a subflow is set up,
demultiplexing packets is done using the 5-tuple, as in demultiplexing packets is done using the 5-tuple, as in
traditional TCP. The 5-tuples will be mapped to the local traditional TCP. The 5-tuples will be mapped to the local
connection identifier (token). Note that Host A will know its connection identifier (token). Note that Host A will know its
local token for the subflow even though it is not sent on the local token for the subflow even though it is not sent on the
wire -- only the responder's token is sent.</t> wire -- only the responder's token is sent.</t>
</section> </section>
<section anchor="sec_generalop" numbered="true" toc="default">
<section title="General MPTCP Operation" anchor="sec_generalop"> <name>MPTCP Operation and Data Transfer</name>
<t>This section discusses operation of MPTCP for data transfer. At a hig <t>This section discusses the operation of MPTCP for data transfer. At a
h level, an MPTCP implementation will take one input data stream from an applica high level, an MPTCP implementation will take one input data stream from an app
tion, and split it into one or more subflows, with sufficient control informatio lication and split it into one or more subflows, with sufficient control informa
n to allow it to be reassembled and delivered reliably and in order to the recip tion to allow it to be reassembled and delivered reliably and in order to the re
ient application. The following subsections define this behavior in detail.</t> cipient application. The following subsections define this behavior in detail.</
t>
<t>The data sequence mapping and the Data ACK are signaled in the Data S <t>The Data Sequence Mapping and the Data ACK are signaled in the DSS op
equence Signal (DSS) option (<xref target="tcpm_dsn"/>). Either or both can be s tion (<xref target="tcpm_dsn" format="default"/>). Either or both can be signale
ignaled in one DSS, depending on the flags set. The data sequence mapping define d in one DSS, depending on the flags set. The Data Sequence Mapping defines how
s how the sequence space on the subflow maps to the connection level, and the Da the sequence space on the subflow maps to the connection level, and the Data ACK
ta ACK acknowledges receipt of data at the connection level. These functions are acknowledges receipt of data at the connection level. These functions are descr
described in more detail in the following two subsections.</t> ibed in more detail in the following two subsections.</t>
<figure anchor="tcpm_dsn">
<?rfc needLines='18'?> <name>Data Sequence Signal (DSS) Option</name>
<figure align="center" anchor="tcpm_dsn" title="Data Sequence Signal (DS <artwork align="left" name="" type="" alt=""><![CDATA[
S) Option">
<artwork align="left"><![CDATA[
1 2 3 1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+---------------+---------------+-------+----------------------+ +---------------+---------------+-------+----------------------+
| Kind | Length |Subtype| (reserved) |F|m|M|a|A| | Kind | Length |Subtype| (reserved) |F|m|M|a|A|
+---------------+---------------+-------+----------------------+ +---------------+---------------+-------+----------------------+
| Data ACK (4 or 8 octets, depending on flags) | | Data ACK (4 or 8 octets, depending on flags) |
+--------------------------------------------------------------+ +--------------------------------------------------------------+
| Data sequence number (4 or 8 octets, depending on flags) | | Data Sequence Number (4 or 8 octets, depending on flags) |
+--------------------------------------------------------------+ +--------------------------------------------------------------+
| Subflow Sequence Number (4 octets) | | Subflow Sequence Number (4 octets) |
+-------------------------------+------------------------------+ +-------------------------------+------------------------------+
| Data-Level Length (2 octets) | Checksum (2 octets) | | Data-Level Length (2 octets) | Checksum (2 octets) |
+-------------------------------+------------------------------+ +-------------------------------+------------------------------+ ]]></artwork>
]]></artwork>
</figure> </figure>
<t>The flags, when set, define the contents of this option, as follows: <t>The flags, when set, define the contents of this option, as follows:
<list style="symbols"> </t>
<t>A = Data ACK present</t> <ul spacing="normal">
<t>a = Data ACK is 8 octets (if not set, Data ACK is 4 octets)</t> <li>A = Data ACK present</li>
<t>M = Data Sequence Number (DSN), Subflow Sequence Number (SSN), Da <li>a = Data ACK is 8 octets (if not set, Data ACK is 4 octets)</li>
ta-Level Length, and Checksum (if negotiated) present</t> <li>M = Data Sequence Number (DSN), Subflow Sequence Number (SSN), Dat
<t>m = Data sequence number is 8 octets (if not set, DSN is 4 octets a-Level Length, and Checksum (if negotiated) present</li>
)</t> <li>m = Data Sequence Number is 8 octets (if not set, DSN is 4 octets)
</list> </li>
</ul>
The flags 'a' and 'm' only have meaning if the corresponding 'A' or 'M' <t>
flags are set; otherwise, they will be ignored. The maximum length of this optio
n, with all flags set, is 28 octets.</t>
<t>The 'F' flag indicates "Data FIN". If present, this means that this m
apping covers the final data from the sender. This is the connection-level equiv
alent to the FIN flag in single-path TCP. A connection is not closed unless ther
e has been a Data FIN exchange, a MP_FASTCLOSE (<xref target="sec_fastclose"/>)
message, or an implementation-specific, connection-level send timeout. The purpo
se of the Data FIN and the interactions between this flag, the subflow-level FIN
flag, and the data sequence mapping are described in <xref target="sec_close"/>
.
The remaining reserved bits MUST be set to zero by an implementation of
this specification.</t>
<t>Note that the checksum is only present in this option if the use of M
PTCP checksumming has been negotiated at the MP_CAPABLE handshake (see <xref tar
get="sec_init"/>). The presence of the checksum can be inferred from the length
of the option. If a checksum is present, but its use had not been negotiated in
the MP_CAPABLE handshake, the receiver MUST close the subflow with a RST as it is n
ot behaving as negotiated. If a checksum is not present when its use has been ne
gotiated, the receiver MUST close the subflow with a RST as it is considered bro
ken. In both cases, this RST SHOULD be accompanied with a MP_TCPRST option (<xre
f target="sec_reset"/>) with the reason code for a "MPTCP specific error".</t>
<section title="Data Sequence Mapping" anchor="sec_dsn">
<t>The data stream as a whole can be reassembled through the use of th
e data sequence mapping components of the DSS option (<xref target="tcpm_dsn"/>)
, which define the
mapping from the subflow sequence number to the data sequence number. This is us
ed by the receiver to ensure in-order delivery to the application layer. Meanwhi
le, the subflow-level sequence numbers (i.e., the regular sequence numbers in th
e TCP header) have subflow-only relevance. It is expected (but not mandated) tha
t SACK <xref target='RFC2018'/> is used at the subflow level to improve efficien
cy.</t>
<t>The data sequence mapping specifies a mapping from subflow sequence s
pace to data sequence space. This is expressed in terms of starting sequence num
bers for the subflow and the data level, and a length of bytes for which this ma
pping is valid.
This explicit mapping for a range of data was chosen rather than per-packet sign
aling to assist with compatibility with situations where TCP/IP segmentation or
coalescing is undertaken separately from the stack that is generating the data f
low (e.g., through the use of TCP segmentation offloading on network interface c
ards, or by middleboxes such as performance enhancing proxies). It also allows a
single mapping to cover many packets, which may be useful in bulk transfer situ
ations.</t>
<t>A mapping is fixed, in that the subflow sequence number is bound to t
he data sequence number after the mapping has been processed. A sender MUST NOT
change this mapping
after it has been declared; however, the same data sequence number can be mapped
to by different subflows for retransmission purposes (see <xref target="sec_ret
ransmit"/>). This would also permit the same data to be sent simultaneously on m
ultiple subflows for resilience or efficiency purposes, especially in the case o
f lossy links. Although the detailed specification of such operation is outside
the scope of this document, an implementation SHOULD treat the first data that i
s received at a subflow for the data sequence space as that which should be deli
vered to the application, and any later data for that sequence space SHOULD be i
gnored.</t>
<t>The data sequence number is specified as an absolute value, whereas t
he subflow sequence numbering is relative (the SYN at the start of the subflow h
as relative subflow sequence number 0). This is to allow middleboxes to change t
he initial sequence number of a subflow, such as firewalls that undertake Initia
l Sequence Number (ISN) randomization.</t>
<t>The data sequence mapping also contains a checksum of the data that t
his mapping covers, if use of checksums has been negotiated at the MP_CAPABLE ex
change. Checksums are used to detect if the payload has been adjusted in any way
by a non-MPTCP-aware middlebox. If this checksum fails, it will trigger a failu
re of the subflow, or a fallback to regular TCP, as documented in <xref target="
sec_fallback"/>, since MPTCP can no longer reliably know the subflow sequence sp
ace at the receiver to build data sequence mappings. Without checksumming enable
d, corrupt data may be delivered to the application if a middlebox alters segmen
t boundaries, alters content, or does not deliver all segments covered by a data
sequence mapping. It is therefore RECOMMENDED to use checksumming unless it is
known the network path contains no such devices.</t>
<t>The checksum algorithm used is the standard TCP checksum <xref target
="RFC0793"/>, operating over the data covered by this mapping, along with a pseu
do-header as shown in <xref target="fig_pseudo"/>.</t>
<?rfc needLines='18'?> The flags "a" and "m" only have meaning if the corresponding "A" or "M"
<figure align="center" anchor="fig_pseudo" title="Pseudo-Header for DSS flags are set; otherwise, they will be ignored. The maximum length of this optio
Checksum"> n, with all flags set, is 28 octets.</t>
<artwork align="left"><![CDATA[ <t>The "F" flag indicates "Data FIN". If present, this means that this
mapping covers the final data from the sender. This is the
connection-level equivalent of the FIN flag in single-path TCP. A connec
tion is not closed unless there has been a Data FIN exchange, an MP_FASTCLOSE (<
xref target="sec_fastclose" format="default"/>) message, or an implementation-sp
ecific connection-level send timeout. The purpose of the Data FIN and the intera
ctions between this flag, the subflow-level FIN flag, and the Data Sequence Mapp
ing are described in <xref target="sec_close" format="default"/>.
The remaining reserved bits <bcp14>MUST</bcp14> be set to 0 by an implem
entation of this specification.</t>
<t>Note that the checksum is only present in this option if the use of
MPTCP checksumming has been negotiated at the MP_CAPABLE handshake
(see <xref target="sec_init" format="default"/>). The presence of the
checksum can be inferred from the length of the option. If a checksum
is present but its use had not been negotiated in the MP_CAPABLE
handshake, the receiver <bcp14>MUST</bcp14> close the subflow with a
RST, as it is not behaving as negotiated. If a checksum is not present w
hen its use has been negotiated, the receiver <bcp14>MUST</bcp14> close the subf
low with a RST, as it is considered broken. In both cases, this RST <bcp14>SHOUL
D</bcp14> be accompanied by an MP_TCPRST option (<xref target="sec_reset" format
="default"/>) with the reason code for an "MPTCP-specific error".</t>
<section anchor="sec_dsn" numbered="true" toc="default">
<name>Data Sequence Mapping</name>
<t>The data stream as a whole can be reassembled through the use of th
e Data Sequence Mapping components of the DSS option (<xref target="tcpm_dsn" fo
rmat="default"/>), which define the
mapping from the subflow sequence number to the data sequence number. This is
used by the receiver to ensure in-order delivery to the application
layer. Meanwhile, the subflow-level sequence numbers (i.e., the
regular sequence numbers in the TCP header) are only relevant to the s
ubflow. It is expected (but not mandated) that SACK <xref
target="RFC2018" format="default"/> will be used at the subflow level
to improve efficiency.</t>
<t>The Data Sequence Mapping specifies a mapping from the subflow
sequence space to the data sequence space. This is expressed in terms
of starting sequence numbers for the subflow and the data level, and a length of
bytes for which this mapping is valid.
This explicit mapping for a range of data, rather than per&#8209;packet signalin
g, was chosen to assist with compatibility with
situations where TCP/IP segmentation or coalescing is undertaken
separately from the stack that is generating the data flow (e.g.,
through the use of TCP segmentation offloading on network interface
cards, or by middleboxes such as Performance Enhancing Proxies
(PEPs) <xref target="RFC3135" format="default"/>). It
also allows a single mapping to cover many packets; this may be useful
in bulk&#8209;transfer situations.</t>
<t>A mapping is fixed, in that the subflow sequence number is bound to
the data sequence number after the mapping has been processed. A sender <bcp14>
MUST NOT</bcp14> change this mapping
after it has been declared; however, the same data sequence number can be
mapped to by different subflows for retransmission purposes (see
<xref target="sec_retransmit" format="default"/>). This would also
permit the same data to be sent simultaneously on multiple subflows
for resilience or efficiency purposes, especially in the case of
lossy links. Although the detailed specification of such operation
is outside the scope of this document, an implementation
<bcp14>SHOULD</bcp14> treat the first data that is received at a
subflow for the data sequence space as the data that should be deliver
ed to the application, and any subsequent data for that sequence space <bcp14>SH
OULD</bcp14> be ignored.</t>
<t>The data sequence number is specified as an absolute value,
whereas the subflow sequence numbering is relative (the SYN at the
start of the subflow has a relative subflow sequence number of
0). This is done to allow middleboxes to change the Initial Sequence
Number (ISN) of a subflow, such as firewalls that undertake ISN random
ization.</t>
<t>The Data Sequence Mapping also contains a checksum of the data
that this mapping covers, if the use of checksums has been negotiated
at
the MP_CAPABLE exchange. Checksums are used to detect if the payload
has been adjusted in any way by a non-MPTCP-aware middlebox. If this
checksum fails, it will trigger a failure of the subflow, or a
fallback to regular TCP, as documented in <xref
target="sec_fallback" format="default"/>, since MPTCP can no longer
reliably know the subflow sequence space at the receiver to build
Data Sequence Mappings. Without checksumming enabled, corrupt data
may be delivered to the application if a middlebox alters segment
boundaries, alters content, or does not deliver all segments covered
by a Data Sequence Mapping. It is therefore
<bcp14>RECOMMENDED</bcp14> that checksumming be used, unless it is kno
wn
that the network path contains no such devices.</t>
<t>The checksum algorithm used is the standard TCP checksum <xref targ
et="RFC0793" format="default"/>, operating over the data covered by this mapping
, along with a pseudo&#8209;header as shown in <xref target="fig_pseudo" format=
"default"/>.</t>
<figure anchor="fig_pseudo">
<name>Pseudo-Header for DSS Checksum</name>
<artwork align="left" name="" type="" alt=""><![CDATA[
1 2 3 1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+--------------------------------------------------------------+ +--------------------------------------------------------------+
| | | |
| Data Sequence Number (8 octets) | | Data Sequence Number (8 octets) |
| | | |
+--------------------------------------------------------------+ +--------------------------------------------------------------+
| Subflow Sequence Number (4 octets) | | Subflow Sequence Number (4 octets) |
+-------------------------------+------------------------------+ +-------------------------------+------------------------------+
| Data-Level Length (2 octets) | Zeros (2 octets) | | Data-Level Length (2 octets) | Zeros (2 octets) |
+-------------------------------+------------------------------+ +-------------------------------+------------------------------+ ]]></artwork>
]]></artwork> </figure>
</figure> <t>Note that the data sequence number used in the pseudo-header is alw
ays the 64-bit value, irrespective of what length is used in the DSS option itse
<t>Note that the data sequence number used in the pseudo-header is alway lf. The standard TCP checksum algorithm has been chosen, since it will be calcul
s the 64-bit value, irrespective of what length is used in the DSS option itself ated anyway for the TCP subflow, and if calculated first over the data before ad
. The standard TCP checksum algorithm has been chosen since it will be calculate ding the pseudo-headers, it only needs to be calculated once. Furthermore, since
d anyway for the TCP subflow, and if calculated first over the data before addin the TCP checksum is additive, the checksum for a DSN_MAP can be constructed by
g the pseudo-headers, it only needs to be calculated once. Furthermore, since th simply adding together the checksums for the data of each constituent TCP segmen
e TCP checksum is additive, the checksum for a DSN_MAP can be constructed by sim t and adding the checksum for the DSS pseudo&#8209;header.</t>
ply adding together the checksums for the data of each constituent TCP segment, <t>Note that checksumming relies on the TCP subflow containing contigu
and adding the checksum for the DSS pseudo-header.</t> ous data; therefore, a TCP subflow <bcp14>MUST NOT</bcp14> use the Urgent Pointe
r to interrupt an existing mapping. Further note, however, that if Urgent data i
<t>Note that checksumming relies on the TCP subflow containing contiguou s received on a subflow, it <bcp14>SHOULD</bcp14> be mapped to the data sequence
s data; therefore, a TCP subflow MUST NOT use the Urgent Pointer to interrupt an space and delivered to the application, analogous to Urgent data in regular TCP
existing mapping. Further note, however, that if Urgent data is received on a s .</t>
ubflow, it SHOULD be mapped to the data sequence space and delivered to the appl <t>To avoid possible deadlock scenarios, subflow-level
ication analogous to Urgent data in regular TCP.</t> processing should be undertaken separately from processing at the
<t>To avoid possible deadlock scenarios, subflow-level
processing should be undertaken separately from that at
connection level. Therefore, even if a mapping does not exist connection level. Therefore, even if a mapping does not exist
from the subflow space to the data-level space, the data from the subflow space to the data&#8209;level space, the data
SHOULD still be ACKed at the subflow (if it is in-window). <bcp14>SHOULD</bcp14> still be ACKed at the subflow (if it is in-window)
.
This data cannot, however, be acknowledged at the data level This data cannot, however, be acknowledged at the data level
(<xref target="sec_dataack"/>) because its data sequence (<xref target="sec_dataack" format="default"/>) because its data sequenc
numbers are unknown. Implementations MAY hold onto such e
unmapped data for a short while in the expectation that a numbers are unknown. Implementations <bcp14>MAY</bcp14> hold onto such
unmapped data for a short while, in the expectation that a
mapping will arrive shortly. Such unmapped data cannot be mapping will arrive shortly. Such unmapped data cannot be
counted as being within the connection level receive window because this is counted as being within the connection-level receive window because this is
relative to the data sequence numbers, so if the receiver runs relative to the data sequence numbers, so if the receiver runs
out of memory to hold this data, it will have to be discarded. out of memory to hold this data, it will have to be discarded.
If a mapping for that subflow-level sequence space does not If a mapping for that subflow-level sequence space does not
arrive within a receive window of data, that subflow SHOULD be arrive within a receive window of data, that subflow <bcp14>SHOULD</bcp14> be
treated as broken, closed with a RST, and any unmapped data treated as broken, closed with a RST, and any unmapped data
silently discarded.</t> silently discarded.</t>
<t>Data sequence numbers are always 64-bit quantities and
<t>Data sequence numbers are always 64-bit quantities, and <bcp14>MUST</bcp14> be maintained as such in implementations. If a
MUST be maintained as such in implementations. If a
connection is progressing at a slow rate, so protection connection is progressing at a slow rate, so protection
against wrapped sequence numbers is not required, against wrapped sequence numbers is not required,
then an implementation MAY include just the lower 32 then an implementation <bcp14>MAY</bcp14> include just the lower 32
bits of the data sequence number in the data sequence mapping and/or bits of the data sequence number in the Data Sequence Mapping and&wj;/or
Data ACK as an optimization, and an implementation can make this choice Data ACK as an optimization, and an implementation can make this choice
independently for each packet. An implementation MUST be able to receive independently for each packet. An implementation <bcp14>MUST</bcp14> be
and process both 64-bit and 32-bit sequence number values, but it is not able to receive
required that an implementation is able to send both.</t> and process both 64-bit and 32-bit sequence number values, but it is not
required that an implementation be able to send both.</t>
<t>An implementation MUST send the full 64-bit data sequence number <t>An implementation <bcp14>MUST</bcp14> send the full 64-bit data seq
uence number
if it is transmitting at a sufficiently high rate that the 32-bit value if it is transmitting at a sufficiently high rate that the 32-bit value
could wrap within the Maximum Segment Lifetime could wrap within the Maximum Segment Lifetime
(MSL) <xref target="RFC7323"/>. The lengths of the DSNs used in these (MSL) <xref target="RFC7323" format="default"/>. The lengths of the DSNs used in these
values (which may be different) are declared with flags in the values (which may be different) are declared with flags in the
DSS option. Implementations MUST accept a 32-bit DSN and implicitly DSS option. Implementations <bcp14>MUST</bcp14> accept a 32-bit DSN and implicitly
promote it to a 64-bit quantity by incrementing the upper 32 promote it to a 64-bit quantity by incrementing the upper 32
bits of sequence number each time the lower 32 bits of the sequence number each time the lower 32
bits wrap. A sanity check MUST be implemented to ensure that bits wrap. A sanity check <bcp14>MUST</bcp14> be implemented to ensure t
hat
a wrap occurs at an expected time (e.g., the sequence number jumps a wrap occurs at an expected time (e.g., the sequence number jumps
from a very high number to a very low number) and is not triggered from a very high number to a very low number) and is not triggered
by out-of-order packets.</t> by out&#8209;of-order packets.</t>
<t>As with the standard TCP sequence number, the data sequence
<t>As with the standard TCP sequence number, the data sequence
number should not start at zero, but at a random value to make number should not start at zero, but at a random value to make
blind session hijacking harder. This specification requires blind session hijacking harder. This specification requires
setting the initial data sequence number (IDSN) of each host to the setting the IDSN of each host to the
least significant 64 bits of the SHA-256 hash of the host's key, as least significant 64&nbsp;bits of the SHA-256 hash of the host's key, as
described in <xref target="sec_init"/>. This is required also in described in <xref target="sec_init" format="default"/>. This is also re
order for the receiver to know what the expected IDSN is, and thus quired in
order for the receiver to know what the expected IDSN is and thus
determine if any initial connection-level packets are missing; this determine if any initial connection-level packets are missing; this
is particularly relevant if two subflows start transmitting simultaneously.</t> is particularly relevant if two subflows start transmitting simultaneously.</t>
<t>A Data Sequence Mapping does not need to be included in
<t>A data sequence mapping does not need to be included in
every MPTCP packet, as long as the subflow sequence space in every MPTCP packet, as long as the subflow sequence space in
that packet is covered by a mapping known at the receiver. This that packet is covered by a mapping known at the receiver. This
can be used to reduce overhead in cases where the mapping is can be used to reduce overhead in cases where the mapping is
known in advance; one such case is when there is a single known in advance. One such case is when there is a single
subflow between the hosts, another is when segments of subflow between the hosts, and another is when segments of
data are scheduled in larger than packet-sized chunks.</t> data are scheduled in larger-than-packet-sized chunks.</t>
<t>An "infinite" mapping can be used to fall back to regular TCP by
<t>An "infinite" mapping can be used to fall back to regular TCP by
mapping the subflow-level data to the connection-level data mapping the subflow-level data to the connection-level data
for the remainder of the connection (see for the remainder of the connection (see
<xref target="sec_fallback"/>). This is achieved by setting <xref target="sec_fallback" format="default"/>). This is achieved by setting
the Data-Level Length field of the DSS option to the reserved value of 0. The the Data-Level Length field of the DSS option to the reserved value of 0. The
checksum, in such a case, will also be set to zero.</t> checksum, in such a case, will also be set to 0.</t>
</section> </section>
<section anchor="sec_dataack" numbered="true" toc="default">
<section title="Data Acknowledgments" anchor="sec_dataack"> <name>Data Acknowledgments</name>
<t>To provide full end-to-end resilience, MPTCP provides a <t>To provide full end-to-end resilience, MPTCP provides a
connection-level acknowledgment, to act as a cumulative ACK for connection-level acknowledgment, to act as a cumulative ACK for
the connection as a whole. This is the "Data ACK" field of the connection as a whole. This is done via the "Data ACK" field of
the DSS option (<xref target="tcpm_dsn"/>). The Data ACK the DSS option (<xref target="tcpm_dsn" format="default"/>). The Data AC
K
is analogous to the behavior is analogous to the behavior
of the standard TCP cumulative ACK -- indicating of the standard TCP cumulative ACK -- indicating
how much data has been successfully received (with no how much data has been successfully received (with no
holes). This is in comparison to the subflow-level ACK, which holes). This can be compared to the subflow-level ACK, which
acts analogous to TCP SACK, given that there may still be acts in a fashion analogous to TCP SACK, given that there may still be
holes in the data stream at the connection level. holes in the data stream at the connection level.
The Data ACK specifies the next data sequence number The Data ACK specifies the next data sequence number
it expects to receive.</t> it expects to receive.</t>
<t>The Data ACK, as for the DSN, can be sent as the full 64-bit
<t>The Data ACK, as for the DSN, can be sent as the full 64-bit value or as the lower 32 bits. If data is received with a 64-bit DSN,
value, or as the lower 32 bits. If data is received with a 64-bit DSN, it <bcp14>MUST</bcp14> be acknowledged with a 64-bit Data ACK. If the D
it MUST be acknowledged with a 64-bit Data ACK. If the DSN received SN received
is 32 bits, an implementation can choose whether to send a 32-bit or is 32&nbsp;bits, an implementation can choose whether to send a 32-bit o
64-bit Data ACK, and an implementation MUST accept either in this situat r
ion.</t> 64-bit Data ACK, and an implementation <bcp14>MUST</bcp14> accept either
in this situation.</t>
<t>The Data ACK proves that the data, and all required MPTCP <t>The Data ACK proves that the data, and all required MPTCP
signaling, has been received and accepted by the remote end. signaling, have been received and accepted by the remote end.
One key use of the Data ACK signal is that it is used to indicate One key use of the Data ACK signal is that it is used to indicate
the left edge of the advertised receive window. As explained in the left edge of the advertised receive window. As explained in
<xref target="sec_rwin"/>, the receive window is shared by all <xref target="sec_rwin" format="default"/>, the receive window is shared by all
subflows and is relative to the Data ACK. Because of this, an subflows and is relative to the Data ACK. Because of this, an
implementation MUST NOT use the RCV.WND field of a TCP segment implementation <bcp14>MUST NOT</bcp14> use the RCV.WND field of a TCP segment
at the connection level if it does not also carry a DSS option with at the connection level if it does not also carry a DSS option with
a Data ACK field. Furthermore, a Data ACK field. Furthermore,
separating the connection-level acknowledgments from the separating the connection-level acknowledgments from the
subflow level allows processing to be done separately, and subflow level allows processing to be done separately, and
a receiver has the freedom to drop segments after acknowledgment a receiver has the freedom to drop segments after acknowledgment
at the subflow level, for example, due to memory constraints at the subflow level -- for example, due to memory constraints
when many segments arrive out of order.</t> when many segments arrive out of order.</t>
<t>An MPTCP sender <bcp14>MUST NOT</bcp14> free data from the send buf
<t>An MPTCP sender MUST NOT free data from the send buffer until fer until
it has been acknowledged by both a Data ACK received on any subflow it has been acknowledged by both a Data ACK received on any subflow
and at the subflow level by all subflows on which the data was sent. and at the subflow level by all subflows on which the data was sent.
The former condition ensures liveness of the The former condition ensures liveness of the
connection and the latter condition ensures liveness and connection, and the latter condition ensures liveness and
self-consistence of a subflow when data needs to be self-consistence of a subflow when data needs to be
retransmitted. retransmitted.
Note, however, that if some data needs to be retransmitted multiple Note, however, that if some data needs to be retransmitted multiple
times over a subflow, there is a risk of blocking the sending times over a subflow, there is a risk of blocking the send
window. In this case, the MPTCP sender can decide to terminate the window. In this case, the MPTCP sender can decide to terminate the
subflow that is behaving badly by sending a RST, using an appropriate subflow that is behaving badly by sending a RST, using an appropriate
MP_TCPRST (<xref target="sec_reset"/>) error code.</t> MP_TCPRST (<xref target="sec_reset" format="default"/>) error code.</t>
<t>The Data ACK <bcp14>MAY</bcp14> be included in all segments; howeve
<t>The Data ACK MAY be included in all segments; however, optimizations r, optimizations
SHOULD be considered in more advanced implementations, where the <bcp14>SHOULD</bcp14> be considered in more advanced implementations, wh
ere the
Data ACK is present in segments Data ACK is present in segments
only when the Data ACK value advances, and this behavior MUST only when the Data ACK value advances, and this behavior <bcp14>MUST</bc
be treated as valid. This behavior ensures the sender buffer p14>
be treated as valid. This behavior ensures that the send buffer
is freed, while reducing overhead when the data transfer is is freed, while reducing overhead when the data transfer is
unidirectional.</t> unidirectional.</t>
</section> </section>
<section anchor="sec_close" numbered="true" toc="default">
<section title="Closing a Connection" anchor="sec_close"> <name>Closing a Connection</name>
<t>In regular TCP, a FIN announces to the receiver that the sender has no m <t>In regular TCP, a FIN announces to the receiver that the sender has
ore data to send. no more data to send.
In order to allow subflows to operate independently and to keep the appearance of TCP over the wire, In order to allow subflows to operate independently and to keep the appearance of TCP over the wire,
a FIN in MPTCP only affects the subflow on which it is sent. This a FIN in MPTCP only affects the subflow on which it is sent. This
allows nodes to exercise considerable freedom over which paths are in use at any one time. allows nodes to exercise considerable freedom over which paths are in use at any one time.
The semantics of a FIN remain as for regular TCP; i.e., it is not until both sides have ACKed The semantics of a FIN remain as for regular TCP; i.e., it is not until both sides have ACKed
each other's FINs that the subflow is fully closed.</t> each other's FINs that the subflow is fully closed.</t>
<t>When an application calls close() on a socket, this indicates that it has no more <t>When an application calls close() on a socket, this indicates that it has no more
data to send; for regular TCP, this would result in a FIN on the connection. For MPTCP, an data to send; for regular TCP, this would result in a FIN on the connection. For MPTCP, an
equivalent mechanism is needed, and this is referred to as the DATA_FIN.</t> equivalent mechanism is needed; this is referred to as the DATA_FIN.</t>
<t>A DATA_FIN is an indication that the sender has no more data to sen
<t>A DATA_FIN is an indication that the sender has no more data to send, d, and
and as such it can be used to verify that all data has been successfully rec
as such can be used to verify that all data has been successfully receiv eived. A DATA_FIN,
ed. A DATA_FIN,
as with the FIN on a regular TCP connection, is a unidirectional signal. </t> as with the FIN on a regular TCP connection, is a unidirectional signal. </t>
<t>The DATA_FIN is signaled by setting the "F" flag in the DSS
<t>The DATA_FIN is signaled by setting the 'F' flag in the Data Sequence option (<xref target="tcpm_dsn" format="default"/>)
Signal option (<xref target="tcpm_dsn"/>) to 1. A DATA_FIN occupies 1 octet (th to 1. A DATA_FIN occupies 1 octet (the final octet) of the
e final octet) of the connection-level sequence space. Note that the DATA_FIN is connection-level sequence space. Note that the
included in the Data-Level Length, but not at the subflow level: for example, a DATA_FIN is included in the Data-Level Length but not at the subflow
segment with DSN 80, and Data-Level Length 11, with DATA_FIN set, would map 10 level: for example, a segment with a DSN value of 80 and a
octets from the subflow into data sequence space 80-89, the DATA_FIN is DSN 90; Data-Level Length of 11, with DATA_FIN set, would map 10 octets from
therefore, this segment including DATA_FIN would be acknowledged with a DATA_ACK the subflow into data sequence space 80-89, and the DATA_FIN would
of 91.</t> be DSN 90; therefore, this segment, including DATA_FIN, would be
acknowledged with a DATA_ACK of&nbsp;91.</t>
<t>Note that when the DATA_FIN is not attached to a TCP segment containi <t>Note that when the DATA_FIN is not attached to a TCP segment contai
ng data, the Data Sequence Signal MUST have a subflow sequence number of 0, a Da ning data, the DSS <bcp14>MUST</bcp14> have a subflow sequence number of 0, a Da
ta-Level Length of 1, and the data sequence number that corresponds with the DAT ta-Level Length of 1, and the data sequence number that corresponds with the DAT
A_FIN itself. The checksum in this case will only cover the pseudo-header.</t> A_FIN itself. The checksum in this case will only cover the pseudo-header.</t>
<t>A DATA_FIN has the same semantics and behavior as a regular TCP FIN
<t>A DATA_FIN has the semantics and behavior as a regular TCP FIN, but a , but at the connection level. Notably, it is only DATA_ACKed once all data has
t the connection level. Notably, it is only DATA_ACKed once all data has been su been successfully received at the connection level. Note, therefore, that a DATA
ccessfully received at the connection level. Note, therefore, that a DATA_FIN is _FIN is decoupled from a subflow FIN. It is only permissible to combine these si
decoupled from a subflow FIN. It is only permissible to combine these signals o gnals on one subflow if there is no data outstanding on other subflows. Otherwis
n one subflow if there is no data outstanding on other subflows. Otherwise, it m e, it may be necessary to retransmit data on different subflows. Essentially, a
ay be necessary to retransmit data on different subflows. Essentially, a host MU host <bcp14>MUST NOT</bcp14> close all functioning subflows unless it is safe to
ST NOT close all functioning subflows unless it is safe to do so, i.e., until al do so, i.e., until all outstanding data has been DATA_ACKed or until the segmen
l outstanding data has been DATA_ACKed, or until the segment with the DATA_FIN f t with the DATA_FIN flag set is the only outstanding segment.</t>
lag set is the only outstanding segment.</t> <t>Once a DATA_FIN has been acknowledged, all remaining subflows
<bcp14>MUST</bcp14> be closed with standard FIN exchanges. Both
<t>Once a DATA_FIN has been acknowledged, all remaining subflows MUST be hosts <bcp14>SHOULD</bcp14> send FINs on all subflows, as a courtesy,
closed with standard FIN exchanges. Both hosts SHOULD send FINs on all subflows to allow middleboxes to clean up state even if an individual subflow
, as a courtesy to allow middleboxes to clean up state even if an individual sub has failed. Reducing the timeouts (MSL) on subflows at end hosts after
flow has failed. It is also encouraged to reduce the timeouts (Maximum Segment L receiving a
ifetime) on subflows at end hosts after receiving a DATA_FIN. In particular, any DATA_FIN is also encouraged. In particular, any subflows where there i
subflows where there is still outstanding data queued (which has been retransmi s still
tted on other subflows in order to get the DATA_FIN acknowledged) MAY be closed outstanding data queued (which has been retransmitted on other
with a RST with MP_TCPRST (<xref target="sec_reset"/>) error code for "too much subflows in order to get the DATA_FIN acknowledged)
outstanding data".</t> <bcp14>MAY</bcp14> be closed with a RST with an MP_TCPRST (<xref targe
t="sec_reset" format="default"/>) error code for "too much outstanding data".</t
<t>A connection is considered closed once both hosts' DATA_FINs have bee >
n acknowledged by DATA_ACKs.</t> <t>A connection is considered closed once both hosts' DATA_FINs have b
een acknowledged by DATA_ACKs.</t>
<t>As specified above, a standard TCP FIN on an individual subflow only <t>As specified above, a standard TCP FIN on an individual subflow
shuts down the subflow on which it was sent. If all subflows have been closed wi only shuts down the subflow on which it was sent. If all subflows
th a FIN exchange, but no DATA_FIN has been received and acknowledged, the MPTCP have been closed with a FIN exchange but no DATA_FIN has been
connection is treated as closed only after a timeout. This implies that an impl received and acknowledged, the MPTCP connection is treated as closed
ementation will have TIME_WAIT states at both the subflow and connection levels only after a timeout. This implies that an implementation will have
(see <xref target="app_fsm"/>). This permits "break-before-make" scenarios where TIME_WAIT states at both the subflow level and the connection level (s
connectivity is lost on all subflows before a new one can be re-established.</t ee <xref target="app_fsm" format="default"/>). This permits "break-before-make"
> scenarios where connectivity is lost on all subflows before a new one can be re&
</section> #8209;established.</t>
</section>
<section title="Receiver Considerations" anchor="sec_rwin"> <section anchor="sec_rwin" numbered="true" toc="default">
<name>Receiver Considerations</name>
<t>Regular TCP advertises a receive window in each packet, telling the sender how much data the receiver <t>Regular TCP advertises a receive window in each packet, telling the sender how much data the receiver
is willing to accept past the cumulative ack. The receive window is used to implement flow control, throttling is willing to accept past the cumulative ACK. The receive window is used to implement flow control, throttling
down fast senders when receivers cannot keep up. </t> down fast senders when receivers cannot keep up. </t>
<t>MPTCP also uses a unique receive window, shared between the subflows. The idea is to allow any <t>MPTCP also uses a unique receive window, shared between the subflows. The idea is to allow any
subflow to send data as long as the receiver is willing to accept it. The altern subflow to send data as long as the receiver is willing to accept it. The
ative, maintaining per subflow alternative -- maintaining per-subflow
receive windows, could end up stalling some subflows while others would not use receive windows -- could end up stalling some subflows while others would not us
up their window.</t> e up their window.</t>
<t>The receive window is relative to the DATA_ACK. As in TCP, a receiv
<t>The receive window is relative to the DATA_ACK. As in TCP, a receiv er <bcp14>MUST NOT</bcp14> shrink the right edge of the receive window (i.e., DA
er MUST NOT shrink the right edge of the receive window (i.e., DATA_ACK + receiv TA_ACK + receive window). The receiver will
e window). The receiver will
use the data sequence number to tell if a packet should be accepted at the connection level.</t> use the data sequence number to tell if a packet should be accepted at the connection level.</t>
<t>When deciding to accept packets at the subflow level, regular TCP c
<t>When deciding to accept packets at subflow level, regular TCP check hecks
s
the sequence number in the packet against the allowed receive window. the sequence number in the packet against the allowed receive window.
With multipath, such a check is done using only the connection-level window. A s With MPTCP, such a check is done using only the connection-level window. A sanit
anity y
check SHOULD be performed at subflow level to ensure that the subflow and mapped check <bcp14>SHOULD</bcp14> be performed at the subflow level to ensure that the
sequence subflow and mapped sequence
numbers meet the following test: SSN - SUBFLOW_ACK &lt;= DSN - DATA_ACK, where SSN is the subflow sequence number of the received packet and SUBFLOW_ACK is the RCV.NXT (next expected sequence number) of the subflow (with the equivalent connection-level definitions for DSN and DATA_ACK).</t> numbers meet the following test: SSN - SUBFLOW_ACK &lt;= DSN - DATA_ACK, where SSN is the subflow sequence number of the received packet and SUBFLOW_ACK is the RCV.NXT (next expected sequence number) of the subflow (with the equivalent connection-level definitions for DSN and DATA_ACK).</t>
<t>In regular TCP, once a segment is deemed in-window, it is put in ei
<t>In regular TCP, once a segment is deemed in-window, it is put either ther
in the in-order receive queue or in the out-of-order queue. the in-order receive queue or the out-of-order queue.
In Multipath TCP, the same happens but at the connection level: a segment In Multipath TCP, the same thing happens, but at the connection level: a segment
is placed in the connection level in-order or out-of-order queue if is placed in the connection-level in-order or out-of-order queue if
it is in-window at both connection and subflow levels. it is in-window at both the connection level and the subflow level.
The stack still has to remember, for each subflow, which segments were The stack still has to remember, for each subflow, which segments were
received successfully so that it can ACK them at subflow level appropriately. received successfully so that it can ACK them at the subflow level appropriately
Typically, this will be implemented by keeping per subflow out-of-order .
queues (containing only message headers, not the payloads) and remembering Typically, this will be implemented by keeping per-subflow out-of-order
queues (containing only message headers -- not the payloads) and remembering
the value of the cumulative ACK. the value of the cumulative ACK.
</t> </t>
<t>It is important for implementers to understand how large <t>It is important for implementers to understand how large
a receiver buffer is appropriate. The lower bound for full a receive buffer is appropriate. The lower bound for full
network utilization is the maximum bandwidth-delay product network utilization is the maximum bandwidth-delay product
of any one of the paths. However, this might be insufficient of any one of the paths. However, this might be insufficient
when a packet is lost on a slower subflow and needs to be when a packet is lost on a slower subflow and needs to be
retransmitted (see <xref target="sec_retransmit"/>). A tight retransmitted (see <xref target="sec_retransmit" format="default"/>). A tight
upper bound would be the maximum round-trip time (RTT) of any path multiplied upper bound would be the maximum round-trip time (RTT) of any path multiplied
by the total bandwidth available across all paths. This by the total bandwidth available across all paths. This
permits all subflows to continue at full speed while a permits all subflows to continue at full speed while a
packet is fast-retransmitted on the maximum RTT path. Even packet is fast-retransmitted on the maximum RTT path. Even
this might be insufficient to maintain full performance in this might be insufficient to maintain full performance in
the event of a retransmit timeout on the maximum RTT path. the event of a retransmit timeout on the maximum RTT path.
It is for future study to determine the relationship between Determining the relationship between
retransmission strategies and receive buffer sizing.</t> retransmission strategies and receive buffer sizing is left for future
study.</t>
</section> </section>
<section anchor="sec_sender" numbered="true" toc="default">
<section title="Sender Considerations" anchor="sec_sender"> <name>Sender Considerations</name>
<t>The sender remembers receiver window advertisements from the receiv <t>The sender remembers receive window advertisements from the
er. It should only update its local receive window values when the largest seque receiver. It should only update its local receive window values when
nce number allowed (i.e., DATA_ACK + receive window) increases, on the receipt o the largest sequence number allowed (i.e., DATA_ACK + receive
f a DATA_ACK. This is important to allow using paths with different RTTs, and th window) increases on the receipt of a DATA_ACK. This is important
us different feedback loops. </t> for allowing the use of paths with different RTTs and thus different f
eedback loops. </t>
<t>MPTCP uses a single receive window across all subflows, and if the <t>MPTCP uses a single receive window across all subflows, and if
receive window was guaranteed to be unchanged end-to-end, a host could always re the receive window was guaranteed to be unchanged end to end, a host c
ad the most recent receive window value. However, some classes of middleboxes ma ould always read the most recent receive window value. However, some classes of
y alter the TCP-level receive window. Typically, these will middleboxes may alter the TCP-level receive window. Typically, these will
shrink the offered window, although for short periods of time it may be possible for the window to be larger (however, shrink the offered window, although for short periods of time it may be possible for the window to be larger (however,
note that this would not continue for long periods since ultimately the middlebox must keep up with note that this would not continue for long periods, since ultimately the middlebox must keep up with
delivering data to the receiver). Therefore, if receive window sizes differ on multiple subflows, delivering data to the receiver). Therefore, if receive window sizes differ on multiple subflows,
when sending data MPTCP SHOULD take the largest of the most recent window sizes as the one to use in calculations. when sending data MPTCP <bcp14>SHOULD</bcp14> take the largest of the most recent window sizes as the one to use in calculations.
This rule is implicit in the requirement not to reduce the right edge of the window.</t> This rule is implicit in the requirement not to reduce the right edge of the window.</t>
<t>The sender <bcp14>MUST</bcp14> also remember the receive windows ad
<t>The sender MUST also remember the receive windows advertised by eac vertised by each subflow.
h subflow.
The allowed window for subflow i is (ack_i, ack_i + rcv_wnd_i), where ack_i is the The allowed window for subflow i is (ack_i, ack_i + rcv_wnd_i), where ack_i is the
subflow-level cumulative ACK of subflow i. This ensures data will not be sent to a middlebox subflow-level cumulative ACK of subflow i. This ensures that data will not be sent to a middlebox
unless there is enough buffering for the data. </t> unless there is enough buffering for the data. </t>
<t>Putting the two rules together, we get the following: a sender is allowed to send <t>Putting the two rules together, we get the following: a sender is allowed to send
data segments with data-level sequence numbers between (DATA_ACK, DATA_ACK + receive_window). data segments with data-level sequence numbers between (DATA_ACK, DATA_ACK + receive_window).
Each of these segments will be mapped onto subflows, as long as subflow sequence numbers Each of these segments will be mapped onto subflows, as long as subflow sequence numbers
are in the allowed windows for those subflows. Note that subflow sequence numbers do not are in the allowed windows for those subflows. Note that subflow sequence numbers do not
generally affect flow control if the same receive window is advertised across all subflows. generally affect flow control if the same receive window is advertised across all subflows.
They will perform flow control for those subflows with a smaller advertised receive window. They will perform flow control for those subflows with a smaller advertised receive window.
</t> </t>
<t>The send buffer <bcp14>MUST</bcp14>, at a minimum, be as big as the
<t>The send buffer MUST, at a minimum, be as big as the receive buffer receive buffer, to enable the sender to reach maximum throughput.</t>
, to enable the sender to reach maximum throughput.</t>
</section> </section>
<section anchor="sec_retransmit" numbered="true" toc="default">
<section title="Reliability and Retransmissions" anchor="sec_retransmit" <name>Reliability and Retransmissions</name>
> <t>The Data Sequence Mapping allows senders to resend data with the
same data sequence number on a different subflow. When doing this, a
<t>The data sequence mapping allows senders to resend data with the sa host <bcp14>MUST</bcp14> still retransmit the original data on the
me data sequence number on a different subflow. When doing this, a host MUST sti original subflow, in order to preserve the subflow's integrity
ll retransmit the original data on the original subflow, in order to preserve th (middleboxes could replay old data and&wj;/or could reject holes in
e subflow integrity (middleboxes could replay old data, and/or could reject hole subflows), and a receiver will ignore these retransmissions. While
s in subflows), and a receiver will ignore these retransmissions. While this is this is clearly suboptimal, for compatibility reasons this is
clearly suboptimal, for compatibility reasons this is sensible behavior. Optimiz sensible behavior. Optimizations could be negotiated in future
ations could be negotiated in future versions of this protocol. Note also that t versions of this protocol. Note also that this property would also per
his property would also permit a sender to always send the same data, with the s mit a sender to always send the same data, with the same data sequence number, o
ame data sequence number, on multiple subflows, if desired for reliability reaso n multiple subflows, if desired for reliability reasons.</t>
ns.</t>
<t>This protocol specification does not mandate any mechanisms for handling retransmissions, and much will be dependent upon local policy <t>This protocol specification does not mandate any mechanisms for handling retransmissions, and much will be dependent upon local policy
(as discussed in <xref target="sec_policy"/>). One can imagine aggressive connec (as discussed in <xref target="sec_policy" format="default"/>). One can imagine
tion-level retransmissions policies where every packet lost at subflow level is aggressive connection-level retransmission policies where every packet lost at t
retransmitted on he subflow level is retransmitted on
a different subflow (hence, wasting bandwidth but possibly reducing application- a different subflow (hence wasting bandwidth but possibly reducing application-t
to-application delays), or conservative retransmission policies where connection o-application delays) or conservative retransmission policies where connection-l
-level retransmits evel retransmissions
are only used after a few subflow-level retransmission timeouts occur.</t> are only used after a few subflow-level retransmission timeouts occur.</t>
<t>It is envisaged that a standard connection-level retransmission mechanism <t>It is envisaged that a standard connection-level retransmission mechanism
would be implemented around a connection-level data queue: all segments that haven't would be implemented around a connection-level data queue: all segments that haven't
been DATA_ACKed are stored. A timer is set when been DATA_ACKed are stored. A timer is set when
the head of the connection-level is ACKed at subflow level but its corresponding the head of the connection level is ACKed at the subflow level but is not DATA_A
data CKed at the data level. This timer will guard against retransmission failures
is not ACKed at data level. This timer will guard against failures in retransmis
sion
by middleboxes that proactively ACK data.</t> by middleboxes that proactively ACK data.</t>
<t>The sender <bcp14>MUST</bcp14> keep data in its send buffer as
<t>The sender MUST keep data in its send buffer as long as the data ha long as the data has not been acknowledged both (1)&nbsp;at the
s not been acknowledged at both connection level and on all subflows on which it connection level and (2)&nbsp;on all subflows on which it
has been sent. In this way, the sender can always retransmit the data if needed, on the same subflow or on a different one. A special case is when a subflow fails: the sender has been sent. In this way, the sender can always retransmit the data if needed, on the same subflow or on a different one. A special case is when a subflow fails: the sender
will typically resend the data on other working subflows after a timeout, and wi will typically resend the data on other working subflows after a timeout and wil
ll keep trying to retransmit the data l keep trying to retransmit the data
on the failed subflow too. The sender will declare the subflow failed after a pr on the failed subflow too. The sender will declare the subflow failed after a pr
edefined upper bound on retransmissions is reached (which MAY be lower than the edefined upper bound on retransmissions is reached (which <bcp14>MAY</bcp14> be
usual TCP limits of the Maximum Segment Life), or on the receipt of an ICMP erro lower than the usual TCP limits of the MSL) or on the receipt of an ICMP error,
r, and only then delete the outstanding data segments. </t> and only then delete the outstanding data segments. </t>
<t>If multiple retransmissions that indicate that a
<t>If multiple retransmissions are triggered that indicate that a subf subflow is performing badly are triggered, this <bcp14>MAY</bcp14> lea
low performs badly, this MAY lead to a host resetting the subflow with a RST. Ho d to a host resetting the subflow with a RST. However, additional research is re
wever, additional research is required to understand the heuristics of how and w quired to understand the heuristics of how and when to reset underperforming sub
hen to reset underperforming subflows. For example, a highly asymmetric path may flows. For example, a highly asymmetric path may be misdiagnosed as underperform
be misdiagnosed as underperforming. A RST for this purpose SHOULD be accompanie ing. A RST for this purpose <bcp14>SHOULD</bcp14> be accompanied by an "Unaccept
d with an "Unacceptable performance" MP_TCPRST option (<xref target="sec_reset"/ able performance" MP_TCPRST option (<xref target="sec_reset" format="default"/>)
>).</t> .</t>
</section> </section>
<section anchor="sec_cc" numbered="true" toc="default">
<section title="Congestion Control Considerations" anchor="sec_cc"> <name>Congestion Control Considerations</name>
<t>Different subflows in an MPTCP connection have different congestion windows. <t>Different subflows in an MPTCP connection have different congestion windows.
To achieve fairness at bottlenecks and resource pooling, it is necessary to couple the To achieve fairness at bottlenecks and resource pooling, it is necessary to couple the
congestion windows in use on each subflow, in order to push most traffic to uncongested links. congestion windows in use on each subflow, in order to push most traffic to uncongested links.
One algorithm for achieving this is presented in <xref target="RFC6356"/>; One algorithm for achieving this is presented in <xref target="RFC6356" format="default"/>;
the algorithm does not achieve perfect resource pooling but is "safe" in that it is readily the algorithm does not achieve perfect resource pooling but is "safe" in that it is readily
deployable in the current Internet. By this, we mean that it does not take up more capacity deployable in the current Internet. By this we mean that it does not take up more capacity
on any one path than if it was a single path flow using only that route, so this ensures on any one path than if it was a single path flow using only that route, so this ensures
fair coexistence with single-path TCP at shared bottlenecks.</t> fair coexistence with single-path TCP at shared bottlenecks.</t>
<t>It is foreseeable that different congestion controllers will be
<t>It is foreseeable that different congestion controllers will be imp implemented for MPTCP, each aiming to achieve different properties
lemented for MPTCP, each aiming to achieve different properties in the resource in the resource pooling / fairness / stability design space, as well a
pooling/fairness/stability design space, as well as those for achieving differen s those for achieving different properties in quality of service, reliability, a
t properties in quality of service, reliability, and resilience.</t> nd resilience.</t>
<t>Regardless of the algorithm used, <t>Regardless of the algorithm used,
the design of the MPTCP protocol aims to provide the congestion control implemen the design of MPTCP aims to provide the congestion control
tations sufficient information implementations with sufficient information
to take the right decisions; this information includes, for each subflow, which to make the right decisions; this information includes, for each subflow, which
packets were lost and when. </t> packets were lost and when. </t>
</section> </section>
<section anchor="sec_policy" numbered="true" toc="default">
<section title="Subflow Policy" anchor="sec_policy"> <name>Subflow Policy</name>
<t>Within a local MPTCP implementation, a host may use any local policy it wishes to decide how to share the traffic to be sent over the available paths.</t> <t>Within a local MPTCP implementation, a host may use any local policy it wishes to decide how to share the traffic to be sent over the available paths.</t>
<t>In the typical use case, where the goal is to maximize throughput, <t>In the typical use case, where the goal is to maximize throughput,
all available paths will be used simultaneously for data transfer, using coupled all available paths will be used simultaneously for data transfer, using coupled
congestion control as described in <xref target="RFC6356"/>. It is expected, ho congestion control as described in <xref target="RFC6356" format="default"/>. I
wever, that other use cases will appear.</t> t is expected, however, that other use cases will appear.</t>
<t>For instance, a possibility is an 'all-or-nothing' approach, i.e., <t>For instance, one possibility is an "all-or-nothing" approach, i.e.
have a second path ready for use in the event of , have a second path ready for use in the event of
failure of the first path, but alternatives could include entirely saturating one path before using an additional failure of the first path, but alternatives could include entirely saturating one path before using an additional
path (the 'overflow' case). Such choices would be most likely based on the monetary cost of links, but may also be path (the "overflow" case). Such choices would be most likely based on the monetary cost of links but may also be
based on properties such as the delay or jitter of links, where stability (of delay or bandwidth) is more important than throughput. Application based on properties such as the delay or jitter of links, where stability (of delay or bandwidth) is more important than throughput. Application
requirements such as these are discussed in detail in <xref target="RFC6897"/>.</t> requirements such as these are discussed in detail in <xref target="RFC6897" format="default"/>.</t>
<t>The ability to make effective choices at the sender requires full knowledge of the path "cost", which <t>The ability to make effective choices at the sender requires full knowledge of the path "cost", which
is unlikely to be the case. It would be desirable for a receiver to be able to signal their own preferences for paths, is unlikely to be the case. It would be desirable for a receiver to be able to signal their own preferences for paths,
since they will often be the multihomed party, and may have to pay for metered i since they will often be the multihomed party and may have to pay for metered in
ncoming bandwidth.</t> coming bandwidth.</t>
<t>To enable this, the MP_JOIN option (see <xref target="sec_join"/>) <t>To enable this behavior, the MP_JOIN option (see <xref
contains the 'B' bit, which allows a host to indicate to its peer that this path target="sec_join" format="default"/>) contains the "B"&nbsp;bit,
should be treated as a backup path to use only in the event of failure of other which allows a host to indicate to its peer that this path should be
working subflows (i.e., a subflow where the receiver has indicated B=1 SHOULD N treated as a backup path to use only in the event of failure of
OT be used to send data unless there are no usable subflows where B=0).</t> other working subflows (i.e., a subflow where the receiver has
<t>In the event that the available set of paths changes, a host may wi indicated that B=1 <bcp14>SHOULD NOT</bcp14> be used to send data unle
sh to signal a change in priority of subflows to the peer (e.g., a subflow that ss there are no usable subflows where B=0).</t>
was previously set as backup should now take priority over all remaining subflow <t>In the event that the available set of paths changes, a host may
s). Therefore, the MP_PRIO option, shown in <xref target="tcpm_prio"/>, can be u wish to signal a change in priority of subflows to the peer (e.g., a
sed to change the 'B' flag of the subflow on which it is sent.</t> subflow that was previously set as a backup should now take priority
<t>Another use of the MP_PRIO option is to set the 'B' flag on a subfl over all remaining subflows). Therefore, the MP_PRIO option, shown
ow to cleanly retire its use before closing it and removing it with REMOVE_ADDR in <xref target="tcpm_prio" format="default"/>, can be used to
<xref target="sec_remove_addr"/>, for example to support make-before-break sessi change the "B" flag of the subflow on which it is sent.</t>
on continuity, where new subflows are added before the previously used ones are <figure anchor="tcpm_prio">
closed.</t> <name>Change Subflow Priority (MP_PRIO) Option</name>
<?rfc needLines='8'?> <artwork align="left" name="" type="" alt=""><![CDATA[
<figure align="center" anchor="tcpm_prio" title="Change Subflow Priori 1 2 3
ty (MP_PRIO) Option"> 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
<artwork align="left"><![CDATA[ +---------------+---------------+-------+-----+-+
1 2 3 | Kind | Length |Subtype|(rsv)|B|
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +---------------+---------------+-------+-----+-+ ]]></artwork>
+---------------+---------------+-------+-----+-+
| Kind | Length |Subtype|(rsv)|B|
+---------------+---------------+-------+-----+-+
]]></artwork>
</figure> </figure>
<t>Another use of the MP_PRIO option is to set the "B" flag on a
<t>It should be noted that the backup flag is a request from a data receiver to subflow to cleanly "retire" its use before closing it and removing it
a data sender only, and the data sender SHOULD adhere to these requests. A host with REMOVE_ADDR (<xref target="sec_remove_addr" format="default"/>) -
cannot assume that the data sender will do so, however, since local policies -- - for example, to support make-before-break session continuity, where new subflo
or technical difficulties -- may override MP_PRIO requests. Note also that this ws are added before the previously used subflows are closed.</t>
signal applies to a single direction, and so the sender of this option could cho <t>It should be noted that the backup flag is a request from a data re
ose to continue using the subflow to send data even if it has signaled B=1 to th ceiver to a data sender only, and the data sender <bcp14>SHOULD</bcp14> adhere t
e other host.</t> o these requests. A host cannot assume that the data sender will do so, however,
since local policies -- or technical difficulties -- may override MP_PRIO reque
sts. Note also that this signal applies to a single direction, and so the sender
of this option could choose to continue using the subflow to send data even if
it has signaled B=1 to the other host.</t>
</section> </section>
</section> </section>
<section anchor="sec_pm" numbered="true" toc="default">
<section title="Address Knowledge Exchange (Path Management)" anchor="sec_ <name>Address Knowledge Exchange (Path Management)</name>
pm"> <t>We use the term "path management" to refer to the exchange of informa
<t>We use the term "path management" to refer to the exchange of informa tion about additional paths between hosts, which in this design is managed by mu
tion about additional paths between hosts, which in this design is managed by mu ltiple addresses at hosts. For more details regarding the architectural thinking
ltiple addresses at hosts. For more detail of the architectural thinking behind behind this design, see the MPTCP architecture document <xref target="RFC6182"
this design, see the MPTCP Architecture document <xref target="RFC6182"/>.</t> format="default"/>.</t>
<t>This design makes use of two methods of sharing such <t>This design makes use of two methods of sharing such
information, and both can be used on a connection. information, and both can be used on a connection.
The first is the direct The first is the direct
setup of new subflows, already described in setup of new subflows (described in
<xref target="sec_join"/>, where the initiator has an <xref target="sec_join" format="default"/>), where the initiator has an
additional address. The second method, described in the additional address. The second method (described in the
following subsections, signals addresses explicitly to the following subsections) signals addresses explicitly to the
other host to allow it to initiate new subflows. The other host to allow it to initiate new subflows. The
two mechanisms are complementary: the first is implicit and two mechanisms are complementary: the first is implicit and
simple, while the explicit is more complex but is more simple, while the second (explicit) is more complex but is more
robust. Together, the mechanisms allow addresses to change in robust. Together, these mechanisms allow addresses to change in
flight (and thus support operation through NATs, since the flight (and thus support operation through NATs, since the
source address need not be known), and also allow the source address need not be known); they also allow the
signaling of previously unknown addresses, and of addresses signaling of previously unknown addresses and of addresses
belonging to other address families (e.g., both IPv4 and IPv6).</t> belonging to other address families (e.g., both IPv4 and IPv6).</t>
<t>Here is an example of typical operation of the protocol: <t>Here is an example of typical operation of the protocol:
<list style="symbols"> </t>
<t>An MPTCP connection is initially set up between address/port A1 o <ul spacing="normal">
f Host A <li>An MPTCP connection is initially set up between address&wj;/port A
and address/port B1 of Host B.&nbsp; If Host A is multihomed and 1 of Host A
and address&wj;/port B1 of Host B. If Host A is multihomed and
multiaddressed, it can start an additional subflow from multiaddressed, it can start an additional subflow from
its address A2 to B1, by sending a SYN with a Join its address A2 to B1, by sending a SYN with an MP_JOIN
option from A2 to B1, using B's previously declared option from A2 to B1, using B's previously declared
token for this connection. Alternatively, if B is token for this connection. Alternatively, if B is
multihomed, it can try to set up a new subflow from B2 to multihomed, it can try to set up a new subflow from B2 to
A1, using A's previously declared token. In either A1, using A's previously declared token. In either
case, the SYN will be sent to the port already in use case, the SYN will be sent to the port already in use
for the original subflow on the receiving host.</t> for the original subflow on the receiving host.</li>
<li>Simultaneously (or after a timeout), an ADD_ADDR option
<t>Simultaneously (or after a timeout), an ADD_ADDR option (<xref target="sec_add_address" format="default"/>) is sent on an existing subfl
(<xref target="sec_add_address"/>) is sent on an existing subflow, informing ow, informing
the receiver of the sender's alternative address(es). The recipient can use the receiver of the sender's alternative address(es). The recipient can use
this information to open a new subflow to the sender's additional address. this information to open a new subflow to the sender's additional address(es).
In our example, A will send ADD_ADDR option informing B of address/port A2. In our example, A will send the ADD_ADDR option informing B of address&wj;/port
The mix of using the SYN-based option and the ADD_ADDR option, including A2.
timeouts, is implementation specific and can be tailored to agree with local pol The mix of using the SYN&#8209;based option and the ADD_ADDR option, including
icy.</t> timeouts, is implementation specific and can be tailored to agree with local pol
icy.</li>
<t>If subflow A2-B1 is successfully set up, Host B can use the Addre <li>If subflow A2-B1 is successfully set up, Host B can use the Addres
ss ID in s ID in
the Join option to correlate this with the ADD_ADDR option that will also arrive the MP_JOIN option to correlate this source address with the ADD_ADDR option tha
on t will also arrive on
an existing subflow; now B knows not to open A2-B1, ignoring the ADD_ADDR. an existing subflow; now B knows not to open A2-B1, ignoring the ADD_ADDR.
Otherwise, if B has not received the A2-B1 MP_JOIN SYN but received the ADD_ADDR, Otherwise, if B has not received the A2-B1 MP_JOIN SYN but received the ADD_ADDR,
it can try to initiate a new subflow from one or more of its addresses to address it can try to initiate a new subflow from one or more of its addresses to address
A2. This permits new sessions to be opened if one host is behind a NAT.</t> A2. This permits new sessions to be opened if one host is behind a NAT.</li>
</list> </ul>
<t>
Other ways of using the two signaling mechanisms are possible; for instance, Other ways of using the two signaling mechanisms are possible; for instance,
signaling addresses in other address families can only be done explicitly using signaling addresses in other address families can only be done explicitly
the Add Address option. using the Add Address (ADD_ADDR) option.
</t> </t>
<section anchor="sec_add_address" numbered="true" toc="default">
<section title="Address Advertisement" anchor="sec_add_address"> <name>Address Advertisement</name>
<t>The Add Address (ADD_ADDR) MPTCP option announces additional addresse <t>The ADD_ADDR MPTCP option announces additional addresses (and, opti
s (and optionally, ports) on which a onally, ports) on which a
host can be reached (<xref target="tcpm_address"/>). host can be reached (<xref target="tcpm_address" format="default"/>).
This option can be used at any time during a connection, depending on when the This option can be used at any time during a connection, depending on when the
sender wishes to enable multiple paths and/or when paths become available. As wi sender wishes to enable multiple paths and&wj;/or when paths become available. A
th all MPTCP s with all MPTCP
signals, the receiver MUST undertake standard TCP validity checks, e.g. <xref ta signals, the receiver <bcp14>MUST</bcp14> undertake standard TCP validity
rget="RFC5961"/>, before acting upon it.</t> checks, e.g., per <xref target="RFC5961" format="default"/>, before
acting upon&nbsp;it.</t>
<t>Every address has an Address ID that can be used for uniquely identif <figure anchor="tcpm_address">
ying the address within a connection for address removal. The Address ID is also <name>Add Address (ADD_ADDR) Option</name>
used to identify MP_JOIN options (see <xref target="sec_join"/>) relating to <artwork align="left" name="" type="" alt=""><![CDATA[
the same address, even when address translators are in use. The Address ID MUST 1 2 3
uniquely 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
identify the address for the sender of the option (within the scope of the conne +---------------+---------------+-------+-------+---------------+
ction), but the mechanism for | Kind | Length |Subtype|(rsv)|E| Address ID |
allocating such IDs is implementation specific.</t> +---------------+---------------+-------+-------+---------------+
| Address (IPv4: 4 octets / IPv6: 16 octets) |
+-------------------------------+-------------------------------+
| Port (2 octets, optional) | |
+-------------------------------+ |
| Truncated HMAC (8 octets, if E=0) |
| +-------------------------------+
| |
+-------------------------------+ ]]></artwork>
</figure>
<t>All address IDs learned via either MP_JOIN or ADD_ADDR <t>Every address has an Address ID that can be used for uniquely ident
SHOULD be stored by the receiver in a data structure that gathers all th ifying the address within a connection for address removal. The Address ID is al
e Address ID so
to address mappings for a connection (identified by a token pair). In th used to identify MP_JOIN options (see <xref target="sec_join" format="default"/>
is way, there is ) relating to
a stored mapping between Address ID, observed source address, and token the same address, even when address translators are in use. The Address ID <bcp1
pair for 4>MUST</bcp14> uniquely
identify the address for the sender of the option (within the scope of the conne
ction); the mechanism for
allocating such IDs is implementation specific.</t>
<t>All Address IDs learned via either MP_JOIN or ADD_ADDR
<bcp14>SHOULD</bcp14> be stored by the receiver in a data structure
that gathers all the Address-ID-to-address mappings for a connection
(identified by a token pair). In this way, there is
a stored mapping between the Address ID, observed source address, and to
ken pair for
future processing of control information for a connection. Note that an implementation future processing of control information for a connection. Note that an implementation
MAY discard incoming address advertisements at will, for example, for av <bcp14>MAY</bcp14> discard incoming address advertisements at will -- fo
oiding updating r example, to avoid updating
mapping state, or because advertised addresses are of no use to it (for mapping state or because advertised addresses are of no use to it (for
example, IPv6 addresses when it has IPv4 only). Therefore, a host MUST t example, IPv6 addresses when it has IPv4 only). Therefore, a host <bcp14
reat address >MUST</bcp14> treat address
advertisements as soft state, and it MAY choose to refresh advertisement advertisements as soft state, and it <bcp14>MAY</bcp14> choose to refres
s periodically. h advertisements periodically.
Note also that an implementation MAY choose to cache these address adver Note also that an implementation <bcp14>MAY</bcp14> choose to cache thes
tisements even e address advertisements even
if they are not currently relevant but may be relevant in the future, such as IPv4 if they are not currently relevant but may be relevant in the future, such as IPv4
addresses when IPv6 connectivity is available but IPv4 is awaiting DHCP. </t> addresses when IPv6 connectivity is available but IPv4 is awaiting DHCP. </t>
<t>This option is shown in <xref target="tcpm_address" format="default
<t>This option is shown in <xref target="tcpm_address"/>. The illustrati "/>. The illustration is sized for
on is sized for IPv4 addresses. For IPv6, the length of the address will be 16&nbsp;octe
IPv4 addresses. For IPv6, the length of the address will be 16 octets (i ts (instead of 4).</t>
nstead of 4).</t> <t>The 2 octets that specify the TCP port number to use are optional,
and their presence
<t>The 2 octets that specify the TCP port number to use are optional and
their presence
can be inferred from the length of the option. Although it is expected that the majority of can be inferred from the length of the option. Although it is expected that the majority of
use cases will use the same port pairs as used for the initial subflow (e.g., port use cases will use the same port pairs as those used for the initial subflow (e.g., port
80 remains port 80 on all subflows, as does the ephemeral port at the client), there 80 remains port 80 on all subflows, as does the ephemeral port at the client), there
may be cases (such as port-based load balancing) where the explicit specification of may be cases (such as port-based load balancing) where the explicit specification of
a different port is required. If no port is specified, MPTCP SHOULD atte a different port is required. If no port is specified, MPTCP <bcp14>SHOU
mpt to LD</bcp14> attempt to
connect to the specified address on the same port as is already in use b connect to the specified address on the same port as the port that is al
y the subflow ready in use by the subflow
on which the ADD_ADDR signal was sent; this is discussed in more detail on which the ADD_ADDR signal was sent; this is discussed in more detail
in <xref target="heuristics"/>.</t> in <xref target="heuristics" format="default"/>.</t>
<t>The Truncated HMAC parameter present in this option is the rightmos
<t>The Truncated HMAC present in this Option is the rightmost 64 bits of t 64 bits of an HMAC, negotiated and
an HMAC, negotiated and calculated in the same way as for MP_JOIN as described in <xref target="
calculated in the same way as for MP_JOIN as described in <xref target=" sec_join" format="default"/>. For this
sec_join"/>. For this
specification of MPTCP, as there is only one hash algorithm option specified, this will be HMAC as specification of MPTCP, as there is only one hash algorithm option specified, this will be HMAC as
defined in <xref target="RFC2104"/>, using the SHA-256 hash algorithm <xref target="RFC6234"/>. defined in <xref target="RFC2104" format="default"/>, using the SHA-256 hash algorithm <xref target="RFC6234" format="default"/>.
In the same way as for MP_JOIN, the key for the HMAC In the same way as for MP_JOIN, the key for the HMAC
algorithm, in the case of the message transmitted by Host A, will be Key-A followed by Key-B, and in algorithm, in the case of the message transmitted by Host A, will be Key-A followed by Key-B, and in
the case of Host B, Key-B followed by Key-A. These are the keys that were exchanged in the original the case of Host B, Key-B followed by Key-A. These are the keys that were exchanged in the original
MP_CAPABLE handshake. The message for the HMAC is the Address ID, IP Address, and Port which precede MP_CAPABLE handshake. The message for the HMAC is the Address ID, IP address, and port that precede
the HMAC in the ADD_ADDR option. If the port is not present in the ADD_ADDR option, the HMAC message the HMAC in the ADD_ADDR option. If the port is not present in the ADD_ADDR option, the HMAC message
will nevertheless include two octets of value zero. The rationale for the HMAC is to will nevertheless include 2 octets of value zero. The rationale for the HMAC is to
prevent unauthorized entities from injecting ADD_ADDR signals in an attempt to hijack a connection. prevent unauthorized entities from injecting ADD_ADDR signals in an attempt to hijack a connection.
Note that additionally the presence of this HMAC prevents the address be Note that, additionally, the presence of this HMAC prevents the
ing changed in flight unless address from being changed in flight unless
the key is known by an intermediary. If a host receives an ADD_ADDR option for which it cannot the key is known by an intermediary. If a host receives an ADD_ADDR option for which it cannot
validate the HMAC, it SHOULD silently ignore the option.</t> validate the HMAC, it <bcp14>SHOULD</bcp14> silently ignore the option.<
/t>
<t>A set of four flags are present after the subtype and before the Addr <t>A set of four flags is present after the subtype and before the Add
ess ID. Only the rightmost ress ID. Only the rightmost
bit - labelled 'E' - is assigned in this specification. The other bits a bit -- labeled "E" -- is assigned in this specification. The other
re currently unassigned and MUST bits are currently unassigned; they <bcp14>MUST</bcp14>
be set to zero by a sender and MUST be ignored by the receiver.</t> be set to 0 by a sender and <bcp14>MUST</bcp14> be ignored by the receiv
er.</t>
<t>The 'E' flag exists to provide reliability for this option. Because t <t>The "E" flag exists to provide reliability for this option. Because
his option will often be sent this option will often be sent
on pure ACKs, there is no guarantee of reliability. Therefore, a receiver receiving a fresh ADD_ADDR on pure ACKs, there is no guarantee of reliability. Therefore, a receiver receiving a fresh ADD_ADDR
option (where E=0), will send the same option back to the sender, but no option (where E=0) will send the same option back to the sender, but not
t including the HMAC, and including the HMAC and
with E=1, to indicate receipt. The lack of this echo can be used by the with E=1, to indicate receipt. According to local policy, the lack of
initial ADD_ADDR sender to this type of "echo" can indicate to the initial ADD_ADDR sender that the
retransmit the ADD_ADDR according to local policy.</t> ADD_ADDR needs to be retransmitted.</t>
<?rfc needLines='11'?>
<figure align="center" anchor="tcpm_address" title="Add Address (ADD_ADD
R) Option">
<artwork align="left"><![CDATA[
1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+---------------+---------------+-------+-------+---------------+
| Kind | Length |Subtype|(rsv)|E| Address ID |
+---------------+---------------+-------+-------+---------------+
| Address (IPv4 - 4 octets / IPv6 - 16 octets) |
+-------------------------------+-------------------------------+
| Port (2 octets, optional) | |
+-------------------------------+ |
| Truncated HMAC (8 octets, if E=0) |
| +-------------------------------+
| |
+-------------------------------+
]]></artwork>
</figure>
<t>Due to the proliferation of NATs, it is reasonably likely that one ho
st may attempt to advertise private addresses <xref target="RFC1918"/>. It is no
t desirable to prohibit this, since there may be cases where both hosts have add
itional interfaces on the same private network, and a host MAY advertise such ad
dresses. The MP_JOIN handshake to create a new subflow (<xref target="sec_join"/
>) provides mechanisms to minimize security risks. The MP_JOIN message contains
a 32-bit token that uniquely identifies the connection to the receiving host. If
the token is unknown, the host will return with a RST. In the unlikely event th
at the token is valid at the receiving host, subflow setup will continue, but th
e HMAC exchange must occur for authentication. This will fail, and will provide
sufficient protection against two unconnected hosts accidentally setting up a ne
w subflow upon the signal of a private address. Further security considerations
around the issue of ADD_ADDR messages that accidentally misdirect, or maliciousl
y direct, new MP_JOIN attempts are discussed in <xref target="sec_security"/>.</
t>
<t>A host that receives an ADD_ADDR but finds a connection set up to tha
t IP address and port number is unsuccessful SHOULD NOT perform further connecti
on attempts to this address/port combination for this connection. A sender that
wants to trigger a new incoming connection attempt on a previously advertised ad
dress/port combination can therefore refresh ADD_ADDR information by sending the
option again.</t>
<t>A host can therefore send an ADD_ADDR message with an already assigne
d Address ID, but the Address MUST be the same as previously assigned to this Ad
dress ID. A new ADD_ADDR may have the same, or different, port number. If the po
rt number is different, the receiving host SHOULD try to set up a new subflow to
this new address/port combination.</t>
<t>A host wishing to replace an existing Address ID MUST first remove th
e existing one (<xref target="sec_remove_addr"/>).</t>
<t>During normal MPTCP operation, it is unlikely that there will be suff <t>Due to the proliferation of NATs, it is reasonably likely that
icient TCP option space for ADD_ADDR to be included along with those for data se one host may attempt to advertise private addresses <xref
quence numbering (<xref target="sec_dsn"/>). Therefore, it is expected that an M target="RFC1918" format="default"/>. It is not desirable to prohibit
PTCP implementation will send the ADD_ADDR option on separate ACKs. As discussed this behavior, since there may be cases where both hosts have additional
earlier, however, an MPTCP implementation MUST NOT treat duplicate ACKs with an interfaces on the same private network, and a host
y MPTCP option, with the exception of the DSS option, as indications of congesti <bcp14>MAY</bcp14> advertise such addresses. The MP_JOIN handshake
on <xref target="RFC5681"/>, and an MPTCP implementation SHOULD NOT send more th to create a new subflow (<xref target="sec_join" format="default"/>)
an two duplicate ACKs in a row for signaling purposes.</t> provides mechanisms to minimize security risks. The MP_JOIN message
contains a 32-bit token that uniquely identifies the connection to
the receiving host. If the token is unknown, the host will respond
with a RST. In the unlikely event that the token is valid at the
receiving host, subflow setup will continue, but the HMAC exchange
must occur for authentication. The HMAC exchange
will fail and will provide
sufficient protection against two unconnected hosts accidentally
setting up a new subflow upon the signal of a private address.
Further security considerations around the issue of ADD_ADDR messages that acci
dentally misdirect, or maliciously direct, new MP_JOIN attempts are discussed in
<xref target="sec_security" format="default"/>.</t>
<t>A host that receives an ADD_ADDR but finds that a connection set up
to that IP address and port number is unsuccessful <bcp14>SHOULD NOT</bcp14> pe
rform further connection attempts to this address&wj;/port combination for this
connection. A sender that wants to trigger a new incoming connection attempt on
a previously advertised address&wj;/port combination can therefore refresh ADD_A
DDR information by sending the option again.</t>
<t>A host can therefore send an ADD_ADDR message with an
already-assigned Address ID, but the address <bcp14>MUST</bcp14> be
the same as the address previously assigned to this Address ID. A
new ADD_ADDR may have the same port number or a different port number.
If the port number is different, the receiving host <bcp14>SHOULD</bcp14> try t
o set up a new subflow to this new address&wj;/port combination.</t>
<t>A host wishing to replace an existing Address ID <bcp14>MUST</bcp14
> first remove the existing one (<xref target="sec_remove_addr" format="default"
/>).</t>
<t>During normal MPTCP operation, it is unlikely that there will be su
fficient TCP option space for ADD_ADDR to be included along with those for data
sequence numbering (<xref target="sec_dsn" format="default"/>). Therefore, it is
expected that an MPTCP implementation will send the ADD_ADDR option on separate
ACKs. As discussed earlier, however, an MPTCP implementation <bcp14>MUST NOT</b
cp14> treat duplicate ACKs with any MPTCP option, with the exception of the DSS
option, as indications of congestion <xref target="RFC5681" format="default"/>,
and an MPTCP implementation <bcp14>SHOULD NOT</bcp14> send more than two duplica
te ACKs in a row for signaling purposes.</t>
</section>
<section anchor="sec_remove_addr" numbered="true" toc="default">
<name>Remove Address</name>
<t>If, during the lifetime of an MPTCP connection, a previously
announced address becomes invalid (e.g., if the interface
disappears or an IPv6 address is no longer preferred), the affected
host <bcp14>SHOULD</bcp14> announce this situation so that the peer ca
n remove
subflows related to this address. Even if an address is not in use
by an MPTCP connection, if it has been previously announced, an
implementation <bcp14>SHOULD</bcp14> announce its removal. A host
<bcp14>MAY</bcp14> also choose to announce that a valid IP address
should not be used any longer -- for example, for make&#8209;before-br
eak session continuity.</t>
<t>This is achieved through the Remove Address (REMOVE_ADDR) option
(<xref target="tcpm_remove" format="default"/>), which will remove a
previously added address (or list of addresses) from a connection
and terminate any subflows currently using that address.</t>
</section> <figure anchor="tcpm_remove">
<section title="Remove Address" anchor="sec_remove_addr"> <name>Remove Address (REMOVE_ADDR) Option</name>
<t>If, during the lifetime of an MPTCP connection, a previously announce <artwork align="left" name="" type="" alt=""><![CDATA[
d address becomes invalid (e.g., if the interface disappears, or an IPv6 address 1 2 3
is no longer preferred), the affected host SHOULD announce this so that the pee 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
r can remove subflows related to this address. Even if an address is not in use +---------------+---------------+-------+-------+---------------+
by a MPTCP connection, if it has been previously announced, an implementation SH | Kind |Length = 3 + n |Subtype|(resvd)| Address ID | ...
OULD announce its removal. A host MAY also choose to announce that a valid IP ad +---------------+---------------+-------+-------+---------------+
dress should not be used any longer, for example for make-before-break session c (followed by n-1 Address IDs, if required) ]]></artwo
ontinuity.</t> rk>
<t>This is achieved through the Remove Address (REMOVE_ADDR) option (<xr </figure>
ef target="tcpm_remove"/>), which will remove a previously added address (or lis
t of addresses) from a connection and terminate any subflows currently using tha
t address.</t>
<t>For security purposes, if a host receives a REMOVE_ADDR option, it mu
st ensure the affected path(s) are no longer in use before it instigates closure
. The receipt of REMOVE_ADDR SHOULD first trigger the sending of a TCP keepalive
<xref target="RFC1122"/> on the path, and if a response is received the path SH
OULD NOT be removed. If the path is found to still be alive, the receiving host
SHOULD no longer use the specified address for future connections, but it is the
responsibility of the host which sent the REMOVE_ADDR to shut down the subflow.
The requesting host MAY also use MP_PRIO (<xref target="sec_policy"/>) to reque
st a path is no longer used, before removal. Typical TCP validity tests on the s
ubflow (e.g., ensuring sequence and ACK numbers are correct) MUST also be undert
aken. An implementation can use indications of these test failures as part of in
trusion detection or error logging.</t>
<t>The sending and receipt (if no keepalive response was received) of th
is message SHOULD trigger the sending of RSTs by both hosts on the affected subf
low(s) (if possible), as a courtesy to cleaning up middlebox state, before clean
ing up any local state.</t>
<t>Address removal is undertaken by ID, so as to permit the use of NATs
and other middleboxes that rewrite source addresses. If there is no address at t
he requested ID, the receiver will silently ignore the request.</t>
<t>A subflow that is still functioning MUST be closed with a FIN exchang
e as in regular TCP, rather than using this option. For more information, see <x
ref target="sec_close"/>.</t>
<?rfc needLines='8'?>
<figure align="center" anchor="tcpm_remove" title="Remove Address (REMOV
E_ADDR) Option">
<artwork align="left"><![CDATA[
1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+---------------+---------------+-------+-------+---------------+
| Kind | Length = 3+n |Subtype|(resvd)| Address ID | ...
+---------------+---------------+-------+-------+---------------+
(followed by n-1 Address IDs, if required)
]]></artwork>
</figure>
</section>
<t>For security purposes, if a host receives a REMOVE_ADDR option,
it must ensure that the affected path or paths are no longer in use
before it instigates closure. The receipt of REMOVE_ADDR
<bcp14>SHOULD</bcp14> first trigger the sending of a TCP keepalive
<xref target="RFC1122" format="default"/> on the path, and if a
response is received, the path <bcp14>SHOULD NOT</bcp14> be
removed. If the path is found to still be alive, the receiving host
<bcp14>SHOULD</bcp14> no longer use the specified address for future
connections, but it is the responsibility of the host that sent the
REMOVE_ADDR to shut down the subflow. Before the address is removed,
the requesting host
<bcp14>MAY</bcp14> also use MP_PRIO (<xref target="sec_policy"
format="default"/>) to request that a path no longer be used. Typical
TCP validity tests on the subflow (e.g., ensuring
that sequence and ACK numbers are correct) <bcp14>MUST</bcp14> also be
undertaken. An implementation can use indications of these test failures as par
t of intrusion detection or error logging.</t>
<t>The sending and receipt (if no keepalive response was received)
of this message <bcp14>SHOULD</bcp14> trigger the sending of RSTs by
both hosts on the affected subflow(s) (if possible), as a courtesy,
to allow the cleanup of middlebox state before cleaning up any local s
tate.</t>
<t>Address removal is undertaken according to the Address ID, so as to
permit the use of NATs and other middleboxes that rewrite source
addresses. If an Address ID is not known, the receiver will
silently ignore the request.</t>
<t>A subflow that is still functioning <bcp14>MUST</bcp14> be closed w
ith a FIN exchange as in regular TCP, rather than using this option. For more in
formation, see <xref target="sec_close" format="default"/>.</t>
</section>
</section> </section>
<section anchor="sec_fastclose" numbered="true" toc="default">
<section title="Fast Close" anchor="sec_fastclose"> <name>Fast Close</name>
<t>Regular TCP has the means of sending a reset (RST) signal to abruptly <t>Regular TCP has the means of sending a RST signal to abruptly
close a connection. With MPTCP, a regular RST only has the scope of the close a connection. With MPTCP, a regular RST only has the scope of
subflow the subflow; it
and will only close the concerned subflow but not affect the remaining will only close the applicable subflow and will not affect the remaining
subflows. MPTCP's connection will stay alive at the data level, in order subflows. MPTCP's connection will stay alive at the data level, in order
to permit break-before-make handover between subflows. It is therefore to permit break-before-make handover between subflows. It is therefore
necessary to provide an MPTCP-level "reset" to allow the abrupt closure necessary to provide an MPTCP-level "reset" to allow the abrupt closure
of the whole MPTCP connection, and this is the MP_FASTCLOSE option.</t> of the whole MPTCP connection; this is done via the MP_FASTCLOSE option.
</t>
<t>MP_FASTCLOSE is used to indicate to the peer that the connection will be <t>MP_FASTCLOSE is used to indicate to the peer that the connection will be
abruptly closed and no data will be accepted anymore. The reasons for abruptly closed and no data will be accepted anymore. The reasons for
triggering an MP_FASTCLOSE are implementation specific. Regular TCP does triggering an MP_FASTCLOSE are implementation specific. Regular TCP does
not allow sending a RST while the connection is in a synchronized not allow the sending of a RST while the connection is in a synchronized
state <xref target="RFC0793"/>. Nevertheless, implementations allow state <xref target="RFC0793" format="default"/>. Nevertheless, implement
the sending of a RST in this state, if, for example, the operating ations allow
the sending of a RST in this state if, for example, the operating
system is running out of resources. In these cases, MPTCP should send system is running out of resources. In these cases, MPTCP should send
the MP_FASTCLOSE. This option is illustrated in <xref target="tcpm_fastc the MP_FASTCLOSE. This option is illustrated in <xref target="tcpm_fastc
lose"/>.</t> lose" format="default"/>.</t>
<figure anchor="tcpm_fastclose">
<?rfc needLines='12'?> <name>Fast Close (MP_FASTCLOSE) Option</name>
<figure align="center" anchor="tcpm_fastclose" title="Fast Close (MP_FAS <artwork align="left" name="" type="" alt=""><![CDATA[
TCLOSE) Option"> 1 2 3
<artwork align="left"><![CDATA[ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1 2 3 +---------------+---------------+-------+-----------------------+
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | Kind | Length |Subtype| (reserved) |
+---------------+---------------+-------+-----------------------+ +---------------+---------------+-------+-----------------------+
| Kind | Length |Subtype| (reserved) | | Option Receiver's Key |
+---------------+---------------+-------+-----------------------+ | (64 bits) |
| Option Receiver's Key | | |
| (64 bits) | +---------------------------------------------------------------+ ]]></artwork
| | >
+---------------------------------------------------------------+
]]></artwork>
</figure> </figure>
<t>If Host A wants to force the closure of an MPTCP connection, it can
<t>If Host A wants to force the closure of an MPTCP connection, it has t do so via two
wo options:
different options: </t>
<list style="symbols"> <ul spacing="normal">
<t>Option A (ACK) : Host A sends an ACK containing the MP_FASTCLOSE <li>Option A (ACK): Host A sends an ACK containing the MP_FASTCLOSE
option on one subflow, containing the key of Host B as declared in option on one subflow, containing the key of Host B as declared in
the initial connection handshake. On all the other subflows, Host A the initial connection handshake. On all the other subflows, Host&n
sends a regular TCP RST to close these subflows, and tears them down. bsp;A
Host A now enters FASTCLOSE_WAIT state.</t> sends a regular TCP RST to close these subflows and tears them down.
Host A now enters FASTCLOSE_WAIT state.</li>
<t>Option R (RST) : Host A sends a RST containing the MP_FASTCLOSE <li>Option R (RST): Host A sends a RST containing the MP_FASTCLOSE
option on all subflows, containing the key of Host B as declared in option on all subflows, containing the key of Host B as declared in
the initial connection handshake. Host A can tear the subflows and the initial connection handshake. Host A can tear down the subflows
the connection down immediately.</t> and
</list> the connection immediately.</li>
</t> </ul>
<t>If Host A decides to force the closure by using Option A and sending
<t>If host A decides to force the closure by using Option A and sending an ACK with the MP_FASTCLOSE option, the connection shall proceed as fol
an ACK with the MP_FASTCLOSE option, the connection shall proceed as foll lows:
ows: </t>
<list style="symbols"> <ul spacing="normal">
<t>Upon receipt of an ACK with MP_FASTCLOSE by Host B, containing th <li>Upon receipt of an ACK with MP_FASTCLOSE by Host B, containing the
e valid key, Host B answers valid key, Host B answers
on the same subflow with a TCP RST and tears down all subflows also on the same subflow with a TCP RST and tears down all subflows
through sending TCP RST signals. Host B can also through sending TCP RST signals. Host B can
now close the whole MPTCP connection (it transitions directly to CLO now close the whole MPTCP connection (it transitions directly to CLO
SED state).</t> SED state).</li>
<li>As soon as Host A has received the TCP RST on the remaining subflo
<t>As soon as Host A has received the TCP RST on the remaining subfl w, it
ow, it
can close this subflow and tear down the whole connection (transition from can close this subflow and tear down the whole connection (transition from
FASTCLOSE_WAIT to CLOSED states). If Host A receives an MP_FASTCLOSE instead FASTCLOSE_WAIT state to CLOSED state). If Host A receives an MP_FASTCLOSE instead
of a TCP RST, both hosts attempted fast closure simultaneously. Host A should of a TCP RST, both hosts attempted fast closure simultaneously. Host A should
reply with a TCP RST and tear down the connection.</t> reply with a TCP RST and tear down the connection.</li>
<li>If Host A does not receive a TCP RST in reply to its MP_FASTCLOSE
<t>If Host A does not receive a TCP RST in reply to its MP_FASTCLOSE after one
after one retransmission timeout (RTO) (the RTO of the subflow where the MP_FA
retransmission timeout (RTO) (the RTO of the subflow where the MP_FA STCLOSE has been sent), it <bcp14>SHOULD</bcp14>
STCLOSE has been sent), it SHOULD retransmit the MP_FASTCLOSE. To keep this connection from being
retransmit the MP_FASTCLOSE. The number of retransmissions SHOULD be retained for a long time, the number of retransmissions <bcp14>SHOUL
limited to avoid this connection from being retained for a long time D</bcp14> be
, but limited;
this limit is implementation specific. A RECOMMENDED number is 3. If this limit is implementation specific. A <bcp14>RECOMMENDED</bcp14>
no TCP RST number is 3. If no TCP RST
is received in response, Host A SHOULD send a TCP RST with the MP_FA is received in response, Host A <bcp14>SHOULD</bcp14> send a TCP RST
STCLOSE option with the MP_FASTCLOSE option
itself when it releases state in order to clear any remaining state a itself when it releases state in order to clear any remaining state
t middleboxes.</t> at middleboxes.</li>
</list> </ul>
</t> <t>If, however, Host A decides to force the closure by using Option R an
d
<t>If however host A decides to force the closure by using Option R and sending a RST with the MP_FASTCLOSE option, Host B will act as follows:
sending a RST with the MP_FASTCLOSE option, Host B will act as follows: upon receipt of a RST with MP_FASTCLOSE, containing the valid key,
Upon receipt of a RST with MP_FASTCLOSE, containing the valid key, Host B tears down all subflows by sending a TCP RST. Host&nbsp;B can now
Host B tears down all subflows by sending a TCP RST. Host B can now close close the whole MPTCP
the whole MPTCP connection (it transitions directly to CLOSED state).</t>
connection (it transitions directly to CLOSED state).</t>
</section> </section>
<section anchor="sec_reset" numbered="true" toc="default">
<section title="Subflow Reset" anchor="sec_reset"> <name>Subflow Reset</name>
<t>An implementation of MPTCP may also need to send a regular TCP RST to <t>An implementation of MPTCP may also need to send a regular TCP RST to
force force
the closure of a subflow. A host sends a TCP RST in order to close a subf the closure of a subflow. A host sends a TCP RST in order to close a sub
low flow
or reject an attempt to open a subflow (MP_JOIN). In order to inform the or reject an attempt to open a subflow (MP_JOIN). In order to let the
receiving host why a subflow is being closed or rejected, the TCP RST pac receiving host know why a subflow is being closed or rejected, the TCP R
ket ST packet
MAY include the MP_TCPRST Option. The host MAY use this information to <bcp14>MAY</bcp14> include the MP_TCPRST option (<xref target="tcpm_rese
decide, for example, whether it tries to re-establish the subflow t"/>). The host <bcp14>MAY</bcp14> use this information to
immediately, later, or never.</t> decide, for example, whether it tries to re-establish the subflow
immediately, later, or never.</t>
<?rfc needLines='8'?> <figure anchor="tcpm_reset">
<figure align="center" anchor="tcpm_reset" title="TCP RST Reason (MP_TCP <name>TCP RST Reason (MP_TCPRST) Option</name>
RST) Option"> <artwork align="left" name="" type="" alt=""><![CDATA[
<artwork align="left"><![CDATA[ 1 2 3
1 2 3 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +---------------+---------------+-------+-----------------------+
+---------------+---------------+-------+-----------------------+ | Kind | Length |Subtype|U|V|W|T| Reason |
| Kind | Length |Subtype|U|V|W|T| Reason | +---------------+---------------+-------+-----------------------+ ]]></artwork
+---------------+---------------+-------+-----------------------+ >
]]></artwork>
</figure> </figure>
<t>The MP_TCPRST option contains a reason code that allows the <t>The MP_TCPRST option contains a reason code that allows the
sender of the option to provide more information about the reason for sender of the option to provide more information about the reason for
the termination of the subflow. Using 12 bits of option space, the the termination of the subflow. Using 12 bits of option space, the
first four bits are reserved for flags (only one of which is currently first 4&nbsp;bits are reserved for flags (only one of which is currently
defined), and the remaining octet is used to express a reason code for defined), and the remaining octet is used to express a reason code for
this subflow termination, from which a receiver MAY infer information this subflow termination, from which a receiver <bcp14>MAY</bcp14> infer information
about the usability of this path.</t> about the usability of this path.</t>
<t>The "T" flag is used by the sender to indicate whether the error <t>The "T" flag is used by the sender to indicate whether the error
condition that is reported is Transient (T bit set to 1) or Permanent condition that is reported is Transient ("T" bit set to 1) or Permanent
(T bit set to 0). If the error condition is considered to be ("T" bit set to 0). If the error condition is considered to be
Transient by the sender of the RST segment, the recipient of this Transient by the sender of the RST segment, the recipient of this
segment MAY try to reestablish a subflow for this connection over the segment <bcp14>MAY</bcp14> try to re-establish a subflow for this connec
failed path. The time at which a receiver may try to re-establish this tion over the
is implementation-specific, but SHOULD take into account the properties failed path. The time at which a receiver may try to
of the failure defined by the following reason code. If the error condi re&#8209;establish this subflow
tion is implementation specific but <bcp14>SHOULD</bcp14> take into account t
is considered to be permanent, the receiver of the RST segment SHOULD NO he properties
T try of the failure as defined by the provided reason code. If the error con
to reestablish a subflow for this connection over this path. The "U", " dition
V" is considered to be Permanent, the receiver of the RST segment <bcp14>SH
OULD NOT</bcp14> try
to re&#8209;establish a subflow for this connection over this path. The
"U", "V",
and "W" flags are not defined by this specification and are reserved for and "W" flags are not defined by this specification and are reserved for
future use. An implementation of this specification MUST set these flags future use. An implementation of this specification <bcp14>MUST</bcp14>
to 0, and a receiver MUST ignore them.</t> set these flags
to 0, and a receiver <bcp14>MUST</bcp14> ignore them.</t>
<t>The "Reason" code is an 8-bit field that indicates the reason for <t>"Reason" is an 8-bit field that indicates the reason code for
the termination of the subflow. The following codes are defined in the termination of the subflow. The following codes are defined in
this document: this document:
<list style="symbols"> </t>
<t>Unspecified error (code 0x0). This is the default error implying <ul spacing="normal">
the <li>Unspecified error (code 0x00). This is the default error;
it implies that the
subflow is no longer available. The presence of this option shows subflow is no longer available. The presence of this option shows
that the RST was generated by a MPTCP-aware device.</t> that the RST was generated by an MPTCP-aware device.</li>
<li>MPTCP-specific error (code 0x01). An error has been detected in t
<t>MPTCP specific error (code 0x01). An error has been detected in he
the
processing of MPTCP options. This is the usual reason code to return processing of MPTCP options. This is the usual reason code to return
in the cases where a RST is being sent to close a subflow for reason in the cases where a RST is being sent to close a subflow because
s of an invalid response.</li>
of an invalid response.</t> <li>Lack of resources (code 0x02). This code indicates that the
<t>Lack of resources (code 0x02). This code indicates that the
sending host does not have enough resources to support the sending host does not have enough resources to support the
terminated subflow.</t> terminated subflow.</li>
<li>Administratively prohibited (code 0x03). This code indicates that
<t>Administratively prohibited (code 0x03). This code indicates tha
t
the requested subflow is prohibited by the policies of the sending the requested subflow is prohibited by the policies of the sending
host.</t> host.</li>
<li>Too much outstanding data (code 0x04). This code indicates that
<t>Too much outstanding data (code 0x04). This code indicates that there is an excessive amount of data that needs to be transmitted
there is an excessive amount of data that need to be transmitted
over the terminated subflow while having already been acknowledged over the terminated subflow while having already been acknowledged
over one or more other subflows. This may occur if a path has been over one or more other subflows. This may occur if a path has been
unavailable for a short period and it is more efficient to reset and unavailable for a short period and it is more efficient to reset and
start again than it is to retransmit the queued data.</t> start again than it is to retransmit the queued data.</li>
<li>Unacceptable performance (code 0x05). This code indicates that
<t>Unacceptable performance (code 0x05). This code indicates that
the performance of this subflow was too low compared to the other the performance of this subflow was too low compared to the other
subflows of this Multipath TCP connection.</t> subflows of this Multipath TCP connection.</li>
<li>Middlebox interference (code 0x06). Middlebox interference has
<t>Middlebox interference (code 0x06). Middlebox interference has been detected over this subflow, making MPTCP signaling invalid. Fo
been detected over this subflow making MPTCP signaling invalid. For r
example, this may be sent if the checksum does not validate.</t> example, this may be sent if the checksum does not validate.</li>
</list> </ul>
</t>
</section> </section>
<section anchor="sec_fallback" numbered="true" toc="default">
<section title="Fallback" anchor="sec_fallback"> <name>Fallback</name>
<t>Sometimes, middleboxes will exist on a path that could prevent the op <t>Sometimes, middleboxes will exist on a path that could prevent the
eration of MPTCP. MPTCP has been designed in order to cope with many middlebox m operation of MPTCP. MPTCP has been designed to cope with many
odifications (see <xref target="sec_middleboxes"/>), but there are still some ca middlebox modifications (see <xref target="sec_middleboxes"
ses where a subflow could fail to operate within the MPTCP requirements. These c format="default"/>), but there are still some cases where a subflow
ases are notably the following: the loss of MPTCP options on a path, and the mod could fail to operate within the MPTCP requirements. Notably, these case
ification of payload data. If such an event occurs, it is necessary to "fall bac s are the following: the loss of MPTCP options on a path and the modification of
k" to the previous, safe operation. This may be either falling back to regular T payload data. If such an event occurs, it is necessary to "fall back" to the pr
CP or removing a problematic subflow.</t> evious, safe operation. This may be either falling back to regular TCP or removi
ng a problematic subflow.</t>
<t>At the start of an MPTCP connection (i.e., the first subflow), it is <t>At the start of an MPTCP connection (i.e., the first subflow), it is
important to ensure that the path is fully MPTCP capable and the necessary MPTCP important to ensure that the path is fully MPTCP capable and the necessary MPTCP
options can reach each host. The handshake as described in <xref target="sec_in options can reach each host. The handshake as described in <xref target="sec_in
it"/> SHOULD fall back to regular TCP if either of the SYN messages do not have it" format="default"/> <bcp14>SHOULD</bcp14> fall back to regular TCP if either
the MPTCP options: this is the same, and desired, behavior in the case where a h of the SYN messages does not have the MPTCP options: this is the same, and desir
ost is not MPTCP capable, or the path does not support the MPTCP options. When a ed, behavior in the case where a host is not MPTCP capable or the path does not
ttempting to join an existing MPTCP connection (<xref target="sec_join"/>), if a support the MPTCP options. When attempting to join an existing MPTCP connection
path is not MPTCP capable and the MPTCP options do not get through on the SYNs, (<xref target="sec_join" format="default"/>), if a path is not MPTCP capable and
the subflow will be closed according to the MP_JOIN logic.</t> the MPTCP options do not get through on the SYNs, the subflow will be closed ac
cording to the MP_JOIN logic.</t>
<t>There is, however, another corner case that should be addressed. That <t>There is, however, another corner case that should be addressed:
is one of MPTCP options getting through on the SYN, but not on regular packets. the case where MPTCP options get through on the SYN but not on regular
This can be resolved if the subflow is the first subflow, and thus all data in packets. If the subflow is the first subflow and thus all data in
flight is contiguous, using the following rules.</t> flight is contiguous, this situation can be resolved by using the follow
ing rules:</t>
<t>A sender MUST include a DSS option with data sequence mapping in ever <ul spacing="normal">
y segment until one of the sent segments has been acknowledged with a DSS option <li>A sender <bcp14>MUST</bcp14> include a DSS option with Data Sequence Mapping
containing a Data ACK. Upon reception of the acknowledgment, the sender has the in every segment until one of the sent segments has been acknowledged with a DS
confirmation that the DSS option passes in both directions and may choose to se S option containing a Data ACK. Upon reception of the acknowledgment, the sender
nd fewer DSS options than once per segment.</t> has the confirmation that the DSS option passes in both directions and may choo
se to send fewer DSS options than once per segment.</li>
<t>If, however, an ACK is received for data (not just for the SYN) witho <li>If, however, an ACK is received for data (not just for the SYN)
ut a DSS option containing a Data ACK, the sender determines the path is not MPT without a DSS option containing a Data ACK, the sender determines that t
CP capable. In the case of this occurring on an additional subflow (i.e., one st he path is not MPTCP capable. In the case of this occurring on an additional sub
arted with MP_JOIN), the host MUST close the subflow with a RST, which SHOULD co flow (i.e., one started with MP_JOIN), the host <bcp14>MUST</bcp14> close the su
ntain a MP_TCPRST option (<xref target="sec_reset"/>) with a "Middlebox interfer bflow with a RST, which <bcp14>SHOULD</bcp14> contain an MP_TCPRST option (<xref
ence" reason code.</t> target="sec_reset" format="default"/>) with a "Middlebox interference" reason c
ode.</li>
<t>In the case of such an ACK being received on the first subflow (i.e., <li>In the case of such an ACK being received on the first subflow
that started with MP_CAPABLE), before any additional subflows are added, the im (i.e., that started with MP_CAPABLE), before any additional subflows
plementation MUST drop out of an MPTCP mode, back to regular TCP. The sender wil are added, the implementation <bcp14>MUST</bcp14> drop out of MPTCP
l send one final data sequence mapping, with the Data-Level Length value of 0 in mode and fall back to regular TCP. The sender will send one final Data S
dicating an infinite mapping (to inform the other end in case the path drops opt equence Mapping, with the Data-Level Length value of 0 indicating an infinite ma
ions in one direction only), and then revert to sending data on the single subfl pping (to inform the other end in case the path drops options in one direction o
ow without any MPTCP options.</t> nly), and then revert to sending data on the single subflow without any MPTCP op
tions.</li>
<t>If a subflow breaks during operation, e.g. if it is re-routed and MPT <li>If a subflow breaks during operation, e.g., if it is rerouted and
CP options are no longer permitted, then once this is detected (by the subflow-l MPTCP options are no longer permitted, then once this is detected (by
evel receive buffer filling up, since there is no mapping available in order to the subflow-level receive buffer filling up, since there is no mapping
DATA_ACK this data), the subflow SHOULD be treated as broken and closed with a R available in order to DATA_ACK this data), the subflow
ST, since no data can be delivered to the application layer, and no fallback sig <bcp14>SHOULD</bcp14> be treated as broken and closed with a RST,
nal can be reliably sent. This RST SHOULD include the MP_TCPRST option (<xref ta since no data can be delivered to the application layer and no
rget="sec_reset"/>) with a "Middlebox interference" reason code.</t> fallback signal can be reliably sent. This RST <bcp14>SHOULD</bcp14>
include the MP_TCPRST option (<xref target="sec_reset"
<t>These rules should cover all cases where such a failure could happen: format="default"/>) with a "Middlebox interference" reason code.</li>
whether it's on the forward or reverse path and whether the server or the clien </ul>
t first sends data.</t> <t>These rules should cover all cases where such a failure could
happen -- whether it's on the forward or reverse path and whether the se
<t>So far this section has discussed the loss of MPTCP options, either i rver or the client first sends data.</t>
nitially, or during the course of the connection. As described in <xref target=" <t>So far, this section has discussed the loss of MPTCP options,
sec_generalop"/>, each portion of data for which there is a mapping is protected either initially or during the course of the connection. As described
by a checksum, if checksums have been negotiated. This mechanism is used to det in <xref target="sec_generalop" format="default"/>, each portion of
ect if middleboxes have made any adjustments to the payload (added, removed, or data for which there is a mapping is protected by a checksum, if
changed data). A checksum will fail if the data has been changed in any way. Thi checksums have been negotiated. This mechanism is used to detect if
s will also detect if the length of data on the subflow is increased or decrease middleboxes have made any adjustments to the payload (added, removed,
d, and this means the data sequence mapping is no longer valid. The sender no lo or changed data). A checksum will fail if the data has been changed in
nger knows what subflow-level sequence number the receiver is genuinely operatin any way. The use of a checksum will also detect whether the length of da
g at (the middlebox will be faking ACKs in return), and it cannot signal any fur ta on the subflow is
ther mappings. Furthermore, in addition to the possibility of payload modificati increased or decreased, and this means the Data Sequence Mapping is no
ons that are valid at the application layer, there is the possibility that such longer valid. The sender no longer knows what subflow-level sequence
modifications could be triggered across MPTCP segment boundaries, corrupting the number the receiver is genuinely operating at (the middlebox will be
data. Therefore, all data from the start of the segment that failed the checksu faking ACKs in return), and it cannot signal any further
m onwards is not trustworthy.</t> mappings. Furthermore, in addition to the possibility of payload
modifications that are valid at the application layer, it is possible th
<t>Note that if checksum usage has not been negotiated, this fallback me at such modifications could be triggered across MPTCP segment boundaries, corrup
chanism cannot be used unless there is some higher or lower layer signal to info ting the data. Therefore, all data from the start of the segment that failed the
rm the MPTCP implementation that the payload has been tampered with.</t> checksum onward is not trustworthy.</t>
<t>Note that if checksum usage has not been negotiated, this fallback me
<t>When multiple subflows are in use, the data in flight on a subflow wi chanism cannot be used unless there is some higher-layer or lower&#8209;layer si
ll likely involve data that is not contiguously part of the connection-level str gnal to inform the MPTCP implementation that the payload has been tampered with.
eam, since segments will be spread across the multiple subflows. Due to the prob </t>
lems identified above, it is not possible to determine what adjustment has done <t>When multiple subflows are in use, the data in flight on a subflow
to the data (notably, any changes to the subflow sequence numbering). Therefore, will likely involve data that is not contiguously part of the
it is not possible to recover the subflow, and the affected subflow must be imm connection-level stream, since segments will be spread across the
ediately closed with a RST, featuring an MP_FAIL option (<xref target="tcpm_fall multiple subflows. Due to the problems identified above, it is not
back"/>), which defines the data sequence number at the start of the segment (de possible to determine what adjustments have been done to the data (notab
fined by the data sequence mapping) that had the checksum failure. Note that the ly,
MP_FAIL option requires the use of the full 64-bit sequence number, even if 32- any changes to the subflow sequence numbering). Therefore, it is not
bit sequence numbers are normally in use in the DSS signals on the path.</t> possible to recover the subflow, and the affected subflow must be
immediately closed with a RST that includes an MP_FAIL option (<xref tar
<?rfc needLines='8'?> get="tcpm_fallback" format="default"/>), which defines the data sequence number
<figure align="center" anchor="tcpm_fallback" title="Fallback (MP_FAIL) at the start of the segment (defined by the Data Sequence Mapping) that had the
Option"> checksum failure. Note that the MP_FAIL option requires the use of the full 64-b
<artwork align="left"><![CDATA[ it sequence number, even if 32-bit sequence numbers are normally in use in the D
1 2 3 SS signals on the path.</t>
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 <figure anchor="tcpm_fallback">
+---------------+---------------+-------+----------------------+ <name>Fallback (MP_FAIL) Option</name>
| Kind | Length=12 |Subtype| (reserved) | <artwork align="left" name="" type="" alt=""><![CDATA[
+---------------+---------------+-------+----------------------+ 1 2 3
| | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
| Data Sequence Number (8 octets) | +---------------+---------------+-------+----------------------+
| | | Kind | Length=12 |Subtype| (reserved) |
+--------------------------------------------------------------+ +---------------+---------------+-------+----------------------+
| |
]]></artwork> | Data Sequence Number (8 octets) |
| |
+--------------------------------------------------------------+ ]]></artwork>
</figure> </figure>
<t>The receiver of this option <bcp14>MUST</bcp14> discard all data foll
owing the data sequence number specified.
Failed data <bcp14>MUST NOT</bcp14> be DATA_ACKed and so will be retrans
mitted on other subflows (<xref target="sec_retransmit" format="default"/>). </t
>
<t>A special case is when there is a single subflow and it fails with a
checksum error. If it is known that all unacknowledged data in
flight is contiguous (which will usually be the case with a single
subflow), an infinite mapping can be applied to the subflow without
the need to close it first, essentially turning off all further
MPTCP signaling.
<t>The receiver of this option MUST discard all data following the data In this case, if a receiver identifies a checksum failure
sequence number specified.
Failed data MUST NOT be DATA_ACKed and so will be retransmitted on other
subflows (<xref target="sec_retransmit"/>). </t>
<t>A special case is when there is a single subflow and it fails with a
checksum error.
If it is known that all unacknowledged data in flight is
contiguous (which will usually be the case with a single subflow), an infinite m
apping can be applied to the subflow without the need to close it first, and
essentially turn off all further MPTCP signaling. In this case, if a receiver id
entifies a checksum failure
when there is only one path, it will send back an MP_FAIL option on the subflow-level ACK, referring to the data-level sequence number of the start of the when there is only one path, it will send back an MP_FAIL option on the subflow-level ACK, referring to the data-level sequence number of the start of the
segment on which the checksum error was detected. The sender will receive segment on which the checksum error was detected. The sender will receive
this, and if all unacknowledged data in flight is contiguous, will signal an inf this information and, if all unacknowledged data in flight is contiguous, will s
inite mapping. ignal an infinite mapping.
This infinite mapping will be a DSS option (<xref target="sec_generalop"/>) This infinite mapping will be a DSS option (<xref target="sec_generalop" format=
on the first new packet, containing a data sequence mapping that acts retroactiv "default"/>)
ely, referring to the start of the subflow sequence on the first new packet, containing a Data Sequence Mapping that acts retroactiv
number of the most recent segment that was known to be delivered intact (i.e. wa ely, referring to the start of the subflow sequence
s successfully DATA_ACKed). From that point onwards, data can be altered number of the most recent segment that was known to be delivered intact (i.e., w
as successfully DATA_ACKed). From that point onward, data can be altered
by a middlebox without affecting MPTCP, as the data stream is equivalent to a regular, legacy TCP session. by a middlebox without affecting MPTCP, as the data stream is equivalent to a regular, legacy TCP session.
Whilst in theory paths may only be damaged in one direction, and the MP_FAIL sig While in theory paths may only be damaged in one direction -- and the MP_FAIL
nal affects only one direction of traffic, signal affects only one direction of traffic --
for implementation simplicity, the receiver of an MP_FAIL MUST also respond with for simplicity of implementation, the receiver of an MP_FAIL <bcp14>MUST</bcp14>
an MP_FAIL in the reverse direction and entirely revert to a regular TCP sessio also respond with an MP_FAIL in the reverse direction and entirely revert to a
n.</t> regular TCP session.</t>
<t>In the rare case that the data is not contiguous (which could happen when there is only one subflow but it is retransmitting data from a subflow <t>In the rare case that the data is not contiguous (which could happen when there is only one subflow but it is retransmitting data from a subflow
that has recently been uncleanly closed), the receiver MUST close the subflow wi that has recently been uncleanly closed), the receiver <bcp14>MUST</bcp14> close
th a RST with MP_FAIL. The receiver MUST discard all data that follows the the subflow with a RST with MP_FAIL. The receiver <bcp14>MUST</bcp14> discard a
data sequence number specified. The sender MAY attempt to create a new subflow b ll data that follows the
elonging to the same connection, and, if it chooses to do so, SHOULD place data sequence number specified. The sender <bcp14>MAY</bcp14> attempt to
the single subflow immediately in single-path mode by setting an infinite data s create a new subflow belonging to the same connection and, if it chooses to do
equence mapping. This mapping will begin from the data-level sequence number so, <bcp14>SHOULD</bcp14> immediately place
the single subflow in single-path mode by setting an infinite Data Sequence Mapp
ing. This mapping will begin from the data-level sequence number
that was declared in the MP_FAIL.</t> that was declared in the MP_FAIL.</t>
<t>After a sender signals an infinite mapping, it <bcp14>MUST</bcp14> on
<t>After a sender signals an infinite mapping, it MUST only use subflow ly use subflow ACKs to clear its send buffer.
ACKs to clear its send buffer.
This is because Data ACKs may become misaligned with the subflow ACKs when middleboxes insert or delete data. This is because Data ACKs may become misaligned with the subflow ACKs when middleboxes insert or delete data.
The receive SHOULD stop generating Data ACKs after it receives an infinite mappi The receiver <bcp14>SHOULD</bcp14> stop generating Data ACKs after it receives
ng. </t> an infinite mapping.</t>
<t>When a connection has fallen back with an infinite mapping, only one
<t>When a connection has fallen back with an infinite mapping, only one subflow can send data; otherwise, the receiver would not know how to reorder the
subflow can send data; otherwise, the receiver would not know how to reorder the data. In practice, this means that all MPTCP subflows will have to be terminate
data. In practice, this means that all MPTCP subflows will have to be terminate d except one. Once MPTCP falls back to regular TCP, it <bcp14>MUST NOT</bcp14> r
d except one. Once MPTCP falls back to regular TCP, it MUST NOT revert to MPTCP evert to MPTCP later in the connection.</t>
later in the connection.</t>
<t>It should be emphasized that MPTCP is not attempting to prevent the use of middleboxes that want to adjust the payload. An MPTCP-aware middlebox could provide such functionality by also rewriting checksums.</t> <t>It should be emphasized that MPTCP is not attempting to prevent the use of middleboxes that want to adjust the payload. An MPTCP-aware middlebox could provide such functionality by also rewriting checksums.</t>
</section> </section>
<section anchor="sec_errors" numbered="true" toc="default">
<section title="Error Handling" anchor="sec_errors"> <name>Error Handling</name>
<t>In addition to the fallback mechanism as described above, the standar <t>In addition to the fallback mechanism described above, the standard c
d classes of TCP errors may need to be handled in an MPTCP-specific way. Note th lasses of TCP errors may need to be handled in an MPTCP&#8209;specific way. Note
at changing semantics -- such as the relevance of a RST -- are covered in <xref that changing semantics -- such as the relevance of a RST -- are covered in <xr
target="sec_semantics"/>. Where possible, we do not want to deviate from regular ef target="sec_semantics" format="default"/>. Where possible, we do not want to
TCP behavior.</t> deviate from regular TCP behavior.</t>
<t>The following list covers possible errors and the appropriate MPTCP behavior: <t>The following list covers possible errors and the appropriate MPTCP behavior:
<list style="symbols">
<t>Unknown token in MP_JOIN (or HMAC failure in MP_JOIN ACK, or miss
ing MP_JOIN in SYN/ACK response): send RST (analogous to TCP's behavior on an un
known port)</t>
<t>DSN out of window (during normal operation): drop the data, do no
t send Data ACKs</t>
<t>Remove request for unknown address ID: silently ignore</t>
</list>
</t> </t>
<ul spacing="normal">
<li>Unknown token in MP_JOIN (or HMAC failure in MP_JOIN ACK, or missi
ng MP_JOIN in SYN/ACK response): send RST (analogous to TCP's behavior on an unk
nown port)</li>
<li>DSN out of window (during normal operation): drop the data; do not
send Data ACKs</li>
<li>Remove request for unknown Address ID: silently ignore</li>
</ul>
</section> </section>
<section anchor="heuristics" numbered="true" toc="default">
<section title="Heuristics" anchor="heuristics"> <name>Heuristics</name>
<t>There are a number of heuristics that are needed for <t>There are a number of heuristics that are needed for
performance or deployment but that are not required for performance or deployment but that are not required for
protocol correctness. In this section, we detail such protocol correctness. In this section, we detail such
heuristics. Note that discussion of buffering and certain heuristics. Note that discussions of buffering and certain
sender and receiver window behaviors are presented in Sections sender and receiver window behaviors are presented in Sections
<xref target="sec_rwin" format="counter"/> and <xref target="sec_sender" <xref target="sec_rwin" format="counter"/> and <xref
format="counter"/>, target="sec_sender" format="counter"/>,
as well as retransmission in <xref target="sec_retransmit"/>.</t> and retransmission is discussed in <xref target="sec_retransmit" format=
"default"/>.</t>
<section title="Port Usage"> <section numbered="true" toc="default">
<t>Under typical operation, an MPTCP implementation SHOULD use <name>Port Usage</name>
the same ports as already in use. In other words, the <t>Under typical operation, an MPTCP implementation <bcp14>SHOULD</bcp
destination port of a SYN containing an MP_JOIN option SHOULD 14> use
the same ports as the ports that are already in use. In other words, t
he
destination port of a SYN containing an MP_JOIN option <bcp14>SHOULD</
bcp14>
be the same as the remote port of the first subflow in the be the same as the remote port of the first subflow in the
connection. The local port for such SYNs SHOULD also be the connection. The local port for such SYNs <bcp14>SHOULD</bcp14> also b
same as for the first subflow (and as such, an e the
implementation SHOULD reserve ephemeral ports across all same as the port for the first subflow (and as such, an
implementation <bcp14>SHOULD</bcp14> reserve ephemeral ports across al
l
local IP addresses), although there may be cases where this local IP addresses), although there may be cases where this
is infeasible. This strategy is intended to maximize the is infeasible. This strategy is intended to maximize the
probability of the SYN being permitted by a firewall or NAT probability of the SYN being permitted by a firewall or NAT
at the recipient and to avoid confusing any network at the recipient and to avoid confusing any network-monitoring softwar
monitoring software.</t> e.</t>
<t>There may also be cases, however, where a host wishes to <t>There may also be cases, however, where a host wishes to
signal that a specific port should be used, and this facility signal that a specific port should be used; this facility
is provided in the ADD_ADDR option as documented in is provided in the ADD_ADDR option as documented in
<xref target="sec_add_address"/>. It is therefore feasible <xref target="sec_add_address" format="default"/>. It is therefore feasible
to allow multiple subflows between the same two addresses to allow multiple subflows between the same two addresses
but using different port pairs, and but using different port pairs, and
such a facility could be used to allow load balancing within such a facility could be used to allow load balancing within
the network based on 5-tuples (e.g., some ECMP implementations <xref target="RFC2992"/>).</t> the network based on 5-tuples (e.g., some ECMP implementations <xref target="RFC2992" format="default"/>).</t>
</section> </section>
<section numbered="true" toc="default">
<section title="Delayed Subflow Start and Subflow Symmetry"> <name>Delayed Subflow Start and Subflow Symmetry</name>
<t>Many TCP connections are short-lived and consist only of a few <t>Many TCP connections are short-lived and consist only of a few
segments, and so the overheads segments, and so the overhead
of using MPTCP outweigh any benefits. A heuristic is required, of using MPTCP outweighs any benefits. A heuristic is required,
therefore, to decide when to start using additional subflows in therefore, to decide when to start using additional subflows in
an MPTCP connection. Experimental deployments have shown that an MPTCP connection. Experimental deployments have shown that
MPTCP can be applied in a range of scenarios so an implementation MPTCP can be applied in a range of scenarios, so an implementation
is likely to need to take into account factors including the type of will likely need to take into account such factors as the type of
traffic being sent and duration of session, and this information traffic being sent and the duration of the session; this information
MAY be signalled by the application layer.</t> <bcp14>MAY</bcp14> be signaled by the application layer.</t>
<t>However, for standard TCP traffic, a suggested general-purpose <t>However, for standard TCP traffic, a suggested general-purpose
heuristic that an implementation MAY choose to employ is as follows.</ heuristic that an implementation <bcp14>MAY</bcp14> choose to employ i
t> s as follows.</t>
<t>If a host has data buffered for its peer (which implies that the <t>If a host has data buffered for its peer (which implies that the
application has received a request for data), the host opens one application has received a request for data), the host opens one
subflow for each initial window's worth of data that is buffered.</t> subflow for each initial window's worth of data that is buffered.</t>
<t>Consideration should also be given to limiting the rate of adding <t>Consideration should also be given to limiting the rate of adding
new subflows, as well as limiting the total number of subflows open new subflows, as well as limiting the total number of subflows open
for a particular connection. A host may choose to vary these values for a particular connection. A host may choose to vary these values
based on its load or knowledge of traffic and path characteristics.</t> based on its load or knowledge of traffic and path characteristics.</t>
<t>Note that this heuristic alone is probably insufficient. Traffic <t>Note that this heuristic alone is probably insufficient. Traffic
for many common applications, such as downloads, is highly asymmetric and for many common applications, such as downloads, is highly asymmetric, and
the host that is multihomed may well be the client that will never fill the host that is multihomed may well be the client that will never fill
its buffers, and thus never use MPTCP according to this heuristic. Advanced APIs that allow an its buffers and thus never use MPTCP according to this heuristic. Advanced APIs that allow an
application to signal its traffic requirements would aid in these decisions.</t> application to signal its traffic requirements would aid in these decisions.</t>
<t>An additional time-based heuristic could be applied, opening additional <t>An additional time-based heuristic could be applied, opening additional
subflows after a given period of time has passed. This would alleviate the subflows after a given period of time has passed. This would alleviate the
above issue, and also provide resilience for low-bandwidth but long-lived above issue and also provide resilience for low&#8209;bandwidth but long-lived
applications.</t> applications.</t>
<t>Another issue is that both communicating hosts may simultaneously try to <t>Another issue is that both communicating hosts may simultaneously try to
set up a subflow between the same pair of addresses. This leads to an set up a subflow between the same pair of addresses. This leads to an
inefficient use of resources.</t> inefficient use of resources.</t>
<t>If the same ports are used on all subflows, as recommended above, <t>If the same ports are used on all subflows, as recommended above,
then standard TCP simultaneous open logic should take care of this situation then standard TCP simultaneous-open logic should take care of this situation
and only one subflow will be established between the address pairs. However, and only one subflow will be established between the address pairs. However,
this relies on the same ports being used at both end hosts. If a host does this relies on the same ports being used at both end hosts. If a host does
not support TCP simultaneous open, it is RECOMMENDED that some element not support TCP simultaneous open, it is <bcp14>RECOMMENDED</bcp14> th
of randomization is applied to the time to wait before opening new sub at some element
flows, of randomization be applied to the time to wait before opening new sub
flows,
so that only one subflow is created between a given address pair. If, however, so that only one subflow is created between a given address pair. If, however,
hosts signal additional ports to use (for example, for leveraging ECMP on-path), hosts signal additional ports to use (for example, for leveraging ECMP on-path),
this heuristic is not appropriate.</t> this heuristic is not appropriate.</t>
<t>This section has shown some of the factors that an implementer
<t>This section has shown some of the considerations that an implement should consider when developing MPTCP heuristics, but it is not intend
er ed to be
should give when developing MPTCP heuristics, but is not intended to b
e
prescriptive.</t> prescriptive.</t>
</section> </section>
<section numbered="true" toc="default">
<section title="Failure Handling"> <name>Failure Handling</name>
<t>Requirements for MPTCP's handling of unexpected signals have been <t>Requirements for MPTCP's handling of unexpected signals are
given in <xref target="sec_errors"/>. There are other failure cases, given in <xref target="sec_errors" format="default"/>. There are other
however, where a hosts can choose appropriate behavior.</t> failure cases,
however, where hosts can choose appropriate behavior.</t>
<t>For example, <xref target="sec_init"/> suggests that a host SHOULD <t>For example, <xref target="sec_init" format="default"/> suggests th
at a host <bcp14>SHOULD</bcp14>
fall back to trying regular TCP SYNs after one or more failures of MPTCP fall back to trying regular TCP SYNs after one or more failures of MPTCP
SYNs for a connection. A host may keep a system-wide cache of such SYNs for a connection. A host may keep a system-wide cache of such
information, so that it can back off from using MPTCP, firstly for that information, so that it can back off from using MPTCP, firstly for that
particular destination host, and eventually on a whole interface, if particular destination host and, eventually, on a whole interface, if
MPTCP connections continue failing. The duration of such a cache would MPTCP connections continue to fail. The duration of such a cache would
be implementation-specific.</t> be implementation specific.</t>
<t>Another failure could occur when the MP_JOIN handshake fails. <t>Another failure could occur when the MP_JOIN handshake fails.
<xref target="sec_errors"/> specifies that an incorrect handshake MUST <xref target="sec_errors" format="default"/> specifies that an incorrect handshake <bcp14>MUST</bcp14>
lead to the subflow being closed with a RST. A host operating an active lead to the subflow being closed with a RST. A host operating an active
intrusion detection system may choose to start blocking MP_JOIN packets intrusion-detection system may choose to start blocking MP_JOIN packets
from the source host if multiple failed MP_JOIN attempts are seen. From from the source host if multiple failed MP_JOIN attempts are seen. From
the connection initiator's point of view, if an MP_JOIN fails, it SHOU the connection initiator's point of view, if an MP_JOIN fails, it
LD <bcp14>SHOULD NOT</bcp14>
NOT attempt to connect to the same IP address and port during the life attempt to connect to the same IP address and port during the lifetime
time
of the connection, unless the other host refreshes the information with of the connection, unless the other host refreshes the information with
another ADD_ADDR option. Note that the ADD_ADDR option is informational another ADD_ADDR option. Note that the ADD_ADDR option is informational
only, and does not guarantee the other host will attempt a connection. only and does not guarantee that the other host will attempt a connect
</t> ion.</t>
<t>In addition, an implementation may learn, over a number of connections, <t>In addition, an implementation may learn, over a number of connections,
that certain interfaces or destination addresses consistently fail and that certain interfaces or destination addresses consistently fail and
may default to not trying to use MPTCP for these. Behavior could also may default to not trying to use MPTCP for such interfaces or
be learned for particularly badly performing subflows or subflows that addresses. The behavior of subflows that perform particularly badly
regularly fail during use, in order to temporarily choose not to use or subflows that regularly fail during use could also
be learned, so that an implementation can temporarily choose not to us
e
these paths.</t> these paths.</t>
</section> </section>
</section> </section>
</section> </section>
<section anchor="sec_semantics" numbered="true" toc="default">
<section title="Semantic Issues" anchor="sec_semantics"> <name>Semantic Issues</name>
<t>In order to support multipath operation, the semantics of some TCP comp <t>In order to support multipath operation, the semantics of some TCP
onents have changed. To aid clarity, this section collects these semantic change components have changed. To help clarify, this section lists these
s as a reference. semantic changes as a point of reference.
<list style="hanging"> </t>
<t hangText="Sequence number:"> The (in-header) TCP sequence <dl newline="false" spacing="normal" indent="3">
<dt>Sequence number:</dt>
<dd> The (in-header) TCP sequence
number is specific to the subflow. To allow the receiver to number is specific to the subflow. To allow the receiver to
reorder application data, an additional data-level reorder application data, an additional data-level
sequence space is used. In this data-level sequence space, the initi sequence space is used. In this data&#8209;level sequence space, the
al SYN and initial SYN and
the final DATA_FIN occupy 1 octet of sequence space. This is to ensu the final DATA_FIN occupy 1 octet of sequence space. This is done to
re these ensure that these
signals are acknowledged at the connection level. There is an explicit signals are acknowledged at the connection level. There is an explicit
mapping of data sequence space to subflow sequence space, mapping of data sequence space to subflow sequence space,
which is signaled through TCP options in data which is signaled through TCP options in data
packets.</t> packets.</dd>
<dt>ACK:</dt>
<t hangText="ACK:"> The ACK field in the TCP header <dd> The ACK field in the TCP header
acknowledges only the subflow sequence number, not the acknowledges only the subflow sequence number -- not the
data-level sequence space. Implementations SHOULD NOT data-level sequence space. Implementations <bcp14>SHOULD NOT</bcp14>
attempt to infer a data-level acknowledgment from the attempt to infer a data-level acknowledgment from the
subflow ACKs. subflow ACKs.
This separates subflow- and connection-level processing This separates subflow-level and connection-level processing
at an end host.</t> at an end host.</dd>
<dt>Duplicate ACK:</dt>
<t hangText="Duplicate ACK:"> A duplicate ACK that includes any MPTCP <dd> A duplicate ACK that includes any MPTCP signaling
signaling (with the exception of the DSS option) <bcp14>MUST NOT</bcp14> be tr
(with the exception of the DSS option) MUST NOT be treated as a sign eated as a signal of congestion.
al of congestion.
To limit the chances of non-MPTCP-aware entities mistakenly interpreting duplicate To limit the chances of non-MPTCP-aware entities mistakenly interpreting duplicate
ACKs as a signal of congestion, MPTCP SHOULD NOT send more than two ACKs as a signal of congestion, MPTCP <bcp14>SHOULD NOT</bcp14> send
duplicate ACKs more than two duplicate ACKs
containing (non-DSS) MPTCP signals in a row.</t> containing (non-DSS) MPTCP signals in a row.</dd>
<dt>Receive Window:</dt>
<t hangText="Receive Window:">The receive window in the TCP <dd>The receive window in the TCP
header indicates the amount of free buffer space for the header indicates the amount of free buffer space for the
whole data-level connection (as opposed to for this whole data-level connection (as opposed to the amount of space for t
subflow) that is available at the receiver. This is the his
same semantics as regular TCP, but to maintain these subflow) that is available at the receiver. The
semantics are the same as for regular TCP, but to maintain these
semantics the receive window must be interpreted at the semantics the receive window must be interpreted at the
sender as relative to the sequence number given in the sender as relative to the sequence number given in the
DATA_ACK rather than the subflow ACK in the TCP header. DATA_ACK rather than the subflow ACK in the TCP header.
In this way, the original flow control role is preserved. In this way, the original role of flow control is preserved.
Note that some middleboxes may change the receive window, Note that some middleboxes may change the receive window,
and so a host SHOULD use the maximum value of those recently and so a host <bcp14>SHOULD</bcp14> use the maximum value of those recently
seen on the constituent subflows for the connection-level seen on the constituent subflows for the connection-level
receive window, and also needs to maintain a subflow-level receive window and also needs to maintain a subflow-level
window for subflow-level processing.</t> window for subflow-level processing.</dd>
<dt>FIN:</dt>
<t hangText="FIN:"> The FIN flag in the TCP header applies <dd> The FIN flag in the TCP header applies
only to the subflow it is sent on, not to the whole only to the subflow it is sent on -- not to the whole
connection. For connection-level FIN semantics, the connection. For connection-level FIN semantics, the
DATA_FIN option is used.</t> DATA_FIN option is used.</dd>
<dt>RST:</dt>
<t hangText="RST:"> The RST flag in the TCP header applies <dd> The RST flag in the TCP header applies
only to the subflow it is sent on, not to the whole only to the subflow it is sent on -- not to the whole
connection. The MP_FASTCLOSE option provides the fast close connection. The MP_FASTCLOSE option provides the Fast Close
functionality of a RST at the MPTCP connection level.</t> functionality of a RST at the MPTCP connection level.</dd>
<dt>Address List:</dt>
<t hangText="Address List:"> Address list management (i.e., <dd> Address list management (i.e.,
knowledge of the local and remote hosts' lists of knowledge of the local and remote hosts' lists of
available IP addresses) is handled available IP addresses) is handled
on a per-connection basis (as opposed to per subflow, per on a per-connection basis (as opposed to per subflow, per
host, or per pair of communicating hosts). This permits host, or per pair of communicating hosts). This permits
the application of per-connection local policy. Adding an the application of per-connection local policy. Adding an
address to one connection (either explicitly through an Add address to one connection (either explicitly through an
Address message, or implicitly through a Join) has no implication ADD_ADDR message or implicitly through an MP_JOIN) has no implicatio
for other connections between the same pair of hosts.</t> ns
for other connections between the same pair of hosts.</dd>
<t hangText="5-tuple:"> The 5-tuple (protocol, local <dt>5-tuple:</dt>
<dd> The 5-tuple (protocol, local
address, local port, remote address, remote port) address, local port, remote address, remote port)
presented by kernel APIs to the application layer in a presented by kernel APIs to the application layer in a
non-multipath-aware application is that of the first non-multipath-aware application is that of the first
subflow, even if the subflow has since been closed and subflow, even if the subflow has since been closed and
removed from the connection. This decision, and other removed from the connection. This decision, and other
related API issues, are discussed in more detail in related API issues, are discussed in more detail in
<xref target="RFC6897"/>.</t> <xref target="RFC6897" format="default"/>.</dd>
</list> </dl>
</t>
</section> </section>
<section anchor="sec_security" numbered="true" toc="default">
<section title="Security Considerations" anchor="sec_security"> <name>Security Considerations</name>
<t>As identified in <xref target="RFC6181"/>, the addition of multipath ca <t>As identified in <xref target="RFC6181" format="default"/>, the
pability to TCP will bring with it a number of new classes of threat. In order t addition of multipath capability to TCP will bring with it a number of
o prevent these, <xref target="RFC6182"/> presents a set of requirements for a s new classes of threats. In order to prevent these threats, <xref target="R
ecurity solution for MPTCP. The fundamental goal is for the security of MPTCP to FC6182"
be "no worse" than regular TCP today, and the key security requirements are: format="default"/> presents a set of requirements for a security
<list style="symbols"> solution for MPTCP. The fundamental goal is for the security of MPTCP to
<t>Provide a mechanism to confirm that the parties in a subflow handsh be "no worse" than regular TCP today. The key security requirements
ake are the same as in the original connection setup.</t> are as follows:
<t>Provide verification that the peer can receive traffic at a new add </t>
ress before using it as part of a connection.</t> <ul spacing="normal">
<t>Provide replay protection, i.e., ensure that a request to add/remov <li>Provide a mechanism to confirm that the parties in a subflow
e a subflow is 'fresh'.</t> handshake are the same as the parties in the original connection setup.<
</list> /li>
<li>Provide verification that the peer can receive traffic at a new addr
In order to achieve these goals, MPTCP includes a hash-based handshake a ess before using it as part of a connection.</li>
lgorithm documented in Sections <xref target="sec_init" format="counter"/> and < <li>Provide replay protection, i.e., ensure that a request to add&wj;/re
xref target="sec_join" format="counter"/>.</t> move a subflow is "fresh".</li>
</ul>
<t>The security of the MPTCP connection hangs on the use of keys that are <t>
shared once at the start of the first subflow, and are never sent again over the In order to achieve these goals, MPTCP includes a hash-based handshake
network (unless used in the fast close mechanism, <xref target="sec_fastclose"/ algorithm, as documented in Sections <xref target="sec_init" format="count
>). To ease demultiplexing while not giving away any cryptographic material, fu er"/> and <xref target="sec_join" format="counter"/>.</t>
ture subflows use a truncated cryptographic hash of this key as the connection i <t>The security of the MPTCP connection hangs on the use of keys that
dentification "token". The keys are concatenated and used as keys for creating are shared once at the start of the first subflow and are never sent
Hash-based Message Authentication Codes (HMACs) used on subflow setup, in order again over the network (unless used in the Fast Close mechanism (<xref
to verify that the parties in the handshake are the same as in the original conn target="sec_fastclose" format="default"/>)). To ease demultiplexing
ection setup. It also provides verification that the peer can receive traffic a while not giving away any cryptographic material, future subflows use a
t this new address. Replay attacks would still be possible when only keys are u truncated cryptographic hash of this key as the connection
sed; therefore, the handshakes use single-use random numbers (nonces) at both en identification "token". The keys are concatenated and used as keys for
ds -- this ensures the HMAC will never be the same on two handshakes. Guidance o creating Hash-based Message Authentication Codes (HMACs) used on subflow
n generating random numbers suitable for use as keys is given in <xref target="R setup, in order to verify that the parties in the handshake are the same
FC4086"/> and discussed in <xref target="sec_init"/>. The nonces are valid for t as the parties in the original connection setup. It also provides verific
he lifetime of the TCP connection attempt. HMAC is also used to secure the ADD_A ation that
DDR option, due to the threats identified in <xref target="RFC7430"/>.</t> the peer can receive traffic at this new address. Replay attacks would
<t>The use of crypto capability bits in the initial connection handshake t still be possible when only keys are used; therefore, the handshakes use
o negotiate use of a particular algorithm allows the deployment of additional cr single-use random numbers (nonces) at both ends -- this ensures that the H
ypto mechanisms in the future. This negotiation would nevertheless be susceptib MAC will never be the same on two handshakes. Guidance on generating random numb
le to a bid-down attack by an on-path active attacker who could modify the crypt ers suitable for use as keys is given in <xref target="RFC4086" format="default"
o capability bits in the response from the receiver to use a less secure crypto /> and discussed in <xref target="sec_init" format="default"/>. The nonces are v
mechanism. The security mechanism presented in this document should therefore pr alid for the lifetime of the TCP connection attempt. HMAC is also used to secure
otect against all forms of flooding and hijacking attacks discussed in <xref tar the ADD_ADDR option, due to the threats identified in <xref target="RFC7430" fo
get="RFC6181"/>.</t> rmat="default"/>.</t>
<t>The use of crypto capability bits in the initial connection handshake
<t>The version negotiation specified in <xref target="sec_init"/>, if diff to negotiate the use of a particular algorithm allows the deployment of ad
ering MPTCP versions shared a common negotiation format, would allow an on-path ditional crypto mechanisms in the future. This negotiation would nevertheless b
attacker to apply a theoretical bid-down attack. Since the v1 and v0 protocols h e susceptible to a bid-down attack by an on-path active attacker who could modif
ave a different handshake, such an attack would require the client to re-establi y the crypto capability bits in the response from the receiver to use a less sec
sh the connection using v0, and this being supported by the server. Note that an ure crypto mechanism. The security mechanism presented in this document should t
on-path attacker would have access to the raw data, negating any other TCP-leve herefore protect against all forms of flooding and hijacking attacks discussed i
l security mechanisms. n <xref target="RFC6181" format="default"/>.</t>
Also a change from RFC6824 has removed the subflow identifier from the MP_ <t>The version negotiation specified in <xref target="sec_init"
PRIO option (<xref target="sec_policy"/>), to remove the theoretical attack wher format="default"/>, if differing MPTCP versions shared a common
e a subflow could be placed in "backup" mode by an attacker.</t> negotiation format, would allow an on-path attacker to apply a
theoretical bid-down attack. Since the v1 and v0 protocols have a
<t>During normal operation, regular TCP protection mechanisms (such as ens different handshake, such an attack would require that the client
uring sequence numbers are in-window) will provide the same level of protection re-establish the connection using v0 and that the server support v0.
against attacks on individual TCP subflows as exists for regular TCP today. Impl Note that an on-path attacker would have access to the raw data, negating any o
ementations will introduce additional buffers compared to regular TCP, to reasse ther TCP-level security mechanisms. As also noted in <xref target="app_changelog
mble data at the connection level. The application of window sizing will minimiz "/>, this document specifies the removal of the AddrID field <xref target="RFC68
e the risk of denial-of-service attacks consuming resources.</t> 24"/> in the MP_PRIO option (<xref target="sec_policy" format="default"/>).
This change eliminates the possibility of a theoretical attack where
<t>As discussed in <xref target="sec_add_address"/>, a host may advertise a subflow could be placed in "backup" mode by an attacker.</t>
its private addresses, but these might point to different hosts in the receiver' <t>During normal operation, regular TCP protection mechanisms (such as
s network. The MP_JOIN handshake (<xref target="sec_join"/>) will ensure that th ensuring that sequence numbers are in-window) will provide the same
is does not succeed in setting up a subflow to the incorrect host. However, it c level of protection against attacks on individual TCP subflows as the
ould still create unwanted TCP handshake traffic. This feature of MPTCP could be level of protection that exists for regular TCP today. Implementations wil
a target for denial-of-service exploits, with malicious participants in MPTCP c l introduce additional buffers compared to regular TCP, to reassemble data at th
onnections encouraging the recipient to target other hosts in the network. There e connection level. The application of window sizing will minimize the risk of d
fore, implementations should consider heuristics (<xref target="heuristics"/>) a enial-of-service attacks consuming resources.</t>
t both the sender and receiver to reduce the impact of this.</t> <t>As discussed in <xref target="sec_add_address" format="default"/>, a ho
st may advertise its private addresses, but these might point to different hosts
in the receiver's network. The MP_JOIN handshake (<xref target="sec_join" forma
t="default"/>) will ensure that this does not succeed in setting up a subflow to
the incorrect host. However, it could still create unwanted TCP handshake traff
ic. This feature of MPTCP could be a target for denial-of-service exploits, with
malicious participants in MPTCP connections encouraging the recipient to target
other hosts in the network. Therefore, implementations should consider heuristi
cs (<xref target="heuristics" format="default"/>) at both the sender and receive
r to reduce the impact of this.</t>
<t>To further protect against malicious ADD_ADDR messages sent by an off-path attacker, the ADD_ADDR includes an HMAC using the keys negotiated during the handshake. This effectively prevents an attacker from diverting an MPTCP connection through an off-path ADD_ADDR injection into the stream.</t> <t>To further protect against malicious ADD_ADDR messages sent by an off-path attacker, the ADD_ADDR includes an HMAC using the keys negotiated during the handshake. This effectively prevents an attacker from diverting an MPTCP connection through an off-path ADD_ADDR injection into the stream.</t>
<t>A small security risk could theoretically exist with key reuse, but in
<t>A small security risk could theoretically exist with key reuse, but in order to accomplish a replay attack, both the sender and receiver keys, and the
order to accomplish a replay attack, both the sender and receiver keys, and the sender and receiver random numbers, in the MP_JOIN handshake (<xref target="sec_
sender and receiver random numbers, in the MP_JOIN handshake (<xref target="sec_ join" format="default"/>) would have to match.</t>
join"/>) would have to match.</t> <t>While this specification defines a "medium" security solution,
meeting the criteria specified at the start of this section and in the
<t>Whilst this specification defines a "medium" security solution, meeting threat analysis document <xref target="RFC6181" format="default"/>, since
the criteria specified at the start of this section and the threat analysis (<x attacks
ref target="RFC6181"/>), since attacks only ever get worse, it is likely that a only ever get worse, it is likely that a future version of MPTCP would
future version of MPTCP would need to be able to support stronger security. Ther need to be able to support stronger security.
e are several ways the security of MPTCP could potentially be improved; some of There are several ways the security of MPTCP could potentially be improved; som
these would be compatible with MPTCP as defined in this document, whilst others e of these would be compatible with MPTCP as defined in this document, while oth
may not be. For now, the best approach is to get experience with the current app ers may not be. For now, the best approach is to gain experience with the curren
roach, establish what might work, and check that the threat analysis is still ac t approach, establish what might work, and check that the threat analysis is sti
curate.</t> ll accurate.</t>
<t>Possible ways of improving MPTCP security could include:</t>
<t>Possible ways of improving MPTCP security could include:<list style="symbols" <ul spacing="normal">
> <li>defining a new MPTCP cryptographic algorithm, as negotiated in
<t>defining a new MPCTP cryptographic algorithm, as negotiated in MP_CAPABLE. A MP_CAPABLE. If an implementation was being deployed in a controlled
sub-case could be to include an additional deployment assumption, such as statef environment where additional assumptions could be made, such as the
ul servers, in order to allow a more powerful algorithm to be used.</t> ability for the servers to store state during the TCP handshake, then
<t>defining how to secure data transfer with MPTCP, whilst not changing the sign it may be possible to use a stronger cryptographic algorithm than
aling part of the protocol.</t> would otherwise be possible.</li>
<t>defining security that requires more option space, perhaps in conjunction wit <li>defining how to secure data transfer with MPTCP, while not changing
h a "long options" proposal for extending the TCP options space (such as those s the signaling part of the protocol.</li>
urveyed in <xref target="TCPLO"/>), or perhaps building on the current approach <li>defining security that requires more option space, perhaps in
with a second stage of MPTCP-option-based security.</t> conjunction with a "long options" proposal for extending the TCP
<t>revisiting the working group's decision to exclusively use TCP options for MP option space (such as those surveyed in <xref
TCP signaling, and instead look at also making use of the TCP payloads.</t> target="I-D.ananth-tcpm-tcpoptext" format="default"/>), or perhaps
</list></t> building on the current approach with a second stage of
security based on MPTCP options.</li>
<t>MPTCP has been designed with several methods available to indicate a new secu <li>revisiting the working group's decision to exclusively use TCP
rity mechanism, including: options for MPTCP signaling and instead looking at the
<list style="symbols"> possibility of using TCP payloads as well.</li>
<t>available flags in MP_CAPABLE (<xref target="tcpm_capable"/>);</t> </ul>
<t>available subtypes in the MPTCP option (<xref target="fig_option"/>);</t> <t>MPTCP has been designed with several methods available to indicate a ne
<t>the version field in MP_CAPABLE (<xref target="tcpm_capable"/>);</t> w security mechanism, including:
</list></t> </t>
<ul spacing="normal">
<li>available flags in MP_CAPABLE (<xref target="tcpm_capable" format="d
efault"/>).</li>
<li>available subtypes in the MPTCP option (<xref target="fig_option" fo
rmat="default"/>).</li>
<li>the Version field in MP_CAPABLE (<xref target="tcpm_capable" format=
"default"/>).</li>
</ul>
</section> </section>
<section anchor="sec_middleboxes" numbered="true" toc="default">
<section title="Interactions with Middleboxes" anchor="sec_middleboxes"> <name>Interactions with Middleboxes</name>
<t>Multipath TCP was designed to be deployable in the present world. Its d
<t>Multipath TCP was designed to be deployable in the present world. Its esign takes into account "reasonable"
design takes into account "reasonable"
existing middlebox behavior. In this section, we outline a few representative middlebox-related failure scenarios and existing middlebox behavior. In this section, we outline a few representative middlebox-related failure scenarios and
show how Multipath TCP handles them. Next, we list the design decisions multipat show how Multipath TCP handles them. Next, we list the design decisions
h has made to accommodate the different Multipath TCP has made to accommodate the different
middleboxes.</t> middleboxes.</t>
<t>A primary concern is our use of a new TCP option. Middleboxes should fo
<t>A primary concern is our use of a new TCP option. Middleboxes should rward packets
forward packets with unknown options unchanged, yet there are some that don't. We expect these
with unknown options unchanged, yet there are some that don't. These we expect w middleboxes to strip options and pass the data,
ill either strip options and pass the data,
drop packets with new options, copy the same option into multiple segments (e.g., when doing segmentation), or drop drop packets with new options, copy the same option into multiple segments (e.g., when doing segmentation), or drop
options during segment coalescing.</t> options during segment coalescing.</t>
<t>MPTCP uses a single new TCP option called "Kind", and all message types
are defined by "subtype" values (see <xref target="IANA" format="default"/>). T
his should reduce the chances of only some types of MPTCP options being passed;
instead, the key differing characteristics are different paths and the presence
of the SYN flag.</t>
<t>MPTCP SYN packets on the first subflow of a connection contain the MP_C
APABLE option (<xref target="sec_init" format="default"/>). If this is dropped,
MPTCP <bcp14>SHOULD</bcp14> fall back to regular TCP. If packets with the MP_JOI
N option (<xref target="sec_join" format="default"/>) are dropped, the paths wil
l simply not be used.</t>
<t>If a middlebox strips options but otherwise passes the packets
unchanged, MPTCP will behave safely. If an MP_CAPABLE option is dropped
on either the outgoing path or the return path, the initiating host can
fall back to regular TCP, as illustrated in <xref target="fig_syn"
format="default"/> and discussed in <xref target="sec_init"
format="default"/>.</t>
<figure anchor="fig_syn">
<name>Connection Setup with Middleboxes That Strip Options from Packets<
/name>
<artwork align="left" name="" type="" alt=""><![CDATA[
Host A Host B
| Middlebox M |
| | |
| SYN (MP_CAPABLE) | SYN |
|-------------------|---------------->|
| SYN/ACK |
|<------------------------------------|
a) MP_CAPABLE option stripped on outgoing path
<t>MPTCP uses a single new TCP option "Kind", and all message types are Host A Host B
defined by "subtype" values (see <xref target="IANA"/>). This should reduce the | SYN (MP_CAPABLE) |
chances of only some types of MPTCP options being passed, and instead the key di |-------------------------------------->|
ffering characteristics are different paths, and the presence of the SYN flag.</ | Middlebox M |
t> | | |
| SYN/ACK |SYN/ACK (MP_CAPABLE)|
<t>MPTCP SYN packets on the first subflow of a connection contain the MP |<-----------------|--------------------|
_CAPABLE option (<xref target="sec_init"/>). If this is dropped, MPTCP SHOULD fa b) MP_CAPABLE option stripped on return path ]]></artwork>
ll back to regular TCP. If packets with the MP_JOIN option (<xref target="sec_jo </figure>
in"/>) are dropped, the paths will simply not be used.</t> <t>Subflow SYNs contain the MP_JOIN option. If this option is stripped on
the outgoing path,
<t>If a middlebox strips options but otherwise passes the packets unchan the SYN will appear to be a regular SYN to Host&nbsp;B. Depending on whether th
ged, MPTCP will behave safely. If an MP_CAPABLE option is dropped on either the ere is a listening socket on
outgoing or the return path, the initiating host can fall back to regular TCP, a the target port, Host B will reply with either a SYN/ACK or a RST (subflow conne
s illustrated in <xref target="fig_syn"/> and discussed in <xref target="sec_ini ction fails). When Host A
t"/>.</t> receives the SYN/ACK, it sends a RST because the SYN/ACK does not contain the MP
_JOIN option and its token.
<t>Subflow SYNs contain the MP_JOIN option. If this option is stripped Either way, the subflow setup fails but otherwise does not affect the MPTCP conn
on the outgoing path, ection as a whole.</t>
the SYN will appear to be a regular SYN to Host B.&nbsp; Depending on whether th <t>We now examine data flow with MPTCP, assuming that the flow is
ere is a listening socket on correctly set up, which implies that the options in the SYN
the target port, Host B will reply either with SYN/ACK or RST (subflow connectio
n fails). When Host A
receives the SYN/ACK it sends a RST because the SYN/ACK does not contain the MP_
JOIN option and its token.
Either way, the subflow setup fails, but otherwise does not affect the MPTCP con
nection as a whole.</t>
<figure align="center" anchor="fig_syn" title="Connection Setup with Mid
dleboxes that Strip Options from Packets">
<artwork align="left"><![CDATA[
Host A Host B
| Middlebox M |
| | |
| SYN(MP_CAPABLE) | SYN |
|-------------------|---------------->|
| SYN/ACK |
|<------------------------------------|
a) MP_CAPABLE option stripped on outgoing path
Host A Host B
| SYN(MP_CAPABLE) |
|------------------------------------>|
| Middlebox M |
| | |
| SYN/ACK |SYN/ACK(MP_CAPABLE)|
|<----------------|-------------------|
b) MP_CAPABLE option stripped on return path
]]></artwork>
</figure>
<t>We now examine data flow with MPTCP, assuming the flow is correctly s
et up, which implies the options in the SYN
packets were allowed through by the relevant middleboxes. If options are allowed through and there is no resegmentation or packets were allowed through by the relevant middleboxes. If options are allowed through and there is no resegmentation or
coalescing to TCP segments, Multipath TCP flows can proceed without problems.</t> coalescing to TCP segments, Multipath TCP flows can proceed without problems.</t>
<t>The case when options get stripped on data packets is discussed
<t>The case when options get stripped on data packets has been discussed in <xref target="sec_fallback" format="default"/>.
in the Fallback section. If only some MPTCP options are stripped, behavior is not deterministic.
If only some MPTCP options are stripped, behavior is not deterministic. If some Data Sequence Mappings are lost, the connection can continue so long as
If some data sequence mappings are lost, the connection can continue so long as mappings exist for the subflow-level data (e.g., if multiple maps have been sent
mappings exist for the subflow-level data (e.g., if multiple maps have been sent that reinforce each other). If some subflow-level space is left unmapped, howev
that reinforce each other). If some subflow-level space is left unmapped, howev er, the subflow is treated as broken and is closed, using the process described
er, the subflow is treated as broken and is closed, through the process describe in <xref target="sec_fallback" format="default"/>. MPTCP should survive with a l
d in <xref target="sec_fallback"/>. MPTCP should survive with a loss of some Dat oss of some Data ACKs, but performance will degrade as the fraction of stripped
a ACKs, but performance will degrade as the fraction of stripped options increas options increases.
es.
We do not expect such cases to appear in practice, though: most We do not expect such cases to appear in practice, though: most
middleboxes will either strip all options or let them all through.</t> middleboxes will either strip all options or let them all through.</t>
<t>We end this section with a list of middlebox classes, their behavior, a
<t>We end this section with a list of middlebox classes, their behavior, nd the elements in the MPTCP design
and the elements in the MPTCP design
that allow operation through such middleboxes. Issues surrounding dropping packets with options that allow operation through such middleboxes. Issues surrounding dropping packets with options
or stripping options were discussed above, and are not included here: or stripping options were discussed above and are not included here:
<list style="symbols"> </t>
<t>NATs <xref target="RFC3022"/> (Network Address (and Port) Translato <ul spacing="normal">
rs) change the source address (and often source port) of packets. This means tha <li>NATs (Network Address (and port) Translators) <xref
t a host will not know its target="RFC3022" format="default"/> change the source address (and
often the source port) of packets. This means that a host will not know
its
public-facing address for signaling in MPTCP. Therefore, MPTCP permits implicit address addition via the MP_JOIN option, public-facing address for signaling in MPTCP. Therefore, MPTCP permits implicit address addition via the MP_JOIN option,
and the handshake mechanism ensures that connection attempts to private addr and the handshake mechanism ensures that connection attempts to private addr
esses <xref target="RFC1918"/>, since they are authenticated, will only set up s esses <xref target="RFC1918" format="default"/>, since they are authenticated, w
ubflows to the correct hosts. ill only set up subflows to the correct hosts.
Explicit address removal is undertaken by an Address ID to allow no knowledg Explicit address removal is undertaken by an Address ID to allow no knowledg
e of the source address.</t> e of the source address.</li>
<li>Performance Enhancing Proxies (PEPs) <xref target="RFC3135" format="
<t>Performance Enhancing Proxies (PEPs) <xref target="RFC3135"/> might default"/> might proactively ACK data to increase performance. MPTCP, however, r
proactively ACK data to increase performance. MPTCP, however, relies on accurat elies on accurate congestion control signals from the end host, and non&#8209;MP
e congestion control signals from the end host, and non-MPTCP-aware PEPs will no TCP-aware PEPs will not be able to provide such signals. MPTCP will, therefore,
t be able to provide such signals. MPTCP will, therefore, fall back to single-pa fall back to single-path TCP or close the problematic subflow (see <xref target=
th TCP, or close the problematic subflow (see <xref target="sec_fallback"/>).</t "sec_fallback" format="default"/>).</li>
> <li>Traffic normalizers <xref target="norm" format="default"/> may not
allow holes in sequence numbers, and they may cache packets and retransm
<t>Traffic Normalizers <xref target="norm"/> may not allow holes in se it the same data.
quence numbers, and may cache packets and retransmit the same data. MPTCP looks like standard TCP on the wire and will not retransmit different data
MPTCP looks like standard TCP on the wire, and will not retransmit different dat on the same subflow sequence number. In the event of a retransmission, the same
a on the same subflow sequence number. In the event of a retransmission, the sam data will be retransmitted on the original TCP subflow even if it is additional
e data will be retransmitted on the original TCP subflow even if it is additiona ly retransmitted at the connection level on a different subflow.</li>
lly retransmitted at the connection level on a different subflow.</t> <li>Firewalls <xref target="RFC2979" format="default"/> might perform
Initial Sequence Number (ISN) randomization on TCP connections. MPTCP us
<t>Firewalls <xref target="RFC2979"/> might perform initial sequence n es relative
umber randomization on TCP connections. MPTCP uses relative sequence numbers in Data Sequence Mappings to cope with this. Like NATs, firewal
sequence numbers in data sequence mapping to cope with this. Like NATs, firewall ls will not permit many incoming connections, so
s will not permit many incoming connections, so
MPTCP supports address signaling (ADD_ADDR) so that a multiaddressed host can invite its peer behind the firewall/NAT to connect MPTCP supports address signaling (ADD_ADDR) so that a multiaddressed host can invite its peer behind the firewall/NAT to connect
out to its additional interface.</t> out to its additional interface.</li>
<li>Intrusion Detection Systems / Intrusion Prevention Systems (IDSs&wj;
<t>Intrusion Detection/Prevention Systems (IDS/IPS) observe packet str /IPSs) observe packet streams for patterns and content that could threaten a net
eams for patterns and content that could threaten a network. MPTCP may require t work. MPTCP may require the
he instrumentation of additional paths, and an MPTCP-aware IDS or IPS would need to
instrumentation of additional paths, and an MPTCP-aware IDS/IPS would need to re read MPTCP tokens to correlate data from multiple subflows to maintain comparab
ad MPTCP tokens to correlate data from mutliple subflows to maintain comparable le visibility into all of the traffic between devices. Without such changes, an
visibility into all of the traffic between devices. Without such changes, an IDS IDS would get an incomplete view of the traffic, increasing the risk of missing
would get an incomplete view of the traffic, increasing the risk of missing tra traffic of interest (false negatives) and increasing the chances of erroneously
ffic of interest (false negatives), and increasing the chances of erroneously id identifying a subflow as a risk due to only seeing partial data (false positives
entifying a subflow as a risk due to only seeing partial data (false positives). ).</li>
</t> <li>Application-level middleboxes such as content-aware firewalls may
alter the payload within a subflow -- for example, rewriting URIs in
<t>Application-level middleboxes such as content-aware firewalls may a HTTP traffic. MPTCP will detect such changes using the checksum
lter the payload within a subflow, such as rewriting URIs in HTTP traffic. MPTCP and close the affected subflow(s), if there are other subflows that can be used.
will detect these using the checksum If all subflows are affected, MPTCP
and close the affected subflow(s), if there are other subflows that can be used. will fall back to TCP, allowing such middleboxes to change the payload. MPTCP-aw
If all subflows are affected, multipath are middleboxes should be able to adjust the payload and MPTCP metadata in order
will fall back to TCP, allowing such middleboxes to change the payload. MPTCP-aw not to break the connection.</li>
are middleboxes should be able to adjust the payload and MPTCP metadata in order </ul>
not to break the connection.</t> <t>
</list>
In addition, all classes of middleboxes may affect TCP traffic in the following ways: In addition, all classes of middleboxes may affect TCP traffic in the following ways:
<list style="symbols">
<t>TCP options may be removed, or packets with unknown options dropped
, by many classes of middleboxes. It is intended
that the initial SYN exchange, with a TCP option, will be sufficient to identify
the path capabilities. If such a packet does
not get through, MPTCP will end up falling back to regular TCP.</t>
<t>Segmentation/Coalescing (e.g., TCP segmentation offloading) might c
opy options between packets and might
strip some options. MPTCP's data sequence mapping includes the relative subflow
sequence number instead of using the sequence
number in the segment. In this way, the mapping is independent of the packets th
at carry it.</t>
<t>The receive window may be shrunk by some middleboxes at the subflow
level. MPTCP will use the maximum window at data level, but will also obey
subflow-specific windows.</t>
</list>
</t> </t>
<ul spacing="normal">
</section> <li>TCP options may be removed, or packets with unknown options dropped,
by many classes of middleboxes. It is intended
<section anchor="Acknowledgments" title="Acknowledgments"> that the initial SYN exchange, with a TCP option, will be sufficient to identify
<!-- <t>The authors were originally supported by Trilogy (http://www.trilo the path's capabilities. If such a packet does
gy-project.org), a research project (ICT-216372) partially funded by the Europea not get through, MPTCP will end up falling back to regular TCP.</li>
n Community under its Seventh Framework Program.</t> <li>Segmentation/coalescing (e.g., TCP segmentation offloading) might co
<t>Alan Ford was originally supported by Roke Manor Research and later Cis py options between packets and might
co Systems.</t> --> strip some options. MPTCP's Data Sequence Mapping includes the relative subflow
<t>The authors gratefully acknowledge significant input into this document sequence number instead of using the sequence
from S&eacute;bastien Barr&eacute; and Andrew McDonald.</t> number in the segment. In this way, the mapping is independent of the packets th
<t>The authors also wish to acknowledge reviews and contributions from Ilj at carry it.</li>
itsch van Beijnum, Lars Eggert, Marcelo Bagnulo, Robert Hancock, Pasi Sarolahti, <li>The receive window may be shrunk by some middleboxes at the
Toby Moncaster, Philip Eardley, Sergio Lembo, Lawrence Conroy, Yoshifumi Nishid subflow level. MPTCP will use the maximum window at the data level but w
a, Bob Briscoe, Stein Gjessing, Andrew McGregor, Georg Hampel, Anumita Biswas, W ill also obey
es Eddy, Alexey Melnikov, Francis Dupont, Adrian Farrel, Barry Leiba, Robert Spa subflow-specific windows.</li>
rks, Sean Turner, Stephen Farrell, Martin Stiemerling, Gregory Detal, Fabien Duc </ul>
hene, Xavier de Foy, Rahul Jadhav, Klemens Schragel, Mirja Kuehlewind, Sheng Jia
ng, Alissa Cooper, Ines Robles, Roman Danyliw, Adam Roach, Barry Leiba, Alexey M
elnikov, Eric Vyncke, and Ben Kaduk.</t>
</section>
<section anchor="IANA" title="IANA Considerations">
<t>This document obsoletes RFC6824 and as such IANA is requested to update
the TCP option space registry to point to this document for Multipath TCP, as f
ollows:</t>
<texttable anchor="table_tcpo" title="TCP Option Kind Numbers">
<ttcol align="center">Kind</ttcol>
<ttcol align="center">Length</ttcol>
<ttcol align="center">Meaning</ttcol>
<ttcol align="center">Reference</ttcol>
<c>30</c>
<c>N</c>
<c>Multipath TCP (MPTCP)</c>
<c>This document</c>
</texttable>
<section anchor="IANA_subtypes" title="MPTCP Option Subtypes">
<t>The 4-bit MPTCP subtype sub-registry ("MPTCP Option Subtypes" under the
"Transmission Control Protocol (TCP) Parameters" registry) was defined in RFC68
24. Since RFC6824 was an Experimental not Standards Track RFC, and since no furt
her entries have occurred beyond those pointing to RFC6824, IANA is requested to
replace the existing registry with <xref target="table_iana"/> and with the fol
lowing explanatory note.</t>
<t>Note: This registry specifies the MPTCP Option Subtypes for MPTCP v1, w
hich obsoletes the Experimental MPTCP v0. For the MPTCP v0 subtypes, please refe
r to RFC6824.</t>
<texttable anchor="table_iana" title="MPTCP Option Subtypes">
<ttcol align="center">Value</ttcol>
<ttcol align="center">Symbol</ttcol>
<ttcol align="center">Name</ttcol>
<ttcol align="center">Reference</ttcol>
<c>0x0</c>
<c>MP_CAPABLE</c>
<c>Multipath Capable</c>
<c>This document, <xref target="sec_init"/></c>
<c>0x1</c>
<c>MP_JOIN</c>
<c>Join Connection</c>
<c>This document, <xref target="sec_join"/></c>
<c>0x2</c>
<c>DSS</c>
<c>Data Sequence Signal (Data ACK and data sequence mapping)</c>
<c>This document, <xref target="sec_generalop"/></c>
<c>0x3</c>
<c>ADD_ADDR</c>
<c>Add Address</c>
<c>This document, <xref target="sec_add_address"/></c>
<c>0x4</c>
<c>REMOVE_ADDR</c>
<c>Remove Address</c>
<c>This document, <xref target="sec_remove_addr"/></c>
<c>0x5</c>
<c>MP_PRIO</c>
<c>Change Subflow Priority</c>
<c>This document, <xref target="sec_policy"/></c>
<c>0x6</c>
<c>MP_FAIL</c>
<c>Fallback</c>
<c>This document, <xref target="sec_fallback"/></c>
<c>0x7</c>
<c>MP_FASTCLOSE</c>
<c>Fast Close</c>
<c>This document, <xref target="sec_fastclose"/></c>
<c>0x8</c>
<c>MP_TCPRST</c>
<c>Subflow Reset</c>
<c>This document, <xref target="sec_reset"/></c>
<c>0xf</c>
<c>MP_EXPERIMENTAL</c>
<c>Reserved for private experiments</c>
<c></c>
</texttable>
<t>Values 0x9 through 0xe are currently unassigned. Option 0xf is reserved
for use by private experiments. Its use may be formalized in a future specifica
tion. Future assignments in this registry are to be defined by Standards Action
as defined by <xref target="RFC8126"/>. Assignments consist of the MPTCP subtyp
e's symbolic name and its associated value, and a reference to its specification
.</t>
</section> </section>
<section anchor="IANA_handshake" title="MPTCP Handshake Algorithms"> <section anchor="IANA" numbered="true" toc="default">
<name>IANA Considerations</name>
<t>The "MPTCP Handshake Algorithms" sub-registry under the "Transmission C
ontrol Protocol (TCP) Parameters" registry was defined in RFC6824. Since RFC6824
was an Experimental not Standards Track RFC, and since no further entries have
occurred beyond those pointing to RFC6824, IANA is requested to replace the exis
ting registry with <xref target="table_crypto"/> and with the following explanat
ory note.</t>
<t>Note: This registry specifies the MPTCP Handshake Algorithms for MPTCP
v1, which obsoletes the Experimental MPTCP v0. For the MPTCP v0 subtypes, please
refer to RFC6824.</t>
<texttable anchor="table_crypto" title="MPTCP Handshake Algorithms">
<ttcol align="center">Flag Bit</ttcol>
<ttcol align="center">Meaning</ttcol>
<ttcol align="center">Reference</ttcol>
<c>A</c>
<c>Checksum required</c>
<c>This document, <xref target="sec_init"/></c>
<c>B</c>
<c>Extensibility</c>
<c>This document, <xref target="sec_init"/></c>
<c>C</c>
<c>Do not attempt to establish new subflows to the source address.</c>
<c>This document, <xref target="sec_init"/></c>
<c>D-G</c> <t>This document obsoletes <xref target="RFC6824"/>. As such, IANA has upd
<c>Unassigned</c> ated
<c></c> several registries to point to this document. In addition, this document
creates one new registry. These topics are described in the following sub
sections.</t>
<c>H</c> <section anchor="IANA-TCP-Option-Kind" numbered="true" toc="default">
<c>HMAC-SHA256</c> <name>TCP Option Kind Numbers</name>
<c>This document, <xref target="sec_join"/></c> <t>IANA has
</texttable> updated the "TCP Option Kind Numbers" registry to point to this document
for Multipath TCP, as shown in <xref target="table_tcpo"/>:</t>
<table anchor="table_tcpo" align="center">
<name>TCP Option Kind Numbers</name>
<thead>
<tr>
<th align="center">Kind</th>
<th align="center">Length</th>
<th align="center">Meaning</th>
<th align="center">Reference</th>
</tr>
</thead>
<tbody>
<tr>
<td align="center">30</td>
<td align="center">N</td>
<td align="center">Multipath TCP (MPTCP)</td>
<td align="center">RFC 8684</td>
</tr>
</tbody>
</table>
</section>
<section anchor="IANA_subtypes" numbered="true" toc="default">
<name>MPTCP Option Subtypes</name>
<t>The 4-bit MPTCP subtype in the "MPTCP Option Subtypes"
subregistry under the "Transmission Control Protocol (TCP) Parameters"
registry was defined in <xref target="RFC6824"/>. Since <xref target="RF
C6824"/> is an
Experimental RFC and not a Standards Track RFC, and since no further
entries have occurred beyond those pointing to <xref target="RFC6824"/>,
IANA has
replaced the existing registry with the contents of
<xref target="table_iana" format="default"/> and with the following
explanatory note.</t>
<t>Note that the meanings of bits D through H can be dependent upon bit B, <t>Note: This registry specifies the MPTCP Option Subtypes for MPTCP v1,
depending on how Extensibility is defined in future specifications; see which obsoletes the Experimental MPTCP v0. For the MPTCP v0 subtypes, please
<xref target="sec_init"/> for more information.</t> refer to <xref target="RFC6824"/>.</t>
<table anchor="table_iana" align="center">
<name>MPTCP Option Subtypes</name>
<thead>
<tr>
<th align="center">Value</th>
<th align="center">Symbol</th>
<th align="center">Name</th>
<th align="center">Reference</th>
</tr>
</thead>
<tbody>
<tr>
<td align="center">0x0</td>
<td align="center">MP_CAPABLE</td>
<td align="center">Multipath Capable</td>
<td align="center">RFC 8684, <xref target="sec_init" format="default"/></td>
</tr>
<tr>
<td align="center">0x1</td>
<td align="center">MP_JOIN</td>
<td align="center">Join Connection</td>
<td align="center">RFC 8684, <xref target="sec_join" format="default"/></td>
</tr>
<tr>
<td align="center">0x2</td>
<td align="center">DSS</td>
<td align="center">Data Sequence Signal (Data ACK and Data Sequence
Mapping)</td>
<td align="center">RFC 8684, <xref target="sec_generalop"
format="default"/></td>
</tr>
<tr>
<td align="center">0x3</td>
<td align="center">ADD_ADDR</td>
<td align="center">Add Address</td>
<td align="center">RFC 8684, <xref target="sec_add_address" format="default"/></td>
</tr>
<tr>
<td align="center">0x4</td>
<td align="center">REMOVE_ADDR</td>
<td align="center">Remove Address</td>
<td align="center">RFC 8684, <xref target="sec_remove_addr" format="default"/></td>
</tr>
<tr>
<td align="center">0x5</td>
<td align="center">MP_PRIO</td>
<td align="center">Change Subflow Priority</td>
<td align="center">RFC 8684, <xref target="sec_policy" format="default"/></td>
</tr>
<tr>
<td align="center">0x6</td>
<td align="center">MP_FAIL</td>
<td align="center">Fallback</td>
<td align="center">RFC 8684, <xref target="sec_fallback" format="default"/></td>
</tr>
<tr>
<td align="center">0x7</td>
<td align="center">MP_FASTCLOSE</td>
<td align="center">Fast Close</td>
<td align="center">RFC 8684, <xref target="sec_fastclose" format="default"/></td>
</tr>
<tr>
<td align="center">0x8</td>
<td align="center">MP_TCPRST</td>
<td align="center">Subflow Reset</td>
<td align="center">RFC 8684, <xref target="sec_reset" format="default"/></td>
</tr>
<tr>
<td align="center">0xf</td>
<td align="center">MP_EXPERIMENTAL</td>
<td align="center">Reserved for Private Use</td>
<td align="center"/>
</tr>
</tbody>
</table>
<t>Values 0x9 through 0xe are currently unassigned. Option 0xf is
reserved for use by private experiments. Its use may be formalized in a future
specification. Future assignments in this registry are to be defined by Standards
Action as defined by <xref target="RFC8126" format="default"/>. Assignments consist
of the MPTCP subtype's symbolic name, its associated value, and a reference to
its specification.</t>
</section>
<section anchor="IANA_handshake" numbered="true" toc="default">
<name>MPTCP Handshake Algorithms</name>
<t>The "MPTCP Handshake Algorithms" subregistry under the
"Transmission Control Protocol (TCP) Parameters" registry was defined
in <xref target="RFC6824"/>. Since <xref target="RFC6824"/> is an
Experimental RFC and not
a Standards Track RFC, and since no further entries have occurred
beyond those pointing to <xref target="RFC6824"/>, IANA has replaced
the existing registry with the contents of
<xref target="table_crypto" format="default"/> and with the following
explanatory note.</t>
<t>Note: This registry specifies the MPTCP Handshake Algorithms for
MPTCP v1, which obsoletes the Experimental MPTCP v0. For the MPTCP v0 subtypes,
please refer to <xref target="RFC6824"/>.</t>
<table anchor="table_crypto" align="center">
<name>MPTCP Handshake Algorithms</name>
<thead>
<tr>
<th align="center">Flag Bit</th>
<th align="center">Meaning</th>
<th align="center">Reference</th>
</tr>
</thead>
<tbody>
<tr>
<td align="center">A</td>
<td align="center">Checksum required</td>
<td align="center">RFC 8684, <xref target="sec_init" format="default"/></td>
</tr>
<tr>
<td align="center">B</td>
<td align="center">Extensibility</td>
<td align="center">RFC 8684, <xref target="sec_init" format="default"/></td>
</tr>
<tr>
<td align="center">C</td>
<td align="center">Do not attempt to establish new subflows to the
source address.</td>
<td align="center">RFC 8684, <xref target="sec_init" format="default"/></td>
</tr>
<tr>
<td align="center">D-G</td>
<td align="center">Unassigned</td>
<td align="center"/>
</tr>
<tr>
<td align="center">H</td>
<td align="center">HMAC-SHA256</td>
<td align="center">RFC 8684, <xref target="sec_join" format="default"/></td>
</tr>
</tbody>
</table>
<t>Future assignments in this registry are also <t>Note that the meanings of bits "D" through "H" can be dependent upon
to be defined by Standards Action as defined by <xref target="RFC8126"/>. bit "B",
depending on how the Extensibility parameter is defined in future
specifications; see
<xref target="sec_init" format="default"/> for more information.</t>
<t>Future assignments in this registry are also
to be defined by Standards Action as defined by
<xref target="RFC8126" format="default"/>.
Assignments consist of the value of the flags, a symbolic name for the algorithm, Assignments consist of the value of the flags, a symbolic name for the algorithm,
and a reference to its specification.</t> and a reference to its specification.</t>