linux协议栈RDS代码流程

1
2
3
4
5
6
7
8
AF_RDS, PF_RDS, SOL_RDS
AF_RDS and PF_RDS are the domain type to be used with socket(2)
to create RDS sockets. SOL_RDS is the socket-level to be used
with setsockopt(2) and getsockopt(2) for RDS specific socket
options.

fd = socket(PF_RDS, SOCK_SEQPACKET, 0);
This creates a new, unbound RDS socket.

rds发送流程

1
2
3
4
5
6
7
8
9
10
11
12
13
rds_sendmsg

rds_send_xmit

conn->c_trans->xmit -> rds_tcp_xmit

rds_tcp_sendmsg

kernel_sendmsg

sock_sendmsg -> sock_sendmsg_nosec -> sock->ops->sendmsg

tcp_sendmsg

rds收包流程

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28

recvmsg

rds_recvmsg

rds_next_incoming

list_entry(rs->rs_recv_queue.next, struct rds_incoming, i_item);


===================queue==================

rds_recv_worker

cp->cp_conn->c_trans->recv_path(cp)

rds_tcp_recv_path

rds_tcp_read_sock

tcp_read_sock

rds_tcp_data_recv

rds_recv_incoming

list_add_tail(&inc->i_item, &rs->rs_recv_queue);

rds发送时建立tcp连接

1
2
3
4
5
6
7
8
9
rds_sendmsg
rds_conn_create_outgoing
__rds_conn_path_init
rds_connect_worker
conn->c_trans->conn_path_connect
rds_tcp_conn_path_connect
sock_create_kern
sock->ops->connect

rds_tcp模块初始化

1
2
3
4
5
6
7
8
rds_tcp_init_net
rds_tcp_accept_worker
rds_tcp_accept_one
rds_conn_create
__rds_conn_path_init
rds_connect_worker
conn->c_trans->conn_path_connect(cp);
rds_tcp_conn_path_connect

rds头部格式

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
struct cmsghdr {
__kernel_size_t cmsg_len; /* data byte count, including hdr */
int cmsg_level; /* originating protocol */
int cmsg_type; /* protocol-specific type */
};

struct rds_header {
__be64 h_sequence;
__be64 h_ack;
__be32 h_len;
__be16 h_sport;
__be16 h_dport;
u8 h_flags;
u8 h_credit;
u8 h_padding[4];
__sum16 h_csum;

u8 h_exthdr[RDS_HEADER_EXT_SPACE];
};

rds问题

  1. 默认要建立8个连接;建立多个连接的好处可能是为了利用多核。
  2. 连接最终必须由ip地址较小的一方发起;即便是ip地址较大的一方先发起,也要断开后由较小的一方重连。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/* rds_tcp_accept_one_path(): if accepting on cp_index > 0, make sure the
* client's ipaddr < server's ipaddr. Otherwise, close the accepted
* socket and force a reconnect from smaller -> larger ip addr. The reason
* we special case cp_index 0 is to allow the rds probe ping itself to itself
* get through efficiently.
* Since reconnects are only initiated from the node with the numerically
* smaller ip address, we recycle conns in RDS_CONN_ERROR on the passive side
* by moving them to CONNECTING in this function.
*/
static
struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
{
/* for mprds, all paths MUST be initiated by the peer
* with the smaller address.
*/
if (rds_addr_cmp(&conn->c_faddr, &conn->c_laddr) >= 0) {
/* Make sure we initiate at least one path if this
* has not already been done; rds_start_mprds() will
* take care of additional paths, if necessary.
*/
if (npaths == 1)
rds_conn_path_connect_if_down(&conn->c_path[0]);
return NULL;
}
}

在网关可以利用rds头部包含最终目的信息。

在服务器可以通过nat进行代理。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
//创建netlink操作
static struct rtnl_link_ops link_ops __read_mostly = {
.kind = KBUILD_MODNAME,
.priv_size = sizeof(struct wg_device),
.setup = wg_setup,
.newlink = wg_newlink,
};

wg_mod_init

wg_device_init

ret = rtnl_link_register(&link_ops);

//注册wireguard网卡驱动
//注册后通过ip link add dev wg0 type wireguard 创建虚拟网卡
//iplink_modify(RTM_NEWLINK,NLM_F_CREATE|NLM_F_EXCL,argc-1, argv+1);

wg_newlink

register_netdevice





#define WG_GENL_NAME "wireguard"

static struct genl_family genl_family __ro_after_init = {
.ops = genl_ops,
.n_ops = ARRAY_SIZE(genl_ops),
.name = WG_GENL_NAME,
.version = WG_GENL_VERSION,
.maxattr = WGDEVICE_A_MAX,
.module = THIS_MODULE,
.policy = device_policy,
.netnsok = true
};

wg_genetlink_init(void)

genl_register_family(&genl_family);


指标

延迟

带宽

阻塞检测

rds-tools

拓扑优化

CONFIG_RDS_TCP

https://slidetodoc.com/reliable-datagram-sockets-and-infini-band-hanan-hit/

https://slidetodoc.com/architecture-of-parallel-computers-csc-ece-506-open/


linux协议栈RDS代码流程
http://blog.uanet.cn/NETWORK/linux协议栈RDS代码流程.html
作者
dnsnat
发布于
2025年2月13日
许可协议