// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause

/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/* Copyright (c) 2008-2019, IBM Corporation */

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/net.h>
#include <linux/scatterlist.h>
#include <linux/highmem.h>

#include <rdma/iw_cm.h>
#include <rdma/ib_verbs.h>

#include "siw.h"
#include "siw_verbs.h"
#include "siw_mem.h"

/*
 * siw_rx_umem()
 *
 * Place @len bytes, taken from the current read position of the
 * receive skb, into the user memory described by @umem, starting at
 * @dest_addr. Data is copied page by page; each target page is mapped
 * only while it is being written.
 *
 * If the stream has a CRC handle, the placed bytes are folded into
 * the running CRC.
 *
 * @srx:	Receive Context
 * @umem:	siw representation of target memory
 * @dest_addr:	user virtual address
 * @len:	number of bytes to place
 *
 * Returns the number of bytes placed on success, -EFAULT if a target
 * page cannot be resolved or copying from the skb fails. In both cases
 * skb_copied/skb_new are adjusted by the bytes placed so far.
 */
static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem,
		       u64 dest_addr, int len)
{
	int placed = 0, ret;

	while (len) {
		struct page *pg = siw_get_upage(umem, dest_addr);
		int off, chunk, err;
		void *kaddr;

		if (unlikely(!pg)) {
			pr_warn("siw: %s: [QP %u]: bogus addr: %pK, %pK\n",
				__func__, qp_id(rx_qp(srx)),
				(void *)(uintptr_t)dest_addr,
				(void *)(uintptr_t)umem->fp_addr);
			/* siw internal error */
			ret = -EFAULT;
			goto account;
		}
		/* never cross a page boundary within one copy */
		off = dest_addr & ~PAGE_MASK;
		chunk = min(len, (int)PAGE_SIZE - off);

		siw_dbg_qp(rx_qp(srx), "page %pK, bytes=%u\n", pg, chunk);

		kaddr = kmap_atomic(pg);
		err = skb_copy_bits(srx->skb, srx->skb_offset, kaddr + off,
				    chunk);
		if (unlikely(err)) {
			kunmap_atomic(kaddr);

			pr_warn("siw: [QP %u]: %s, len %d, page %p, rv %d\n",
				qp_id(rx_qp(srx)), __func__, len, pg, err);

			ret = -EFAULT;
			goto account;
		}
		if (!srx->mpa_crc_hd) {
			kunmap_atomic(kaddr);
		} else if (rdma_is_kernel_res(&rx_qp(srx)->base_qp.res)) {
			/* kernel client: CRC over the just written target */
			crypto_shash_update(srx->mpa_crc_hd,
					    (u8 *)(kaddr + off), chunk);
			kunmap_atomic(kaddr);
		} else {
			kunmap_atomic(kaddr);
			/*
			 * Do CRC on original, not target buffer.
			 * Some user land applications may
			 * concurrently write the target buffer,
			 * which would yield a broken CRC.
			 * Walking the skb twice is very inefficient.
			 * Folding the CRC into skb_copy_bits()
			 * would be much better, but is currently
			 * not supported.
			 */
			siw_crc_skb(srx, chunk);
		}
		srx->skb_offset += chunk;
		dest_addr += chunk;
		placed += chunk;
		len -= chunk;
	}
	ret = placed;
account:
	/* success and failure both account for what was placed */
	srx->skb_copied += placed;
	srx->skb_new -= placed;

	return ret;
}

/*
 * siw_rx_kva()
 *
 * Copy @len bytes from the current read position of the receive skb
 * to kernel virtual address @kva. If the stream has a CRC handle, the
 * copied bytes are folded into the running CRC.
 *
 * Returns @len on success. If the copy fails, the error returned by
 * skb_copy_bits() is passed on and the stream counters stay untouched.
 */
static int siw_rx_kva(struct siw_rx_stream *srx, void *kva, int len)
{
	int err;

	siw_dbg_qp(rx_qp(srx), "kva: 0x%pK, len: %u\n", kva, len);

	err = skb_copy_bits(srx->skb, srx->skb_offset, kva, len);
	if (likely(!err)) {
		if (srx->mpa_crc_hd)
			crypto_shash_update(srx->mpa_crc_hd, (u8 *)kva, len);

		srx->skb_new -= len;
		srx->skb_copied += len;
		srx->skb_offset += len;

		return len;
	}
	pr_warn("siw: [QP %u]: %s, len %d, kva 0x%pK, rv %d\n",
		qp_id(rx_qp(srx)), __func__, len, kva, err);

	return err;
}

/*
 * siw_rx_pbl()
 *
 * Place @len bytes from the receive skb into memory described by the
 * physical buffer list of @mem, starting at address @addr. The data is
 * placed buffer by buffer via siw_rx_kva(); @pbl_idx is handed to
 * siw_pbl_get_buffer() for each lookup.
 *
 * Returns the number of bytes placed, which is less than @len if no
 * buffer is found for the current offset or a copy fails.
 */
static int siw_rx_pbl(struct siw_rx_stream *srx, int *pbl_idx,
		      struct siw_mem *mem, u64 addr, int len)
{
	struct siw_pbl *pbl = mem->pbl;
	u64 pbl_off = addr - mem->va;
	int placed = 0;

	while (len) {
		int chunk;
		dma_addr_t buf =
			siw_pbl_get_buffer(pbl, pbl_off, &chunk, pbl_idx);

		if (!buf)
			break;

		/* do not place more than requested into this buffer */
		chunk = min(chunk, len);

		if (siw_rx_kva(srx, ib_virt_dma_to_ptr(buf), chunk) != chunk)
			break;

		placed += chunk;
		pbl_off += chunk;
		len -= chunk;
	}
	return placed;
}

/*
 * siw_rresp_check_ntoh()
 *
 * Validate the header of an inbound RRESP DDP segment against the
 * state expected from the pending Read Request. On the first DDP
 * segment of a message, the expected STag and target offset are taken
 * from the first SGE of the active WQE.
 *
 * On mismatch, a TERMINATE for the DDP tagged buffer error type is
 * initialized and -EINVAL is returned; 0 otherwise.
 *
 * NOTE: This function must be called only if a RRESP DDP segment
 *       starts but not for fragmented consecutive pieces of an
 *       already started DDP segment.
 */
static int siw_rresp_check_ntoh(struct siw_rx_stream *srx,
				struct siw_rx_fpdu *frx)
{
	struct siw_wqe *wqe = &frx->wqe_active;
	u32 hdr_stag = be32_to_cpu(srx->hdr.rresp.sink_stag);
	u64 hdr_to = be64_to_cpu(srx->hdr.rresp.sink_to);
	enum ddp_ecode ecode;

	if (frx->first_ddp_seg) {
		srx->ddp_stag = wqe->sqe.sge[0].lkey;
		srx->ddp_to = wqe->sqe.sge[0].laddr;
		frx->pbl_idx = 0;
	}
	/* Below checks extend beyond the semantics of DDP, and
	 * into RDMAP:
	 * We check if the read response matches exactly the
	 * read request which was sent to the remote peer to
	 * trigger this read response. RFC5040/5041 do not
	 * always have a proper error code for the detected
	 * error cases. We choose 'base or bounds error' for
	 * cases where the inbound STag is valid, but offset
	 * or length do not match our response receive state.
	 */
	if (unlikely(srx->ddp_stag != hdr_stag)) {
		pr_warn("siw: [QP %u]: rresp stag: %08x != %08x\n",
			qp_id(rx_qp(srx)), hdr_stag, srx->ddp_stag);
		ecode = DDP_ECODE_T_INVALID_STAG;
	} else if (unlikely(srx->ddp_to != hdr_to)) {
		pr_warn("siw: [QP %u]: rresp off: %016llx != %016llx\n",
			qp_id(rx_qp(srx)), (unsigned long long)hdr_to,
			(unsigned long long)srx->ddp_to);
		ecode = DDP_ECODE_T_BASE_BOUNDS;
	} else if (unlikely(!frx->more_ddp_segs &&
			    (wqe->processed + srx->fpdu_part_rem !=
			     wqe->bytes))) {
		/* last segment must complete the requested length exactly */
		pr_warn("siw: [QP %u]: rresp len: %d != %d\n",
			qp_id(rx_qp(srx)),
			wqe->processed + srx->fpdu_part_rem, wqe->bytes);
		ecode = DDP_ECODE_T_BASE_BOUNDS;
	} else {
		return 0;
	}
	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
			   DDP_ETYPE_TAGGED_BUF, ecode, 0);
	return -EINVAL;
}

/*
 * siw_write_check_ntoh()
 *
 * Validate the header of an inbound WRITE DDP segment. The first DDP
 * segment of a message defines the expected sink STag and target
 * offset; any later segment must match these values.
 *
 * On mismatch, a TERMINATE for the DDP tagged buffer error type is
 * initialized and -EINVAL is returned; 0 otherwise.
 *
 * NOTE: This function must be called only if a WRITE DDP segment
 *       starts but not for fragmented consecutive pieces of an
 *       already started DDP segment.
 */
static int siw_write_check_ntoh(struct siw_rx_stream *srx,
				struct siw_rx_fpdu *frx)
{
	u32 hdr_stag = be32_to_cpu(srx->hdr.rwrite.sink_stag);
	u64 hdr_to = be64_to_cpu(srx->hdr.rwrite.sink_to);
	enum ddp_ecode ecode;

	if (frx->first_ddp_seg) {
		/* new message: header values become the reference */
		srx->ddp_stag = hdr_stag;
		srx->ddp_to = hdr_to;
		frx->pbl_idx = 0;

		return 0;
	}
	if (likely(srx->ddp_stag == hdr_stag)) {
		if (likely(srx->ddp_to == hdr_to))
			return 0;

		pr_warn("siw: [QP %u]: write off: %016llx != %016llx\n",
			qp_id(rx_qp(srx)),
			(unsigned long long)hdr_to,
			(unsigned long long)srx->ddp_to);
		ecode = DDP_ECODE_T_BASE_BOUNDS;
	} else {
		pr_warn("siw: [QP %u]: write stag: %08x != %08x\n",
			qp_id(rx_qp(srx)), hdr_stag,
			srx->ddp_stag);
		ecode = DDP_ECODE_T_INVALID_STAG;
	}
	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
			   DDP_ETYPE_TAGGED_BUF, ecode, 0);
	return -EINVAL;
}

/*
 * siw_send_check_ntoh()
 *
 * Validate the header of an inbound SEND DDP segment: queue number,
 * message sequence number and message offset must match the receive
 * state. On the first DDP segment of a message, the write position
 * into the receive buffer is reset and the STag to invalidate is
 * taken from the header. Finally, the receive WQE must offer enough
 * space for the bytes announced by the current FPDU.
 *
 * On failure, a TERMINATE for the DDP untagged buffer error type is
 * initialized and -EINVAL is returned; 0 otherwise.
 *
 * NOTE: This function must be called only if a SEND DDP segment
 *       starts but not for fragmented consecutive pieces of an
 *       already started DDP segment.
 */
static int siw_send_check_ntoh(struct siw_rx_stream *srx,
			       struct siw_rx_fpdu *frx)
{
	struct iwarp_send_inv *hdr = &srx->hdr.send_inv;
	struct siw_wqe *wqe = &frx->wqe_active;
	u32 ddp_qn = be32_to_cpu(hdr->ddp_qn);
	u32 ddp_msn = be32_to_cpu(hdr->ddp_msn);
	u32 ddp_mo = be32_to_cpu(hdr->ddp_mo);
	enum ddp_ecode ecode;

	if (unlikely(ddp_qn != RDMAP_UNTAGGED_QN_SEND)) {
		pr_warn("siw: [QP %u]: invalid ddp qn %d for send\n",
			qp_id(rx_qp(srx)), ddp_qn);
		ecode = DDP_ECODE_UT_INVALID_QN;
	} else if (unlikely(ddp_msn !=
			    srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND])) {
		pr_warn("siw: [QP %u]: send msn: %u != %u\n",
			qp_id(rx_qp(srx)), ddp_msn,
			srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
		ecode = DDP_ECODE_UT_INVALID_MSN_RANGE;
	} else if (unlikely(ddp_mo != wqe->processed)) {
		pr_warn("siw: [QP %u], send mo: %u != %u\n",
			qp_id(rx_qp(srx)), ddp_mo, wqe->processed);
		ecode = DDP_ECODE_UT_INVALID_MO;
	} else {
		if (frx->first_ddp_seg) {
			/* initialize user memory write position */
			frx->sge_idx = 0;
			frx->sge_off = 0;
			frx->pbl_idx = 0;

			/* only valid for SEND_INV and SEND_SE_INV operations */
			srx->inval_stag = be32_to_cpu(hdr->inval_stag);
		}
		if (likely(wqe->bytes >=
			   wqe->processed + srx->fpdu_part_rem))
			return 0;

		siw_dbg_qp(rx_qp(srx), "receive space short: %d - %d < %d\n",
			   wqe->bytes, wqe->processed, srx->fpdu_part_rem);
		wqe->wc_status = SIW_WC_LOC_LEN_ERR;
		ecode = DDP_ECODE_UT_INVALID_MSN_NOBUF;
	}
	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
			   DDP_ETYPE_UNTAGGED_BUF, ecode, 0);
	return -EINVAL;
}

/*
 * siw_rqe_get()
 *
 * Fetch the next receive queue element and set up the QP's untagged
 * rx WQE from it.
 *
 * The element is taken from the SRQ if the QP is attached to one
 * (under the SRQ lock), otherwise from the QP's own receive queue.
 * After its contents are copied, the element's flags are cleared and
 * the queue's get index is advanced. For an armed SRQ, the entry
 * 'limit' positions ahead is inspected; if it is not valid, the SRQ is
 * disarmed and IB_EVENT_SRQ_LIMIT_REACHED is raised after the lock is
 * dropped.
 *
 * Returns the initialized rx WQE, or NULL if the SRQ has no entries,
 * the QP has no receive queue, the next element is not marked valid,
 * or the element carries more than SIW_MAX_SGE SGEs.
 */
static struct siw_wqe *siw_rqe_get(struct siw_qp *qp)
{
	struct siw_rqe *rqe;
	struct siw_srq *srq;
	struct siw_wqe *wqe = NULL;
	bool srq_event = false;
	unsigned long flags; /* only meaningful if the SRQ lock is taken */

	srq = qp->srq;
	if (srq) {
		spin_lock_irqsave(&srq->lock, flags);
		if (unlikely(!srq->num_rqe))
			goto out;

		rqe = &srq->recvq[srq->rq_get % srq->num_rqe];
	} else {
		if (unlikely(!qp->recvq))
			goto out;

		rqe = &qp->recvq[qp->rq_get % qp->attrs.rq_size];
	}
	if (likely(rqe->flags == SIW_WQE_VALID)) {
		int num_sge = rqe->num_sge;

		if (likely(num_sge <= SIW_MAX_SGE)) {
			int i = 0;

			wqe = rx_wqe(&qp->rx_untagged);
			rx_type(wqe) = SIW_OP_RECEIVE;
			wqe->wr_status = SIW_WR_INPROGRESS;
			wqe->bytes = 0;
			wqe->processed = 0;

			wqe->rqe.id = rqe->id;
			wqe->rqe.num_sge = num_sge;

			/* copy SGEs; wqe->bytes sums up the buffer space */
			while (i < num_sge) {
				wqe->rqe.sge[i].laddr = rqe->sge[i].laddr;
				wqe->rqe.sge[i].lkey = rqe->sge[i].lkey;
				wqe->rqe.sge[i].length = rqe->sge[i].length;
				wqe->bytes += wqe->rqe.sge[i].length;
				wqe->mem[i] = NULL;
				i++;
			}
			/* can be re-used by appl */
			smp_store_mb(rqe->flags, 0);
		} else {
			/* element is left in the queue, get index unchanged */
			siw_dbg_qp(qp, "too many sge's: %d\n", rqe->num_sge);
			if (srq)
				spin_unlock_irqrestore(&srq->lock, flags);
			return NULL;
		}
		if (!srq) {
			qp->rq_get++;
		} else {
			if (srq->armed) {
				/* Test SRQ limit */
				u32 off = (srq->rq_get + srq->limit) %
					  srq->num_rqe;
				struct siw_rqe *rqe2 = &srq->recvq[off];

				if (!(rqe2->flags & SIW_WQE_VALID)) {
					srq->armed = false;
					srq_event = true;
				}
			}
			srq->rq_get++;
		}
	}
out:
	if (srq) {
		spin_unlock_irqrestore(&srq->lock, flags);
		/* raise the limit event only after dropping the lock */
		if (srq_event)
			siw_srq_event(srq, IB_EVENT_SRQ_LIMIT_REACHED);
	}
	return wqe;
}

/*
 * siw_proc_send:
 *
 * Process one incoming SEND and place data into memory referenced by
 * receive wqe.
 *
 * Function supports partially received sends (suspending/resuming
 * current receive wqe processing): the write position within the
 * receive buffer is kept in the untagged rx context (sge_idx, sge_off,
 * pbl_idx) across calls.
 *
 * return value:
 *	0:       reached the end of a DDP segment
 *	-EAGAIN: to be called again to finish the DDP segment
 *	-ENOENT: no receive WQE available for a new message
 *	other negative values: header check, memory check or data
 *	         placement failed; a TERMINATE has been initialized
 */
int siw_proc_send(struct siw_qp *qp)
{
	struct siw_rx_stream *srx = &qp->rx_stream;
	struct siw_rx_fpdu *frx = &qp->rx_untagged;
	struct siw_wqe *wqe;
	u32 data_bytes; /* all data bytes available */
	u32 rcvd_bytes; /* sum of data bytes rcvd */
	int rv = 0;

	if (frx->first_ddp_seg) {
		/* new message: consume a fresh receive queue element */
		wqe = siw_rqe_get(qp);
		if (unlikely(!wqe)) {
			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
					   DDP_ETYPE_UNTAGGED_BUF,
					   DDP_ECODE_UT_INVALID_MSN_NOBUF, 0);
			return -ENOENT;
		}
	} else {
		/* continue with the WQE of the message in progress */
		wqe = rx_wqe(frx);
	}
	if (srx->state == SIW_GET_DATA_START) {
		/* header is checked once per DDP segment only */
		rv = siw_send_check_ntoh(srx, frx);
		if (unlikely(rv)) {
			siw_qp_event(qp, IB_EVENT_QP_FATAL);
			return rv;
		}
		if (!srx->fpdu_part_rem) /* zero length SEND */
			return 0;
	}
	/* place what is both announced by the FPDU and present in the skb */
	data_bytes = min(srx->fpdu_part_rem, srx->skb_new);
	rcvd_bytes = 0;

	/* A zero length SEND will skip below loop */
	while (data_bytes) {
		struct ib_pd *pd;
		struct siw_mem **mem, *mem_p;
		struct siw_sge *sge;
		u32 sge_bytes; /* data bytes avail for SGE */

		sge = &wqe->rqe.sge[frx->sge_idx];

		if (!sge->length) {
			/* just skip empty sge's */
			frx->sge_idx++;
			frx->sge_off = 0;
			frx->pbl_idx = 0;
			continue;
		}
		sge_bytes = min(data_bytes, sge->length - frx->sge_off);
		mem = &wqe->mem[frx->sge_idx];

		/*
		 * check with QP's PD if no SRQ present, SRQ's PD otherwise
		 */
		pd = qp->srq == NULL ? qp->pd : qp->srq->base_srq.pd;

		rv = siw_check_sge(pd, sge, mem, IB_ACCESS_LOCAL_WRITE,
				   frx->sge_off, sge_bytes);
		if (unlikely(rv)) {
			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
					   DDP_ETYPE_CATASTROPHIC,
					   DDP_ECODE_CATASTROPHIC, 0);

			siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
			break;
		}
		mem_p = *mem;
		/* pick the placement routine matching the memory object */
		if (mem_p->mem_obj == NULL)
			rv = siw_rx_kva(srx,
				ib_virt_dma_to_ptr(sge->laddr + frx->sge_off),
				sge_bytes);
		else if (!mem_p->is_pbl)
			rv = siw_rx_umem(srx, mem_p->umem,
					 sge->laddr + frx->sge_off, sge_bytes);
		else
			rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
					sge->laddr + frx->sge_off, sge_bytes);

		if (unlikely(rv != sge_bytes)) {
			/* account for what earlier iterations placed */
			wqe->processed += rcvd_bytes;

			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
					   DDP_ETYPE_CATASTROPHIC,
					   DDP_ECODE_CATASTROPHIC, 0);
			return -EINVAL;
		}
		frx->sge_off += rv;

		if (frx->sge_off == sge->length) {
			/* current SGE is full, move on to the next one */
			frx->sge_idx++;
			frx->sge_off = 0;
			frx->pbl_idx = 0;
		}
		data_bytes -= rv;
		rcvd_bytes += rv;

		srx->fpdu_part_rem -= rv;
		srx->fpdu_part_rcvd += rv;
	}
	wqe->processed += rcvd_bytes;

	if (!srx->fpdu_part_rem)
		return 0;

	/* report a failed memory check, otherwise ask to be called again */
	return (rv < 0) ? rv : -EAGAIN;
}

/*
 * siw_proc_write:
 *
 * Place incoming WRITE after referencing and checking target buffer

 * Function supports partially received WRITEs (suspending/resuming
 * current receive processing)
 *
 * return value:
 *	0:       reached the end of a DDP segment
 *	-EAGAIN: to be called again to finish the DDP segment
 */
int siw_proc_write(struct siw_qp *qp)
{
	struct siw_rx_stream *srx = &qp->rx_stream;
	struct siw_rx_fpdu *frx = &qp->rx_tagged;
	struct siw_mem *mem;
	int bytes, rv;

	if (srx->state == SIW_GET_DATA_START) {
		if (!srx->fpdu_part_rem) /* zero length WRITE */
			return 0;

		rv = siw_write_check_ntoh(srx, frx);
		if (unlikely(rv)) {
			siw_qp_event(qp, IB_EVENT_QP_FATAL);
			return rv;
		}
	}
	bytes = min(srx->fpdu_part_rem, srx->skb_new);

	if (frx->first_ddp_seg) {
		struct siw_wqe *wqe = rx_wqe(frx);

		rx_mem(frx) = siw_mem_id2obj(qp->sdev, srx->ddp_stag >> 8);
		if (unlikely(!rx_mem(frx))) {
			siw_dbg_qp(qp,
				   "sink stag not found/invalid, stag 0x%08x\n",
				   srx->ddp_stag);

			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
					   DDP_ETYPE_TAGGED_BUF,
					   DDP_ECODE_T_INVALID_STAG, 0);
			return -EINVAL;
		}
		wqe->rqe.num_sge = 1;
		rx_type(wqe) = SIW_OP_WRITE;
		wqe->wr_status = SIW_WR_INPROGRESS;
	}
	mem = rx_mem(frx);

	/*
	 * Check if application re-registered memory with different
	 * key field of STag.
	 */
	if (unlikely(mem->stag != srx->ddp_stag)) {
		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
				   DDP_ETYPE_TAGGED_BUF,
				   DDP_ECODE_T_INVALID_STAG, 0);
		return -EINVAL;
	}
	rv = siw_check_mem(qp->pd, mem, srx->ddp_to + srx->fpdu_part_rcvd,
			   IB_ACCESS_REMOTE_WRITE, bytes);
	if (unlikely(rv)) {
		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
				   DDP_ETYPE_TAGGED_BUF, siw_tagged_error(-rv),
				   0);

		siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);

		return -EINVAL;
	}

	if (mem->mem_obj == NULL)
		rv = siw_rx_kva(srx,
			(void *)(uintptr_t)(srx->ddp_to + srx->fpdu_part_rcvd),
			bytes);
	else if (!mem->is_pbl)
		rv = siw_rx_umem(srx, mem->umem,
				 srx->ddp_to + srx->fpdu_part_rcvd, bytes);
	else
		rv = siw_rx_pbl(srx, &frx->pbl_idx, mem,
				srx->ddp_to + srx->fpdu_part_rcvd, bytes);

	if (unlikely(rv != bytes)) {
		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
				   DDP_ETYPE_CATASTROPHIC,
				   DDP_ECODE_CATASTROPHIC, 0);
		return -EINVAL;
	}
	srx->fpdu_part_rem -= rv;
	srx->fpdu_part_rcvd += rv;

	if (!srx->fpdu_part_rem) {
		srx->ddp_to += srx->fpdu_part_rcvd;
		return 0;
	}
	return -EAGAIN;
}

/*
 * Inbound RREQ's cannot carry user data: any remaining FPDU payload
 * is a protocol violation and reported with -EPROTO.
 */
int siw_proc_rreq(struct siw_qp *qp)
{
	struct siw_rx_stream *srx = &qp->rx_stream;

	if (srx->fpdu_part_rem) {
		pr_warn("siw: [QP %u]: rreq with mpa len %d\n", qp_id(qp),
			be16_to_cpu(srx->hdr.ctrl.mpa_len));

		return -EPROTO;
	}
	return 0;
}

/*
 * siw_init_rresp:
 *
 * Process inbound RDMA READ REQ. Produce a pseudo READ RESPONSE WQE.
 * Put it at the tail of the IRQ, if there is another WQE currently in
 * transmit processing. If not, make it the current WQE to be processed
 * and schedule transmit processing.
 *
 * The Read Request's MSN must match the expected MSN of the RDMA READ
 * untagged queue; otherwise a TERMINATE is initialized.
 *
 * Can be called from softirq context and from process
 * context (RREAD socket loopback case!)
 *
 * return value:
 *	0:      success,
 *		failure code otherwise: -EPROTO on MSN mismatch, on a
 *		zero sized IRQ or if no free IRQ entry is available;
 *		the result of siw_sq_start() if transmit processing
 *		was started
 */

static int siw_init_rresp(struct siw_qp *qp, struct siw_rx_stream *srx)
{
	struct siw_wqe *tx_work = tx_wqe(qp);
	struct siw_sqe *resp;

	/* Read Request header fields, converted to host byte order */
	uint64_t raddr = be64_to_cpu(srx->hdr.rreq.sink_to),
		 laddr = be64_to_cpu(srx->hdr.rreq.source_to);
	uint32_t length = be32_to_cpu(srx->hdr.rreq.read_size),
		 lkey = be32_to_cpu(srx->hdr.rreq.source_stag),
		 rkey = be32_to_cpu(srx->hdr.rreq.sink_stag),
		 msn = be32_to_cpu(srx->hdr.rreq.ddp_msn);

	int run_sq = 1, rv = 0;
	unsigned long flags;

	if (unlikely(msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ])) {
		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
				   DDP_ETYPE_UNTAGGED_BUF,
				   DDP_ECODE_UT_INVALID_MSN_RANGE, 0);
		return -EPROTO;
	}
	spin_lock_irqsave(&qp->sq_lock, flags);

	if (unlikely(!qp->attrs.irq_size)) {
		run_sq = 0;
		/* jumps into the error branch below, lock still held */
		goto error_irq;
	}
	if (tx_work->wr_status == SIW_WR_IDLE) {
		/*
		 * immediately schedule READ response w/o
		 * consuming IRQ entry: IRQ must be empty.
		 */
		tx_work->processed = 0;
		tx_work->mem[0] = NULL;
		tx_work->wr_status = SIW_WR_QUEUED;
		resp = &tx_work->sqe;
	} else {
		/* transmit path is busy: queue the response in the IRQ */
		resp = irq_alloc_free(qp);
		run_sq = 0;
	}
	if (likely(resp)) {
		resp->opcode = SIW_OP_READ_RESPONSE;

		resp->sge[0].length = length;
		resp->sge[0].laddr = laddr;
		resp->sge[0].lkey = lkey;

		/* Keep aside message sequence number for potential
		 * error reporting during Read Response generation.
		 */
		resp->sge[1].length = msn;

		resp->raddr = raddr;
		resp->rkey = rkey;
		resp->num_sge = length ? 1 : 0;

		/* RRESP now valid as current TX wqe or placed into IRQ */
		smp_store_mb(resp->flags, SIW_WQE_VALID);
	} else {
error_irq:
		pr_warn("siw: [QP %u]: IRQ exceeded or null, size %d\n",
			qp_id(qp), qp->attrs.irq_size);

		siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
				   RDMAP_ETYPE_REMOTE_OPERATION,
				   RDMAP_ECODE_CATASTROPHIC_STREAM, 0);
		rv = -EPROTO;
	}

	spin_unlock_irqrestore(&qp->sq_lock, flags);

	/* run_sq is only left set if the idle tx WQE took the response */
	if (run_sq)
		rv = siw_sq_start(qp);

	return rv;
}

/*
 * Only called at start of Read.Response processing.
 * Transfer pending Read from tip of ORQ into current rx wqe,
 * but keep ORQ entry valid until Read.Response processing done.
 * No Queue locking needed.
 *
 * Returns 0 on success, -EPROTO if the ORQ has zero size or its
 * current entry is not marked valid.
 */
static int siw_orqe_start_rx(struct siw_qp *qp)
{
	struct siw_sqe *orqe;
	struct siw_wqe *wqe;

	if (unlikely(!qp->attrs.orq_size))
		return -EPROTO;

	/* make sure ORQ indices are current */
	smp_mb();

	orqe = orq_get_current(qp);
	if (!(READ_ONCE(orqe->flags) & SIW_WQE_VALID))
		return -EPROTO;

	/* RRESP is a TAGGED RDMAP operation */
	wqe = rx_wqe(&qp->rx_tagged);
	wqe->sqe.id = orqe->id;
	wqe->sqe.opcode = orqe->opcode;
	wqe->sqe.sge[0].laddr = orqe->sge[0].laddr;
	wqe->sqe.sge[0].lkey = orqe->sge[0].lkey;
	wqe->sqe.sge[0].length = orqe->sge[0].length;
	wqe->sqe.flags = orqe->flags;
	wqe->sqe.num_sge = 1;
	wqe->bytes = orqe->sge[0].length;
	wqe->processed = 0;
	wqe->mem[0] = NULL;

	/* make sure WQE is completely written before valid */
	smp_wmb();
	wqe->wr_status = SIW_WR_INPROGRESS;

	return 0;
}

/*
 * siw_proc_rresp:
 *
 * Place incoming RRESP data into memory referenced by RREQ WQE
 * which is at the tip of the ORQ
 *
 * Function supports partially received RRESP's (suspending/resuming
 * current receive processing)
 */
int siw_proc_rresp(struct siw_qp *qp)
{
	struct siw_rx_stream *srx = &qp->rx_stream;
	struct siw_rx_fpdu *frx = &qp->rx_tagged;
	struct siw_wqe *wqe = rx_wqe(frx);
	struct siw_mem **mem, *mem_p;
	struct siw_sge *sge;
	int bytes, rv;

	if (frx->first_ddp_seg) {
		if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
			pr_warn("siw: [QP %u]: proc RRESP: status %d, op %d\n",
				qp_id(qp), wqe->wr_status, wqe->sqe.opcode);
			rv = -EPROTO;
			goto error_term;
		}
		/*
		 * fetch pending RREQ from orq
		 */
		rv = siw_orqe_start_rx(qp);
		if (rv) {
			pr_warn("siw: [QP %u]: ORQ empty, size %d\n",
				qp_id(qp), qp->attrs.orq_size);
			goto error_term;
		}
		rv = siw_rresp_check_ntoh(srx, frx);
		if (unlikely(rv)) {
			siw_qp_event(qp, IB_EVENT_QP_FATAL);
			return rv;
		}
	} else {
		if (unlikely(wqe->wr_status != SIW_WR_INPROGRESS)) {
			pr_warn("siw: [QP %u]: resume RRESP: status %d\n",
				qp_id(qp), wqe->wr_status);
			rv = -EPROTO;
			goto error_term;
		}
	}
	if (!srx->fpdu_part_rem) /* zero length RRESPONSE */
		return 0;

	sge = wqe->sqe.sge; /* there is only one */
	mem = &wqe->mem[0];

	if (!(*mem)) {
		/*
		 * check target memory which resolves memory on first fragment
		 */
		rv = siw_check_sge(qp->pd, sge, mem, IB_ACCESS_LOCAL_WRITE, 0,
				   wqe->bytes);
		if (unlikely(rv)) {
			siw_dbg_qp(qp, "target mem check: %d\n", rv);
			wqe->wc_status = SIW_WC_LOC_PROT_ERR;

			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
					   DDP_ETYPE_TAGGED_BUF,
					   siw_tagged_error(-rv), 0);

			siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);

			return -EINVAL;
		}
	}
	mem_p = *mem;

	bytes = min(srx->fpdu_part_rem, srx->skb_new);

	if (mem_p->mem_obj == NULL)
		rv = siw_rx_kva(srx,
			ib_virt_dma_to_ptr(sge->laddr + wqe->processed),
			bytes);
	else if (!mem_p->is_pbl)
		rv = siw_rx_umem(srx, mem_p->umem, sge->laddr + wqe->processed,
				 bytes);
	else
		rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
				sge->laddr + wqe->processed, bytes);
	if (rv != bytes) {
		wqe->wc_status = SIW_WC_GENERAL_ERR;
		rv = -EINVAL;
		goto error_term;
	}
	srx->fpdu_part_rem -= rv;
	srx->fpdu_part_rcvd += rv;
	wqe->processed += rv;

	if (!srx->fpdu_part_rem) {
		srx->ddp_to += srx->fpdu_part_rcvd;
		return 0;
	}
	return -EAGAIN;

error_term:
	siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_CATASTROPHIC,
			   DDP_ECODE_CATASTROPHIC, 0);
	return rv;
}

/*
 * siw_proc_terminate:
 *
 * Process an inbound TERMINATE message. The reported error layer,
 * type and code are logged. If the TERMINATE header fields are as
 * expected and the header indicates so (flag_m), the header of the
 * offending message carried in the TERMINATE payload is copied from
 * the skb for debug output.
 *
 * A TERMINATE fragmented over several skb's is not reassembled.
 *
 * Always returns -ECONNRESET.
 */
int siw_proc_terminate(struct siw_qp *qp)
{
	struct siw_rx_stream *srx = &qp->rx_stream;
	struct sk_buff *skb = srx->skb;
	struct iwarp_terminate *term = &srx->hdr.terminate;
	union iwarp_hdr term_info;
	u8 *infop = (u8 *)&term_info;
	enum rdma_opcode op;
	u16 to_copy = sizeof(struct iwarp_ctrl);

	pr_warn("siw: got TERMINATE. layer %d, type %d, code %d\n",
		__rdmap_term_layer(term), __rdmap_term_etype(term),
		__rdmap_term_ecode(term));

	/* TERMINATE must use its own untagged queue, expected MSN, MO 0 */
	if (be32_to_cpu(term->ddp_qn) != RDMAP_UNTAGGED_QN_TERMINATE ||
	    be32_to_cpu(term->ddp_msn) !=
		    qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] ||
	    be32_to_cpu(term->ddp_mo) != 0) {
		pr_warn("siw: rx bogus TERM [QN x%08x, MSN x%08x, MO x%08x]\n",
			be32_to_cpu(term->ddp_qn), be32_to_cpu(term->ddp_msn),
			be32_to_cpu(term->ddp_mo));
		return -ECONNRESET;
	}
	/*
	 * Receive remaining pieces of TERM if indicated
	 */
	if (!term->flag_m)
		return -ECONNRESET;

	/* Do not take the effort to reassemble a network fragmented
	 * TERM message
	 */
	if (srx->skb_new < sizeof(struct iwarp_ctrl_tagged))
		return -ECONNRESET;

	memset(infop, 0, sizeof(term_info));

	/* first get the control part to learn the reported opcode */
	skb_copy_bits(skb, srx->skb_offset, infop, to_copy);

	op = __rdmap_get_opcode(&term_info.ctrl);
	if (op >= RDMAP_TERMINATE)
		goto out;

	infop += to_copy;
	srx->skb_offset += to_copy;
	srx->skb_new -= to_copy;
	srx->skb_copied += to_copy;
	srx->fpdu_part_rcvd += to_copy;
	srx->fpdu_part_rem -= to_copy;

	/* remainder of the reported header, its length depends on opcode */
	to_copy = iwarp_pktinfo[op].hdr_len - to_copy;

	/* Again, no network fragmented TERM's */
	if (to_copy + MPA_CRC_SIZE > srx->skb_new)
		return -ECONNRESET;

	skb_copy_bits(skb, srx->skb_offset, infop, to_copy);

	if (term->flag_r) {
		siw_dbg_qp(qp, "TERM reports RDMAP hdr type %u, len %u (%s)\n",
			   op, be16_to_cpu(term_info.ctrl.mpa_len),
			   term->flag_m ? "valid" : "invalid");
	} else if (term->flag_d) {
		siw_dbg_qp(qp, "TERM reports DDP hdr type %u, len %u (%s)\n",
			   op, be16_to_cpu(term_info.ctrl.mpa_len),
			   term->flag_m ? "valid" : "invalid");
	}
out:
	/* account for the bytes of the last copy step */
	srx->skb_new -= to_copy;
	srx->skb_offset += to_copy;
	srx->skb_copied += to_copy;
	srx->fpdu_part_rcvd += to_copy;
	srx->fpdu_part_rem -= to_copy;

	return -ECONNRESET;
}

/*
 * siw_get_trailer()
 *
 * Receive the trailer of the current FPDU (pad bytes, if any, directly
 * followed by the CRC) into srx->trailer and, if CRC is enabled on the
 * stream, verify the received CRC against the locally computed one.
 *
 * The trailer may arrive fragmented over several skb's. The bytes
 * still missing are tracked in srx->fpdu_part_rem; on resume, new
 * bytes are appended behind the trailer bytes already received.
 *
 * return value:
 *	0:       trailer complete (and CRC correct, if checked)
 *	-EAGAIN: to be called again to receive the rest of the trailer
 *	-EINVAL: CRC mismatch; a TERMINATE has been initialized
 */
static int siw_get_trailer(struct siw_qp *qp, struct siw_rx_stream *srx)
{
	struct sk_buff *skb = srx->skb;
	int avail = min(srx->skb_new, srx->fpdu_part_rem);
	/* trailer bytes consumed by earlier calls for this FPDU */
	int done = srx->pad + (int)sizeof(srx->trailer.crc) -
		   srx->fpdu_part_rem;
	u8 *tbuf = (u8 *)&srx->trailer.crc - srx->pad;
	__wsum crc_in, crc_own = 0;

	siw_dbg_qp(qp, "expected %d, available %d, pad %u\n",
		   srx->fpdu_part_rem, srx->skb_new, srx->pad);

	/*
	 * More bytes outstanding than a trailer can hold: do not move
	 * the write position in front of the trailer buffer.
	 */
	if (unlikely(done < 0))
		done = 0;

	/*
	 * Continue behind what was already received. Writing at the
	 * start of the trailer again would overwrite the first fragment
	 * of a trailer which is split across skb's.
	 */
	skb_copy_bits(skb, srx->skb_offset, tbuf + done, avail);

	srx->skb_new -= avail;
	srx->skb_offset += avail;
	srx->skb_copied += avail;
	srx->fpdu_part_rem -= avail;

	if (srx->fpdu_part_rem)
		return -EAGAIN;

	if (!srx->mpa_crc_hd)
		return 0;

	/* pad bytes are covered by the CRC */
	if (srx->pad)
		crypto_shash_update(srx->mpa_crc_hd, tbuf, srx->pad);
	/*
	 * CRC32 is computed, transmitted and received directly in NBO,
	 * so there's never a reason to convert byte order.
	 */
	crypto_shash_final(srx->mpa_crc_hd, (u8 *)&crc_own);
	crc_in = (__force __wsum)srx->trailer.crc;

	if (unlikely(crc_in != crc_own)) {
		pr_warn("siw: crc error. in: %08x, own %08x, op %u\n",
			crc_in, crc_own, qp->rx_stream.rdmap_op);

		siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
				   LLP_ETYPE_MPA,
				   LLP_ECODE_RECEIVED_CRC, 0);
		return -EINVAL;
	}
	return 0;
}

#define MIN_DDP_HDR sizeof(struct iwarp_ctrl_tagged)

/*
 * siw_get_hdr()
 *
 * Receive the DDP/RDMAP header of the next FPDU into srx->hdr.
 *
 * The header is received in up to two steps: first the minimum sized
 * (tagged) control part, which yields the opcode, then the rest of
 * the header whose length depends on the opcode. Both steps can be
 * resumed if the header arrives fragmented; srx->fpdu_part_rcvd
 * counts the header bytes received so far.
 *
 * After the header is complete, the CRC computation is restarted with
 * the header bytes (if CRC is enabled) and the rx context for the
 * opcode is updated with first/more DDP segment information.
 *
 * return value:
 *	0:       header completely received
 *	-EAGAIN: to be called again to receive the rest of the header
 *	-EINVAL: unsupported DDP/RDMAP version or unknown opcode;
 *	         a TERMINATE has been initialized
 *	-EPROTO: a message got intersected by another message of the
 *	         same type (tagged/untagged)
 */
static int siw_get_hdr(struct siw_rx_stream *srx)
{
	struct sk_buff *skb = srx->skb;
	struct siw_qp *qp = rx_qp(srx);
	struct iwarp_ctrl *c_hdr = &srx->hdr.ctrl;
	struct siw_rx_fpdu *frx;
	u8 opcode;
	int bytes;

	if (srx->fpdu_part_rcvd < MIN_DDP_HDR) {
		/*
		 * copy a minimum sized (tagged) DDP frame control part
		 */
		bytes = min_t(int, srx->skb_new,
			      MIN_DDP_HDR - srx->fpdu_part_rcvd);

		skb_copy_bits(skb, srx->skb_offset,
			      (char *)c_hdr + srx->fpdu_part_rcvd, bytes);

		srx->fpdu_part_rcvd += bytes;

		srx->skb_new -= bytes;
		srx->skb_offset += bytes;
		srx->skb_copied += bytes;

		if (srx->fpdu_part_rcvd < MIN_DDP_HDR)
			return -EAGAIN;

		if (unlikely(__ddp_get_version(c_hdr) != DDP_VERSION)) {
			enum ddp_etype etype;
			enum ddp_ecode ecode;

			pr_warn("siw: received ddp version unsupported %d\n",
				__ddp_get_version(c_hdr));

			if (c_hdr->ddp_rdmap_ctrl & DDP_FLAG_TAGGED) {
				etype = DDP_ETYPE_TAGGED_BUF;
				ecode = DDP_ECODE_T_VERSION;
			} else {
				etype = DDP_ETYPE_UNTAGGED_BUF;
				ecode = DDP_ECODE_UT_VERSION;
			}
			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
					   etype, ecode, 0);
			return -EINVAL;
		}
		if (unlikely(__rdmap_get_version(c_hdr) != RDMAP_VERSION)) {
			pr_warn("siw: received rdmap version unsupported %d\n",
				__rdmap_get_version(c_hdr));

			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
					   RDMAP_ETYPE_REMOTE_OPERATION,
					   RDMAP_ECODE_VERSION, 0);
			return -EINVAL;
		}
		opcode = __rdmap_get_opcode(c_hdr);

		if (opcode > RDMAP_TERMINATE) {
			pr_warn("siw: received unknown packet type %u\n",
				opcode);

			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
					   RDMAP_ETYPE_REMOTE_OPERATION,
					   RDMAP_ECODE_OPCODE, 0);
			return -EINVAL;
		}
		siw_dbg_qp(rx_qp(srx), "new header, opcode %u\n", opcode);
	} else {
		/* resuming: control part was validated by an earlier call */
		opcode = __rdmap_get_opcode(c_hdr);
	}
	set_rx_fpdu_context(qp, opcode);
	frx = qp->rx_fpdu;

	/*
	 * Figure out len of current hdr: variable length of
	 * iwarp hdr may force us to copy hdr information in
	 * two steps. Only tagged DDP messages are already
	 * completely received.
	 */
	if (iwarp_pktinfo[opcode].hdr_len > sizeof(struct iwarp_ctrl_tagged)) {
		int hdrlen = iwarp_pktinfo[opcode].hdr_len;

		/*
		 * Copy only what is still missing. Parts of the extended
		 * header may have been received by an earlier call, so
		 * the remainder is relative to fpdu_part_rcvd, not to
		 * the minimum header size. Otherwise payload bytes would
		 * get consumed as header bytes on resume.
		 */
		bytes = min_t(int, hdrlen - srx->fpdu_part_rcvd, srx->skb_new);

		skb_copy_bits(skb, srx->skb_offset,
			      (char *)c_hdr + srx->fpdu_part_rcvd, bytes);

		srx->fpdu_part_rcvd += bytes;

		srx->skb_new -= bytes;
		srx->skb_offset += bytes;
		srx->skb_copied += bytes;

		if (srx->fpdu_part_rcvd < hdrlen)
			return -EAGAIN;
	}

	/*
	 * DDP/RDMAP header receive completed. Check if the current
	 * DDP segment starts a new RDMAP message or continues a previously
	 * started RDMAP message.
	 *
	 * Alternating reception of DDP segments (or FPDUs) from incomplete
	 * tagged and untagged RDMAP messages is supported, as long as
	 * the current tagged or untagged message gets eventually completed
	 * w/o intersection from another message of the same type
	 * (tagged/untagged). E.g., a WRITE can get intersected by a SEND,
	 * but not by a READ RESPONSE etc.
	 */
	if (srx->mpa_crc_hd) {
		/*
		 * Restart CRC computation
		 */
		crypto_shash_init(srx->mpa_crc_hd);
		crypto_shash_update(srx->mpa_crc_hd, (u8 *)c_hdr,
				    srx->fpdu_part_rcvd);
	}
	if (frx->more_ddp_segs) {
		frx->first_ddp_seg = 0;
		if (frx->prev_rdmap_op != opcode) {
			pr_warn("siw: packet intersection: %u : %u\n",
				frx->prev_rdmap_op, opcode);
			/*
			 * The last inbound RDMA operation of same type
			 * (tagged or untagged) is left unfinished.
			 * To complete it in error, make it the current
			 * operation again, even with the header already
			 * overwritten. For error handling, only the opcode
			 * and current rx context are relevant.
			 */
			set_rx_fpdu_context(qp, frx->prev_rdmap_op);
			__rdmap_set_opcode(c_hdr, frx->prev_rdmap_op);
			return -EPROTO;
		}
	} else {
		frx->prev_rdmap_op = opcode;
		frx->first_ddp_seg = 1;
	}
	/* the LAST flag tells whether further DDP segments will follow */
	frx->more_ddp_segs = c_hdr->ddp_rdmap_ctrl & DDP_FLAG_LAST ? 0 : 1;

	return 0;
}

/*
 * siw_check_tx_fence()
 *
 * Release the current ORQ entry and, if send queue processing is
 * stopped by an ORQ fence, check if it can be resumed:
 *
 *   o a waiting READ or READ_LOCAL_INV is moved into a free ORQ
 *     entry and transmit processing is resumed,
 *   o any other waiting work request resumes only if the ORQ is
 *     empty.
 *
 * return value:
 *	0:       success (nothing to resume, or resume started)
 *	-EPROTO: fence is set but the waiting WQE is not in queued
 *	         state, or no free ORQ entry is available
 *	If transmit processing is resumed, the result of siw_sq_start()
 *	is returned.
 */
static int siw_check_tx_fence(struct siw_qp *qp)
{
	struct siw_wqe *tx_waiting = tx_wqe(qp);
	struct siw_sqe *rreq;
	int resume_tx = 0, rv = 0;
	unsigned long flags;

	spin_lock_irqsave(&qp->orq_lock, flags);

	/* free current orq entry */
	rreq = orq_get_current(qp);
	WRITE_ONCE(rreq->flags, 0);

	qp->orq_get++;

	if (qp->tx_ctx.orq_fence) {
		if (unlikely(tx_waiting->wr_status != SIW_WR_QUEUED)) {
			pr_warn("siw: [QP %u]: fence resume: bad status %d\n",
				qp_id(qp), tx_waiting->wr_status);
			rv = -EPROTO;
			goto out;
		}
		/* resume SQ processing, if possible */
		if (tx_waiting->sqe.opcode == SIW_OP_READ ||
		    tx_waiting->sqe.opcode == SIW_OP_READ_LOCAL_INV) {

			/* SQ processing was stopped because of a full ORQ */
			rreq = orq_get_free(qp);
			if (unlikely(!rreq)) {
				pr_warn("siw: [QP %u]: no ORQE\n", qp_id(qp));
				rv = -EPROTO;
				goto out;
			}
			siw_read_to_orq(rreq, &tx_waiting->sqe);

			qp->orq_put++;
			qp->tx_ctx.orq_fence = 0;
			resume_tx = 1;

		} else if (siw_orq_empty(qp)) {
			/*
			 * SQ processing was stopped by fenced work request.
			 * Resume since all previous Read's are now completed.
			 */
			qp->tx_ctx.orq_fence = 0;
			resume_tx = 1;
		}
	}
out:
	spin_unlock_irqrestore(&qp->orq_lock, flags);

	/* restart transmit processing only after dropping the ORQ lock */
	if (resume_tx)
		rv = siw_sq_start(qp);

	return rv;
}

/*
 * siw_rdmap_complete()
 *
 * Complete processing of an RDMA message after receiving all
 * DDP segments, or abort processing after encountering an error.
 *
 *   o SENDs + RRESPs need completion,
 *   o RREQs need READ RESPONSE initialization,
 *   o WRITEs need memory dereferencing
 *
 * TODO: Failed WRITEs need local error to be surfaced.
 */
static int siw_rdmap_complete(struct siw_qp *qp, int error)
{
	struct siw_rx_stream *srx = &qp->rx_stream;
	struct siw_wqe *wqe = rx_wqe(qp->rx_fpdu);
	enum siw_wc_status wc_status = wqe->wc_status;
	u8 opcode = __rdmap_get_opcode(&srx->hdr.ctrl);
	int rv = 0;

	switch (opcode) {
	case RDMAP_SEND_SE:
	case RDMAP_SEND_SE_INVAL:
		wqe->rqe.flags |= SIW_WQE_SOLICITED;
		fallthrough;

	case RDMAP_SEND:
	case RDMAP_SEND_INVAL:
		if (wqe->wr_status == SIW_WR_IDLE)
			break;

		srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++;

		if (error != 0 && wc_status == SIW_WC_SUCCESS)
			wc_status = SIW_WC_GENERAL_ERR;
		/*
		 * Handle STag invalidation request
		 */
		if (wc_status == SIW_WC_SUCCESS &&
		    (opcode == RDMAP_SEND_INVAL ||
		     opcode == RDMAP_SEND_SE_INVAL)) {
			rv = siw_invalidate_stag(qp->pd, srx->inval_stag);
			if (rv) {
				siw_init_terminate(
					qp, TERM_ERROR_LAYER_RDMAP,
					rv == -EACCES ?
						RDMAP_ETYPE_REMOTE_PROTECTION :
						RDMAP_ETYPE_REMOTE_OPERATION,
					RDMAP_ECODE_CANNOT_INVALIDATE, 0);

				wc_status = SIW_WC_REM_INV_REQ_ERR;
			}
			rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
					      rv ? 0 : srx->inval_stag,
					      wc_status);
		} else {
			rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
					      0, wc_status);
		}
		siw_wqe_put_mem(wqe, SIW_OP_RECEIVE);
		break;

	case RDMAP_RDMA_READ_RESP:
		if (wqe->wr_status == SIW_WR_IDLE)
			break;

		if (error != 0) {
			if ((srx->state == SIW_GET_HDR &&
			     qp->rx_fpdu->first_ddp_seg) || error == -ENODATA)
				/* possible RREQ in ORQ left untouched */
				break;

			if (wc_status == SIW_WC_SUCCESS)
				wc_status = SIW_WC_GENERAL_ERR;
		} else if (rdma_is_kernel_res(&qp->base_qp.res) &&
			   rx_type(wqe) == SIW_OP_READ_LOCAL_INV) {
			/*
			 * Handle any STag invalidation request
			 */
			rv = siw_invalidate_stag(qp->pd, wqe->sqe.sge[0].lkey);
			if (rv) {
				siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
						   RDMAP_ETYPE_CATASTROPHIC,
						   RDMAP_ECODE_UNSPECIFIED, 0);

				if (wc_status == SIW_WC_SUCCESS) {
					wc_status = SIW_WC_GENERAL_ERR;
					error = rv;
				}
			}
		}
		/*
		 * All errors turn the wqe into signalled.
		 */
		if ((wqe->sqe.flags & SIW_WQE_SIGNALLED) || error != 0)
			rv = siw_sqe_complete(qp, &wqe->sqe, wqe->processed,
					      wc_status);
		siw_wqe_put_mem(wqe, SIW_OP_READ);

		if (!error) {
			rv = siw_check_tx_fence(qp);
		} else {
			/* Disable current ORQ element */
			if (qp->attrs.orq_size)
				WRITE_ONCE(orq_get_current(qp)->flags, 0);
		}
		break;

	case RDMAP_RDMA_READ_REQ:
		if (!error) {
			rv = siw_init_rresp(qp, srx);
			srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++;
		}
		break;

	case RDMAP_RDMA_WRITE:
		if (wqe->wr_status == SIW_WR_IDLE)
			break;

		/*
		 * Free References from memory object if
		 * attached to receive context (inbound WRITE).
		 * While a zero-length WRITE is allowed,
		 * no memory reference got created.
		 */
		if (rx_mem(&qp->rx_tagged)) {
			siw_mem_put(rx_mem(&qp->rx_tagged));
			rx_mem(&qp->rx_tagged) = NULL;
		}
		break;

	default:
		break;
	}
	wqe->wr_status = SIW_WR_IDLE;

	return rv;
}

/*
 * siw_tcp_rx_data()
 *
 * Main routine to consume inbound TCP payload
 *
 * Runs the per-QP receive state machine over the payload of @skb.
 * For each FPDU, states are passed in the order
 *
 *   SIW_GET_HDR -> SIW_GET_DATA_START [-> SIW_GET_DATA_MORE ...]
 *               -> SIW_GET_TRAILER -> SIW_GET_HDR
 *
 * The state is kept in the QP's rx stream, so an FPDU part which
 * is incomplete within this skb (-EAGAIN from a state handler) gets
 * continued with the next call. Any other error aborts the RDMAP
 * message in progress, if applicable, and calls siw_qp_cm_drop().
 *
 * NOTE(review): the loop itself never decrements srx->skb_new; it
 * presumably relies on the state handlers to account for consumed
 * bytes in srx->skb_new, srx->skb_offset and srx->skb_copied.
 *
 * @rd_desc:	read descriptor, arg.data references the siw_qp
 * @skb:	socket buffer
 * @off:	offset in skb
 * @len:	skb->len - offset : payload in skb (not referenced here,
 *		the payload length is computed from skb->len and @off)
 *
 * Returns srx->skb_copied, the number of bytes of the skb reported
 * as consumed.
 */
int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
		    unsigned int off, size_t len)
{
	struct siw_qp *qp = rd_desc->arg.data;
	struct siw_rx_stream *srx = &qp->rx_stream;
	int rv;

	srx->skb = skb;
	srx->skb_new = skb->len - off;
	srx->skb_offset = off;
	srx->skb_copied = 0;

	siw_dbg_qp(qp, "new data, len %d\n", srx->skb_new);

	while (srx->skb_new) {
		/*
		 * Set to 0 by any step for which the error path below
		 * must not call siw_rdmap_complete().
		 */
		int run_completion = 1;

		if (unlikely(srx->rx_suspend)) {
			/* Do not process any more data */
			srx->skb_copied += srx->skb_new;
			break;
		}
		switch (srx->state) {
		case SIW_GET_HDR:
			rv = siw_get_hdr(srx);
			if (!rv) {
				/*
				 * Header complete. What remains of the FPDU
				 * is its MPA length plus the MPA header size,
				 * minus the bytes received so far.
				 */
				srx->fpdu_part_rem =
					be16_to_cpu(srx->hdr.ctrl.mpa_len) -
					srx->fpdu_part_rcvd + MPA_HDR_SIZE;

				/* Bytes missing to the next 4 byte boundary */
				if (srx->fpdu_part_rem)
					srx->pad = -srx->fpdu_part_rem & 0x3;
				else
					srx->pad = 0;

				srx->state = SIW_GET_DATA_START;
				srx->fpdu_part_rcvd = 0;
			}
			break;

		case SIW_GET_DATA_MORE:
			/*
			 * Another data fragment of the same DDP segment.
			 * Setting first_ddp_seg = 0 avoids repeating
			 * initializations that shall occur only once per
			 * DDP segment.
			 */
			qp->rx_fpdu->first_ddp_seg = 0;
			fallthrough;

		case SIW_GET_DATA_START:
			/*
			 * Headers will be checked by the opcode-specific
			 * data receive function below.
			 */
			rv = iwarp_pktinfo[qp->rx_stream.rdmap_op].rx_data(qp);
			if (!rv) {
				int mpa_len =
					be16_to_cpu(srx->hdr.ctrl.mpa_len)
					+ MPA_HDR_SIZE;

				/*
				 * Data complete. The trailer to fetch next
				 * consists of the padding up to a 4 byte
				 * boundary plus the CRC.
				 */
				srx->fpdu_part_rem = (-mpa_len & 0x3)
						      + MPA_CRC_SIZE;
				srx->fpdu_part_rcvd = 0;
				srx->state = SIW_GET_TRAILER;
			} else {
				/*
				 * On -ECONNRESET, skip message completion
				 * in the error path. Any other result,
				 * including -EAGAIN, continues with
				 * SIW_GET_DATA_MORE.
				 */
				if (unlikely(rv == -ECONNRESET))
					run_completion = 0;
				else
					srx->state = SIW_GET_DATA_MORE;
			}
			break;

		case SIW_GET_TRAILER:
			/*
			 * read CRC + any padding
			 */
			rv = siw_get_trailer(qp, srx);
			if (likely(!rv)) {
				/*
				 * FPDU completed.
				 * complete RDMAP message if last fragment
				 */
				srx->state = SIW_GET_HDR;
				srx->fpdu_part_rcvd = 0;

				if (!(srx->hdr.ctrl.ddp_rdmap_ctrl &
				      DDP_FLAG_LAST))
					/* more frags */
					break;

				/*
				 * Completion already ran here. If it
				 * failed, the error path below must not
				 * run it a second time.
				 */
				rv = siw_rdmap_complete(qp, 0);
				run_completion = 0;
			}
			break;

		default:
			pr_warn("QP[%u]: RX out of state\n", qp_id(qp));
			rv = -EPROTO;
			run_completion = 0;
		}
		if (unlikely(rv != 0 && rv != -EAGAIN)) {
			/*
			 * Hard error. Abort the RDMAP message if receive
			 * processing is within an FPDU or more DDP segments
			 * of the message are outstanding, unless the failing
			 * step cleared run_completion.
			 */
			if ((srx->state > SIW_GET_HDR ||
			     qp->rx_fpdu->more_ddp_segs) && run_completion)
				siw_rdmap_complete(qp, rv);

			siw_dbg_qp(qp, "rx error %d, rx state %d\n", rv,
				   srx->state);

			siw_qp_cm_drop(qp, 1);

			break;
		}
		if (rv) {
			/*
			 * Only -EAGAIN gets here: the current FPDU part is
			 * incomplete. Stop and keep the state for the next
			 * call.
			 */
			siw_dbg_qp(qp, "fpdu fragment, state %d, missing %d\n",
				   srx->state, srx->fpdu_part_rem);
			break;
		}
	}
	return srx->skb_copied;
}