// SPDX-License-Identifier: GPL-2.0
/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.gnu.org/licenses/gpl-2.0.html
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2011, 2012, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * lustre/include/lustre_mdc.h
 *
 * MDS data structures.
 * See also lustre_idl.h for wire formats of requests.
 */

#ifndef _LUSTRE_MDC_H
#define _LUSTRE_MDC_H

/** \defgroup mdc mdc
 *
 * @{
 */

#include <linux/fs.h>
#include <linux/dcache.h>
#include <lustre_intent.h>
#include <lustre_handles.h>
#include <linux/libcfs/libcfs.h>
#include <obd_class.h>
#include <lustre_lib.h>
#include <lustre_dlm.h>
#include <lustre_export.h>

/*
 * Forward declarations of struct tags.
 * NOTE(review): struct ptlrpc_client and struct obd_device are not
 * referenced in the visible part of this header - confirm whether these
 * two declarations are still needed.
 */
struct ptlrpc_client;
struct obd_export;
struct ptlrpc_request;
struct obd_device;

/**
 * Serializes in-flight MDT-modifying RPC requests to preserve idempotency.
 *
 * This mutex is used to implement execute-once semantics on the MDT.
 * The MDT stores the last transaction ID and result for every client in
 * its last_rcvd file. If the client doesn't get a reply, it can safely
 * resend the request and the MDT will reconstruct the reply being aware
 * that the request has already been executed. Without this lock,
 * execution status of concurrent in-flight requests would be
 * overwritten.
 *
 * This design limits the extent to which we can keep a full pipeline of
 * in-flight requests from a single client.  This limitation could be
 * overcome by allowing multiple slots per client in the last_rcvd file.
 */
struct mdc_rpc_lock {
	/**
	 * Lock protecting in-flight RPC concurrency.  Taken by
	 * mdc_get_rpc_lock() and dropped by mdc_put_rpc_lock().
	 */
	struct mutex		rpcl_mutex;
	/**
	 * Intent associated with currently executing request, NULL when
	 * no request owns the lock, or MDC_FAKE_RPCL_IT while fake
	 * (OBD_FAIL_MDC_RPCS_SEM) requests are outstanding.
	 */
	struct lookup_intent	*rpcl_it;
	/**
	 * Used for MDS/RPC load testing purposes: number of fake
	 * requests that have been admitted by mdc_get_rpc_lock() and
	 * not yet released by mdc_put_rpc_lock().
	 */
	int			rpcl_fakes;
};

/*
 * Sentinel stored in mdc_rpc_lock::rpcl_it while requests admitted under
 * the OBD_FAIL_MDC_RPCS_SEM fail check are in flight.  It is only ever
 * compared against rpcl_it, never dereferenced in this header.
 */
#define MDC_FAKE_RPCL_IT ((void *)0x2c0012bfUL)

/**
 * Initialize an MDC RPC lock to the "no request in flight" state.
 *
 * Every field is set explicitly so that the result does not depend on
 * the caller having zeroed the memory beforehand: rpcl_fakes is
 * incremented by mdc_get_rpc_lock() and asserted to be positive by
 * mdc_put_rpc_lock(), so it must start from a known value.
 *
 * \param[in] lck	lock to initialize
 */
static inline void mdc_init_rpc_lock(struct mdc_rpc_lock *lck)
{
	mutex_init(&lck->rpcl_mutex);
	lck->rpcl_it = NULL;
	lck->rpcl_fakes = 0;
}

/**
 * Acquire the per-client lock that serializes modifying MDT requests.
 *
 * Requests whose intent is IT_GETATTR, IT_LOOKUP, IT_LAYOUT or IT_READDIR
 * are not serialized and return immediately without touching the lock.
 *
 * For every other request (including \a it == NULL) rpcl_mutex is taken
 * and is still held when this function returns; \a it is recorded in
 * rpcl_it and the mutex is dropped by the matching mdc_put_rpc_lock().
 *
 * When the OBD_FAIL_MDC_RPCS_SEM fail check triggers, the request is
 * counted as a "fake" one instead: rpcl_it is set to MDC_FAKE_RPCL_IT,
 * rpcl_fakes is incremented and the mutex is released before returning.
 *
 * \param[in] lck	RPC lock of the MDC device
 * \param[in] it	intent of the request being sent, may be NULL
 */
static inline void mdc_get_rpc_lock(struct mdc_rpc_lock *lck,
				    struct lookup_intent *it)
{
	if (it && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP ||
		   it->it_op == IT_LAYOUT || it->it_op == IT_READDIR))
		return;

	/* This would normally block until the existing request finishes.
	 * If fail_loc is set it will block until the regular request is
	 * done, then set rpcl_it to MDC_FAKE_RPCL_IT.  Once that is set
	 * it will only be cleared when all fake requests are finished.
	 * Only when all fake requests are finished can normal requests
	 * be sent, to ensure they are recoverable again.
	 */
	for (;;) {
		mutex_lock(&lck->rpcl_mutex);

		if (CFS_FAIL_CHECK_QUIET(OBD_FAIL_MDC_RPCS_SEM)) {
			lck->rpcl_it = MDC_FAKE_RPCL_IT;
			lck->rpcl_fakes++;
			mutex_unlock(&lck->rpcl_mutex);
			return;
		}

		/* This will only happen when the CFS_FAIL_CHECK() was
		 * just turned off but there are still requests in progress.
		 * Wait until they finish.  It doesn't need to be efficient
		 * in this extremely rare case, just have low overhead in
		 * the common case when it isn't true.
		 */
		if (unlikely(lck->rpcl_it == MDC_FAKE_RPCL_IT)) {
			mutex_unlock(&lck->rpcl_mutex);
			/* A bare schedule_timeout() returns at once unless
			 * the task state was changed from TASK_RUNNING
			 * first, which would turn this retry into a busy
			 * loop.  The _uninterruptible variant sets the
			 * state itself, so the task really sleeps.
			 */
			schedule_timeout_uninterruptible(cfs_time_seconds(1) / 4);
			continue;
		}

		break;
	}

	/* rpcl_mutex is held from here until mdc_put_rpc_lock(). */
	LASSERT(!lck->rpcl_it);
	lck->rpcl_it = it;
}

/**
 * Release what the matching mdc_get_rpc_lock() call acquired.
 *
 * Intents of type IT_GETATTR, IT_LOOKUP, IT_LAYOUT and IT_READDIR never
 * took the lock, so nothing is done for them.  A regular request clears
 * rpcl_it and drops rpcl_mutex, which it has held since
 * mdc_get_rpc_lock().  A fake (OBD_FAIL_MDC_RPCS_SEM) request re-takes
 * the mutex, decrements rpcl_fakes and clears rpcl_it once the count
 * reaches zero.
 *
 * \param[in] lck	RPC lock of the MDC device
 * \param[in] it	intent that was passed to mdc_get_rpc_lock()
 */
static inline void mdc_put_rpc_lock(struct mdc_rpc_lock *lck,
				    struct lookup_intent *it)
{
	if (it) {
		/* Unserialized intent types: no lock was taken for them. */
		if (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP ||
		    it->it_op == IT_LAYOUT || it->it_op == IT_READDIR)
			return;
	}

	if (lck->rpcl_it != MDC_FAKE_RPCL_IT) {
		/* Regular request: the mutex is already held by us. */
		LASSERTF(it == lck->rpcl_it, "%p != %p\n", it, lck->rpcl_it);
		lck->rpcl_it = NULL;
		mutex_unlock(&lck->rpcl_mutex);
		return;
	}

	/* OBD_FAIL_MDC_RPCS_SEM: the mutex was dropped at "get" time. */
	mutex_lock(&lck->rpcl_mutex);

	LASSERTF(lck->rpcl_fakes > 0, "%d\n", lck->rpcl_fakes);
	lck->rpcl_fakes--;

	/* The last fake request out re-opens the lock for regular ones. */
	if (!lck->rpcl_fakes)
		lck->rpcl_it = NULL;

	mutex_unlock(&lck->rpcl_mutex);
}

/**
 * Obtain a modify-RPC slot for a request and stamp its tag on the message.
 *
 * The opcode is read from the request message, a tag is obtained from
 * obd_get_mod_rpc_slot() for the client_obd behind the request's import,
 * and that tag is stored in the request message.
 *
 * \param[in] req	request about to be sent
 * \param[in] it	intent of the request, forwarded unchanged to
 *			obd_get_mod_rpc_slot()
 */
static inline void mdc_get_mod_rpc_slot(struct ptlrpc_request *req,
					struct lookup_intent *it)
{
	struct client_obd *client = &req->rq_import->imp_obd->u.cli;
	u32 opcode = lustre_msg_get_opc(req->rq_reqmsg);
	u16 slot_tag = obd_get_mod_rpc_slot(client, opcode, it);

	lustre_msg_set_tag(req->rq_reqmsg, slot_tag);
}

/**
 * Give back the modify-RPC slot recorded in a request message.
 *
 * The opcode and tag are read back from the request message and handed
 * to obd_put_mod_rpc_slot() together with the client_obd behind the
 * request's import.
 *
 * \param[in] req	request whose slot is being released
 * \param[in] it	intent of the request, forwarded unchanged to
 *			obd_put_mod_rpc_slot()
 */
static inline void mdc_put_mod_rpc_slot(struct ptlrpc_request *req,
					struct lookup_intent *it)
{
	struct client_obd *client = &req->rq_import->imp_obd->u.cli;
	u32 opcode = lustre_msg_get_opc(req->rq_reqmsg);
	u16 slot_tag = lustre_msg_get_tag(req->rq_reqmsg);

	obd_put_mod_rpc_slot(client, opcode, it, slot_tag);
}

/**
 * Refresh the client's EA size limits from an MDT reply body.
 *
 * Nothing is done unless the body carries OBD_MD_FLMODEASIZE.  When it
 * does, cl_max_mds_easize is raised to mbo_max_mdsize if that is larger
 * (it is never lowered here), and cl_default_mds_easize is set to
 * mbo_max_mdsize capped at OBD_MAX_DEFAULT_EA_SIZE.
 *
 * \see client_obd::cl_default_mds_easize
 *
 * \param[in] exp	export for MDC device
 * \param[in] body	body of ptlrpc reply from MDT
 *
 */
static inline void mdc_update_max_ea_from_body(struct obd_export *exp,
					       struct mdt_body *body)
{
	struct client_obd *cli;

	/* The reply carries no EA size information: leave limits alone. */
	if (!(body->mbo_valid & OBD_MD_FLMODEASIZE))
		return;

	cli = &exp->exp_obd->u.cli;

	/* The maximum only ever grows. */
	if (body->mbo_max_mdsize > cli->cl_max_mds_easize)
		cli->cl_max_mds_easize = body->mbo_max_mdsize;

	/* The default follows the server value, up to the fixed cap. */
	cli->cl_default_mds_easize = min_t(__u32, body->mbo_max_mdsize,
					   OBD_MAX_DEFAULT_EA_SIZE);
}

/* mdc/mdc_locks.c */
/*
 * NOTE(review): only the prototype is visible in this header; the meaning
 * of \a phase and of the return value should be confirmed against the
 * definition in mdc/mdc_locks.c.
 */
int it_open_error(int phase, struct lookup_intent *it);

/**
 * Test whether every bit of O_LOV_DELAY_CREATE is set in \a flags.
 *
 * \param[in] flags	flag word to inspect
 *
 * \retval true		all O_LOV_DELAY_CREATE bits are present
 * \retval false	at least one of them is missing
 */
static inline bool cl_is_lov_delay_create(unsigned int flags)
{
	/* A partial match does not count: the whole mask must be set. */
	if ((flags & O_LOV_DELAY_CREATE) != O_LOV_DELAY_CREATE)
		return false;

	return true;
}

/**
 * Clear O_LOV_DELAY_CREATE from \a *flags, but only if it is fully set.
 *
 * If any bit of O_LOV_DELAY_CREATE is missing from \a *flags, the word
 * is left untouched.
 *
 * \param[in,out] flags	flag word to update in place
 */
static inline void cl_lov_delay_create_clear(unsigned int *flags)
{
	unsigned int current_flags = *flags;

	/* Only a complete match is cleared; partial matches are kept. */
	if ((current_flags & O_LOV_DELAY_CREATE) != O_LOV_DELAY_CREATE)
		return;

	*flags = current_flags & ~O_LOV_DELAY_CREATE;
}

/** @} mdc */

#endif