From 0b9ea37f24e247ed69baabf27fb211aa6a3e7622 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 27 Mar 2013 10:15:37 -0400
Subject: [PATCH 01/60] nfsd: eliminate one of the DRC cache searches

The most common case is to do a search of the cache, followed by an
insert. In the case where we have to allocate an entry off the slab,
then we end up having to redo the search, which is wasteful.

Better optimize the code for the common case by eliminating the initial
search of the cache and always preallocating an entry. In the case of a
cache hit, we'll end up just freeing that entry but that's preferable to
an extra search.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfscache.c | 41 +++++++++++++++++++----------------------
 1 file changed, 19 insertions(+), 22 deletions(-)

diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index ca05f6dc3544..c61391e8e09d 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -318,55 +318,53 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
 	__wsum			csum;
 	unsigned long		age;
 	int type = rqstp->rq_cachetype;
-	int rtn;
+	int rtn = RC_DOIT;
 
 	rqstp->rq_cacherep = NULL;
 	if (type == RC_NOCACHE) {
 		nfsdstats.rcnocache++;
-		return RC_DOIT;
+		return rtn;
 	}
 
 	csum = nfsd_cache_csum(rqstp);
 
+	/*
+	 * Since the common case is a cache miss followed by an insert,
+	 * preallocate an entry. First, try to reuse the first entry on the LRU
+	 * if it works, then go ahead and prune the LRU list.
+	 */
 	spin_lock(&cache_lock);
-	rtn = RC_DOIT;
-
-	rp = nfsd_cache_search(rqstp, csum);
-	if (rp)
-		goto found_entry;
-
-	/* Try to use the first entry on the LRU */
 	if (!list_empty(&lru_head)) {
 		rp = list_first_entry(&lru_head, struct svc_cacherep, c_lru);
 		if (nfsd_cache_entry_expired(rp) ||
 		    num_drc_entries >= max_drc_entries) {
 			lru_put_end(rp);
 			prune_cache_entries();
-			goto setup_entry;
+			goto search_cache;
 		}
 	}
 
-	/* Drop the lock and allocate a new entry */
+	/* No expired ones available, allocate a new one. */
 	spin_unlock(&cache_lock);
 	rp = nfsd_reply_cache_alloc();
-	if (!rp) {
-		dprintk("nfsd: unable to allocate DRC entry!\n");
-		return RC_DOIT;
-	}
 	spin_lock(&cache_lock);
-	++num_drc_entries;
+	if (likely(rp))
+		++num_drc_entries;
 
-	/*
-	 * Must search again just in case someone inserted one
-	 * after we dropped the lock above.
-	 */
+search_cache:
 	found = nfsd_cache_search(rqstp, csum);
 	if (found) {
-		nfsd_reply_cache_free_locked(rp);
+		if (likely(rp))
+			nfsd_reply_cache_free_locked(rp);
 		rp = found;
 		goto found_entry;
 	}
 
+	if (!rp) {
+		dprintk("nfsd: unable to allocate DRC entry!\n");
+		goto out;
+	}
+
 	/*
 	 * We're keeping the one we just allocated. Are we now over the
 	 * limit? Prune one off the tip of the LRU in trade for the one we
@@ -376,7 +374,6 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
 		nfsd_reply_cache_free_locked(list_first_entry(&lru_head,
 						struct svc_cacherep, c_lru));
 
-setup_entry:
 	nfsdstats.rcmisses++;
 	rqstp->rq_cacherep = rp;
 	rp->c_state = RC_INPROG;

From 9dc56143c298692276231735ec6546c1fac596e0 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 27 Mar 2013 10:15:37 -0400
Subject: [PATCH 02/60] nfsd: break out comparator into separate function

Break out the function that compares the rqstp and checksum against a
reply cache entry. While we're at it, track the efficacy of the checksum
over the NFS data by tracking the cases where we would have incorrectly
matched a DRC entry if we had not tracked it or the length.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfscache.c | 46 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index c61391e8e09d..48f5ef944234 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -23,9 +23,21 @@
 static struct hlist_head *	cache_hash;
 static struct list_head 	lru_head;
 static struct kmem_cache	*drc_slab;
-static unsigned int		num_drc_entries;
+
+/* max number of entries allowed in the cache */
 static unsigned int		max_drc_entries;
 
+/*
+ * Stats and other tracking of on the duplicate reply cache. All of these and
+ * the "rc" fields in nfsdstats are protected by the cache_lock
+ */
+
+/* total number of entries */
+static unsigned int		num_drc_entries;
+
+/* cache misses due only to checksum comparison failures */
+static unsigned int		payload_misses;
+
 /*
  * Calculate the hash index from an XID.
  */
@@ -273,6 +285,26 @@ nfsd_cache_csum(struct svc_rqst *rqstp)
 	return csum;
 }
 
+static bool
+nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp)
+{
+	/* Check RPC header info first */
+	if (rqstp->rq_xid != rp->c_xid || rqstp->rq_proc != rp->c_proc ||
+	    rqstp->rq_prot != rp->c_prot || rqstp->rq_vers != rp->c_vers ||
+	    rqstp->rq_arg.len != rp->c_len ||
+	    !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) ||
+	    rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr))
+		return false;
+
+	/* compare checksum of NFS data */
+	if (csum != rp->c_csum) {
+		++payload_misses;
+		return false;
+	}
+
+	return true;
+}
+
 /*
  * Search the request hash for an entry that matches the given rqstp.
  * Must be called with cache_lock held. Returns the found entry or
@@ -283,18 +315,10 @@ nfsd_cache_search(struct svc_rqst *rqstp, __wsum csum)
 {
 	struct svc_cacherep	*rp;
 	struct hlist_head 	*rh;
-	__be32			xid = rqstp->rq_xid;
-	u32			proto =  rqstp->rq_prot,
-				vers = rqstp->rq_vers,
-				proc = rqstp->rq_proc;
 
-	rh = &cache_hash[request_hash(xid)];
+	rh = &cache_hash[request_hash(rqstp->rq_xid)];
 	hlist_for_each_entry(rp, rh, c_hash) {
-		if (xid == rp->c_xid && proc == rp->c_proc &&
-		    proto == rp->c_prot && vers == rp->c_vers &&
-		    rqstp->rq_arg.len == rp->c_len && csum == rp->c_csum &&
-		    rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) &&
-		    rpc_get_port(svc_addr(rqstp)) == rpc_get_port((struct sockaddr *)&rp->c_addr))
+		if (nfsd_cache_match(rqstp, csum, rp))
 			return rp;
 	}
 	return NULL;

From 6c6910cd4d0cdb905fbba8c751afd143696930f2 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 27 Mar 2013 10:15:38 -0400
Subject: [PATCH 03/60] nfsd: track memory utilization by the DRC

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfscache.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 48f5ef944234..1f45b3353bb1 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -38,6 +38,9 @@ static unsigned int		num_drc_entries;
 /* cache misses due only to checksum comparison failures */
 static unsigned int		payload_misses;
 
+/* amount of memory (in bytes) currently consumed by the DRC */
+static unsigned int		drc_mem_usage;
+
 /*
  * Calculate the hash index from an XID.
  */
@@ -112,12 +115,15 @@ nfsd_reply_cache_alloc(void)
 static void
 nfsd_reply_cache_free_locked(struct svc_cacherep *rp)
 {
-	if (rp->c_type == RC_REPLBUFF)
+	if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) {
+		drc_mem_usage -= rp->c_replvec.iov_len;
 		kfree(rp->c_replvec.iov_base);
+	}
 	if (!hlist_unhashed(&rp->c_hash))
 		hlist_del(&rp->c_hash);
 	list_del(&rp->c_lru);
 	--num_drc_entries;
+	drc_mem_usage -= sizeof(*rp);
 	kmem_cache_free(drc_slab, rp);
 }
 
@@ -372,8 +378,10 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
 	spin_unlock(&cache_lock);
 	rp = nfsd_reply_cache_alloc();
 	spin_lock(&cache_lock);
-	if (likely(rp))
+	if (likely(rp)) {
 		++num_drc_entries;
+		drc_mem_usage += sizeof(*rp);
+	}
 
 search_cache:
 	found = nfsd_cache_search(rqstp, csum);
@@ -415,6 +423,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
 
 	/* release any buffer */
 	if (rp->c_type == RC_REPLBUFF) {
+		drc_mem_usage -= rp->c_replvec.iov_len;
 		kfree(rp->c_replvec.iov_base);
 		rp->c_replvec.iov_base = NULL;
 	}
@@ -483,6 +492,7 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
 	struct svc_cacherep *rp = rqstp->rq_cacherep;
 	struct kvec	*resv = &rqstp->rq_res.head[0], *cachv;
 	int		len;
+	size_t		bufsize = 0;
 
 	if (!rp)
 		return;
@@ -504,19 +514,21 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
 		break;
 	case RC_REPLBUFF:
 		cachv = &rp->c_replvec;
-		cachv->iov_base = kmalloc(len << 2, GFP_KERNEL);
+		bufsize = len << 2;
+		cachv->iov_base = kmalloc(bufsize, GFP_KERNEL);
 		if (!cachv->iov_base) {
 			nfsd_reply_cache_free(rp);
 			return;
 		}
-		cachv->iov_len = len << 2;
-		memcpy(cachv->iov_base, statp, len << 2);
+		cachv->iov_len = bufsize;
+		memcpy(cachv->iov_base, statp, bufsize);
 		break;
 	case RC_NOCACHE:
 		nfsd_reply_cache_free(rp);
 		return;
 	}
 	spin_lock(&cache_lock);
+	drc_mem_usage += bufsize;
 	lru_put_end(rp);
 	rp->c_secure = rqstp->rq_secure;
 	rp->c_type = cachetype;

From a2f999a37ebb77e857d3a178bd6f52d1163cd980 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 27 Mar 2013 10:15:38 -0400
Subject: [PATCH 04/60] nfsd: add new reply_cache_stats file in nfsdfs

For presenting statistics relating to duplicate reply cache.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/cache.h    |  1 +
 fs/nfsd/nfscache.c | 25 +++++++++++++++++++++++++
 fs/nfsd/nfsctl.c   |  9 +++++++++
 3 files changed, 35 insertions(+)

diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index 87fd1410b737..d5c5b3e00266 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -82,6 +82,7 @@ int	nfsd_reply_cache_init(void);
 void	nfsd_reply_cache_shutdown(void);
 int	nfsd_cache_lookup(struct svc_rqst *);
 void	nfsd_cache_update(struct svc_rqst *, int, __be32 *);
+int	nfsd_reply_cache_stats_open(struct inode *, struct file *);
 
 #ifdef CONFIG_NFSD_V4
 void	nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp);
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 1f45b3353bb1..fd81ca79a002 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -556,3 +556,28 @@ nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data)
 	vec->iov_len += data->iov_len;
 	return 1;
 }
+
+/*
+ * Note that fields may be added, removed or reordered in the future. Programs
+ * scraping this file for info should test the labels to ensure they're
+ * getting the correct field.
+ */
+static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
+{
+	spin_lock(&cache_lock);
+	seq_printf(m, "max entries:           %u\n", max_drc_entries);
+	seq_printf(m, "num entries:           %u\n", num_drc_entries);
+	seq_printf(m, "hash buckets:          %u\n", HASHSIZE);
+	seq_printf(m, "mem usage:             %u\n", drc_mem_usage);
+	seq_printf(m, "cache hits:            %u\n", nfsdstats.rchits);
+	seq_printf(m, "cache misses:          %u\n", nfsdstats.rcmisses);
+	seq_printf(m, "not cached:            %u\n", nfsdstats.rcnocache);
+	seq_printf(m, "payload misses:        %u\n", payload_misses);
+	spin_unlock(&cache_lock);
+	return 0;
+}
+
+int nfsd_reply_cache_stats_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, nfsd_reply_cache_stats_show, NULL);
+}
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index f33455b4d957..a830f33df3ef 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -35,6 +35,7 @@ enum {
 	NFSD_Threads,
 	NFSD_Pool_Threads,
 	NFSD_Pool_Stats,
+	NFSD_Reply_Cache_Stats,
 	NFSD_Versions,
 	NFSD_Ports,
 	NFSD_MaxBlkSize,
@@ -212,6 +213,13 @@ static const struct file_operations pool_stats_operations = {
 	.owner		= THIS_MODULE,
 };
 
+static struct file_operations reply_cache_stats_operations = {
+	.open		= nfsd_reply_cache_stats_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 /*----------------------------------------------------------------------------*/
 /*
  * payload - write methods
@@ -1047,6 +1055,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 		[NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO},
+		[NFSD_Reply_Cache_Stats] = {"reply_cache_stats", &reply_cache_stats_operations, S_IRUGO},
 		[NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
 		[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},

From 98d821bda189ba2bfcb3877ea3064da3403698ae Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 27 Mar 2013 10:15:39 -0400
Subject: [PATCH 05/60] nfsd: keep stats on worst hash balancing seen so far

The typical case with the DRC is a cache miss, so if we keep track of
the max number of entries that we've ever walked over in a search, then
we should have a reasonable estimate of the longest hash chain that
we've ever seen.

With that, we'll also keep track of the total size of the cache when we
see the longest chain. In the case of a tie, we prefer to track the
smallest total cache size in order to properly gauge the worst-case
ratio of max vs. avg chain length.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfscache.c | 31 +++++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index fd81ca79a002..17cb0d6b9944 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -41,6 +41,12 @@ static unsigned int		payload_misses;
 /* amount of memory (in bytes) currently consumed by the DRC */
 static unsigned int		drc_mem_usage;
 
+/* longest hash chain seen */
+static unsigned int		longest_chain;
+
+/* size of cache when we saw the longest hash chain */
+static unsigned int		longest_chain_cachesize;
+
 /*
  * Calculate the hash index from an XID.
  */
@@ -319,15 +325,30 @@ nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp)
 static struct svc_cacherep *
 nfsd_cache_search(struct svc_rqst *rqstp, __wsum csum)
 {
-	struct svc_cacherep	*rp;
+	struct svc_cacherep	*rp, *ret = NULL;
 	struct hlist_head 	*rh;
+	unsigned int		entries = 0;
 
 	rh = &cache_hash[request_hash(rqstp->rq_xid)];
 	hlist_for_each_entry(rp, rh, c_hash) {
-		if (nfsd_cache_match(rqstp, csum, rp))
-			return rp;
+		++entries;
+		if (nfsd_cache_match(rqstp, csum, rp)) {
+			ret = rp;
+			break;
+		}
 	}
-	return NULL;
+
+	/* tally hash chain length stats */
+	if (entries > longest_chain) {
+		longest_chain = entries;
+		longest_chain_cachesize = num_drc_entries;
+	} else if (entries == longest_chain) {
+		/* prefer to keep the smallest cachesize possible here */
+		longest_chain_cachesize = min(longest_chain_cachesize,
+						num_drc_entries);
+	}
+
+	return ret;
 }
 
 /*
@@ -573,6 +594,8 @@ static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
 	seq_printf(m, "cache misses:          %u\n", nfsdstats.rcmisses);
 	seq_printf(m, "not cached:            %u\n", nfsdstats.rcnocache);
 	seq_printf(m, "payload misses:        %u\n", payload_misses);
+	seq_printf(m, "longest chain len:     %u\n", longest_chain);
+	seq_printf(m, "cachesize at longest:  %u\n", longest_chain_cachesize);
 	spin_unlock(&cache_lock);
 	return 0;
 }

From 0733c7ba1ef0d7e29a11c52f4d1356fc394de334 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 27 Mar 2013 10:15:39 -0400
Subject: [PATCH 06/60] nfsd: scale up the number of DRC hash buckets with
 cache size

We've now increased the size of the duplicate reply cache by quite a
bit, but the number of hash buckets has not changed. So, we've gone from
an average hash chain length of 16 in the old code to 4096 when the
cache is its largest. Change the code to scale out the number of buckets
with the max size of the cache.

At the same time, we also need to fix the hash function since the
existing one isn't really suitable when there are more than 256 buckets.
Move instead to use the stock hash_32 function for this. Testing on a
machine that had 2048 buckets showed that this gave a smaller
longest:average ratio than the existing hash function:

The formula here is longest hash bucket searched divided by average
number of entries per bucket at the time that we saw that longest
bucket:

    old hash: 68/(39258/2048) == 3.547404
    hash_32:  45/(33773/2048) == 2.728807

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfscache.c | 44 +++++++++++++++++++++++++++++---------------
 1 file changed, 29 insertions(+), 15 deletions(-)

diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 17cb0d6b9944..eb2587745a64 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -11,6 +11,8 @@
 #include <linux/slab.h>
 #include <linux/sunrpc/addr.h>
 #include <linux/highmem.h>
+#include <linux/log2.h>
+#include <linux/hash.h>
 #include <net/checksum.h>
 
 #include "nfsd.h"
@@ -18,7 +20,12 @@
 
 #define NFSDDBG_FACILITY	NFSDDBG_REPCACHE
 
-#define HASHSIZE		64
+/*
+ * We use this value to determine the number of hash buckets from the max
+ * cache size, the idea being that when the cache is at its maximum number
+ * of entries, then this should be the average number of entries per bucket.
+ */
+#define TARGET_BUCKET_SIZE	64
 
 static struct hlist_head *	cache_hash;
 static struct list_head 	lru_head;
@@ -27,6 +34,9 @@ static struct kmem_cache	*drc_slab;
 /* max number of entries allowed in the cache */
 static unsigned int		max_drc_entries;
 
+/* number of significant bits in the hash value */
+static unsigned int		maskbits;
+
 /*
  * Stats and other tracking of on the duplicate reply cache. All of these and
  * the "rc" fields in nfsdstats are protected by the cache_lock
@@ -47,16 +57,6 @@ static unsigned int		longest_chain;
 /* size of cache when we saw the longest hash chain */
 static unsigned int		longest_chain_cachesize;
 
-/*
- * Calculate the hash index from an XID.
- */
-static inline u32 request_hash(u32 xid)
-{
-	u32 h = xid;
-	h ^= (xid >> 24);
-	return h & (HASHSIZE-1);
-}
-
 static int	nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
 static void	cache_cleaner_func(struct work_struct *unused);
 static int 	nfsd_reply_cache_shrink(struct shrinker *shrink,
@@ -103,6 +103,16 @@ nfsd_cache_size_limit(void)
 	return min_t(unsigned int, limit, 256*1024);
 }
 
+/*
+ * Compute the number of hash buckets we need. Divide the max cachesize by
+ * the "target" max bucket size, and round up to next power of two.
+ */
+static unsigned int
+nfsd_hashsize(unsigned int limit)
+{
+	return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE);
+}
+
 static struct svc_cacherep *
 nfsd_reply_cache_alloc(void)
 {
@@ -143,9 +153,13 @@ nfsd_reply_cache_free(struct svc_cacherep *rp)
 
 int nfsd_reply_cache_init(void)
 {
+	unsigned int hashsize;
+
 	INIT_LIST_HEAD(&lru_head);
 	max_drc_entries = nfsd_cache_size_limit();
 	num_drc_entries = 0;
+	hashsize = nfsd_hashsize(max_drc_entries);
+	maskbits = ilog2(hashsize);
 
 	register_shrinker(&nfsd_reply_cache_shrinker);
 	drc_slab = kmem_cache_create("nfsd_drc", sizeof(struct svc_cacherep),
@@ -153,7 +167,7 @@ int nfsd_reply_cache_init(void)
 	if (!drc_slab)
 		goto out_nomem;
 
-	cache_hash = kcalloc(HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL);
+	cache_hash = kcalloc(hashsize, sizeof(struct hlist_head), GFP_KERNEL);
 	if (!cache_hash)
 		goto out_nomem;
 
@@ -204,7 +218,7 @@ static void
 hash_refile(struct svc_cacherep *rp)
 {
 	hlist_del_init(&rp->c_hash);
-	hlist_add_head(&rp->c_hash, cache_hash + request_hash(rp->c_xid));
+	hlist_add_head(&rp->c_hash, cache_hash + hash_32(rp->c_xid, maskbits));
 }
 
 static inline bool
@@ -329,7 +343,7 @@ nfsd_cache_search(struct svc_rqst *rqstp, __wsum csum)
 	struct hlist_head 	*rh;
 	unsigned int		entries = 0;
 
-	rh = &cache_hash[request_hash(rqstp->rq_xid)];
+	rh = &cache_hash[hash_32(rqstp->rq_xid, maskbits)];
 	hlist_for_each_entry(rp, rh, c_hash) {
 		++entries;
 		if (nfsd_cache_match(rqstp, csum, rp)) {
@@ -588,7 +602,7 @@ static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
 	spin_lock(&cache_lock);
 	seq_printf(m, "max entries:           %u\n", max_drc_entries);
 	seq_printf(m, "num entries:           %u\n", num_drc_entries);
-	seq_printf(m, "hash buckets:          %u\n", HASHSIZE);
+	seq_printf(m, "hash buckets:          %u\n", 1 << maskbits);
 	seq_printf(m, "mem usage:             %u\n", drc_mem_usage);
 	seq_printf(m, "cache hits:            %u\n", nfsdstats.rchits);
 	seq_printf(m, "cache misses:          %u\n", nfsdstats.rcmisses);

From b600de7ab9288eaf6126561203e0ae340828ab44 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 28 Feb 2013 11:55:46 -0800
Subject: [PATCH 07/60] nfsd4: remove BUG_ON

This BUG_ON just crashes the thread a little earlier than it would
otherwise--it doesn't seem useful.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4proc.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index ae73175e6e68..c7e4e8c28827 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1282,12 +1282,9 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 		if (op->status)
 			goto encode_op;
 
-		if (opdesc->op_func) {
-			if (opdesc->op_get_currentstateid)
-				opdesc->op_get_currentstateid(cstate, &op->u);
-			op->status = opdesc->op_func(rqstp, cstate, &op->u);
-		} else
-			BUG_ON(op->status == nfs_ok);
+		if (opdesc->op_get_currentstateid)
+			opdesc->op_get_currentstateid(cstate, &op->u);
+		op->status = opdesc->op_func(rqstp, cstate, &op->u);
 
 		if (!op->status) {
 			if (opdesc->op_set_currentstateid)

From 9d313b17db965ae42137c5d4dd3063037544c4cd Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 28 Feb 2013 12:51:49 -0800
Subject: [PATCH 08/60] nfsd4: handle seqid-mutating open errors from xdr
 decoding

If a client sets an owner (or group_owner or acl) attribute on open for
create, and the mapping of that owner to an id fails, then we return
BAD_OWNER.  But BAD_OWNER is a seqid-mutating error, so we can't
shortcut the open processing that case: we have to at least look up the
owner so we can find the seqid to bump.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4proc.c | 27 ++++++++++++++++++++++++++-
 fs/nfsd/nfs4xdr.c  |  1 +
 fs/nfsd/xdr4.h     |  1 +
 3 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index c7e4e8c28827..42c498ce9f0e 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -351,6 +351,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	}
 	if (status)
 		goto out;
+	if (open->op_xdr_error) {
+		status = open->op_xdr_error;
+		goto out;
+	}
 
 	status = nfsd4_check_open_attributes(rqstp, cstate, open);
 	if (status)
@@ -416,6 +420,24 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	return status;
 }
 
+/*
+ * OPEN is the only seqid-mutating operation whose decoding can fail
+ * with a seqid-mutating error (specifically, decoding of user names in
+ * the attributes).  Therefore we have to do some processing to look up
+ * the stateowner so that we can bump the seqid.
+ */
+static __be32 nfsd4_open_omfg(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_op *op)
+{
+	struct nfsd4_open *open = (struct nfsd4_open *)&op->u;
+
+	if (!seqid_mutating_err(ntohl(op->status)))
+		return op->status;
+	if (nfsd4_has_session(cstate))
+		return op->status;
+	open->op_xdr_error = op->status;
+	return nfsd4_open(rqstp, cstate, open);
+}
+
 /*
  * filehandle-manipulating ops.
  */
@@ -1244,8 +1266,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 		 * for example, if there is a miscellaneous XDR error
 		 * it will be set to nfserr_bad_xdr.
 		 */
-		if (op->status)
+		if (op->status) {
+			if (op->opnum == OP_OPEN)
+				op->status = nfsd4_open_omfg(rqstp, cstate, op);
 			goto encode_op;
+		}
 
 		/* We must be able to encode a successful response to
 		 * this operation, with enough room left over to encode a
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index a2720071f282..229b3ac246e1 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -804,6 +804,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
 	open->op_iattr.ia_valid = 0;
 	open->op_openowner = NULL;
 
+	open->op_xdr_error = 0;
 	/* seqid, share_access, share_deny, clientid, ownerlen */
 	READ_BUF(4);
 	READ32(open->op_seqid);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 546f8983ecf1..be0a79d1dbcb 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -237,6 +237,7 @@ struct nfsd4_open {
 	u32		op_share_deny;      /* request */
 	u32		op_deleg_want;      /* request */
 	stateid_t	op_stateid;         /* response */
+	__be32		op_xdr_error;       /* see nfsd4_open_omfg() */
 	u32		op_recall;          /* recall */
 	struct nfsd4_change_info  op_cinfo; /* response */
 	u32		op_rflags;          /* response */

From b0a9d3ab577464529f6649ec54f8a0de160866e3 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 7 Mar 2013 17:26:18 -0500
Subject: [PATCH 09/60] nfsd4: fix race on client shutdown

Dropping the session's reference count after the client's means we leave
a window where the session's se_client pointer is NULL.  An xpt_user
callback that encounters such a session may then crash:

[  303.956011] BUG: unable to handle kernel NULL pointer dereference at 0000000000000318
[  303.959061] IP: [<ffffffff81481a8e>] _raw_spin_lock+0x1e/0x40
[  303.959061] PGD 37811067 PUD 3d498067 PMD 0
[  303.959061] Oops: 0002 [#8] PREEMPT SMP
[  303.959061] Modules linked in: md5 nfsd auth_rpcgss nfs_acl snd_hda_intel snd_hda_codec snd_hwdep snd_pcm snd_page_alloc microcode psmouse snd_timer serio_raw pcspkr evdev snd soundcore i2c_piix4 i2c_core intel_agp intel_gtt processor button nfs lockd sunrpc fscache ata_generic pata_acpi ata_piix uhci_hcd libata btrfs usbcore usb_common crc32c scsi_mod libcrc32c zlib_deflate floppy virtio_balloon virtio_net virtio_pci virtio_blk virtio_ring virtio
[  303.959061] CPU 0
[  303.959061] Pid: 264, comm: nfsd Tainted: G      D      3.8.0-ARCH+ #156 Bochs Bochs
[  303.959061] RIP: 0010:[<ffffffff81481a8e>]  [<ffffffff81481a8e>] _raw_spin_lock+0x1e/0x40
[  303.959061] RSP: 0018:ffff880037877dd8  EFLAGS: 00010202
[  303.959061] RAX: 0000000000000100 RBX: ffff880037a2b698 RCX: ffff88003d879278
[  303.959061] RDX: ffff88003d879278 RSI: dead000000100100 RDI: 0000000000000318
[  303.959061] RBP: ffff880037877dd8 R08: ffff88003c5a0f00 R09: 0000000000000002
[  303.959061] R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000000
[  303.959061] R13: 0000000000000318 R14: ffff880037a2b680 R15: ffff88003c1cbe00
[  303.959061] FS:  0000000000000000(0000) GS:ffff88003fc00000(0000) knlGS:0000000000000000
[  303.959061] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[  303.959061] CR2: 0000000000000318 CR3: 000000003d49c000 CR4: 00000000000006f0
[  303.959061] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  303.959061] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[  303.959061] Process nfsd (pid: 264, threadinfo ffff880037876000, task ffff88003c1fd0a0)
[  303.959061] Stack:
[  303.959061]  ffff880037877e08 ffffffffa03772ec ffff88003d879000 ffff88003d879278
[  303.959061]  ffff88003d879080 0000000000000000 ffff880037877e38 ffffffffa0222a1f
[  303.959061]  0000000000107ac0 ffff88003c22e000 ffff88003d879000 ffff88003c1cbe00
[  303.959061] Call Trace:
[  303.959061]  [<ffffffffa03772ec>] nfsd4_conn_lost+0x3c/0xa0 [nfsd]
[  303.959061]  [<ffffffffa0222a1f>] svc_delete_xprt+0x10f/0x180 [sunrpc]
[  303.959061]  [<ffffffffa0223d96>] svc_recv+0xe6/0x580 [sunrpc]
[  303.959061]  [<ffffffffa03587c5>] nfsd+0xb5/0x140 [nfsd]
[  303.959061]  [<ffffffffa0358710>] ? nfsd_destroy+0x90/0x90 [nfsd]
[  303.959061]  [<ffffffff8107ae00>] kthread+0xc0/0xd0
[  303.959061]  [<ffffffff81010000>] ? perf_trace_xen_mmu_set_pte_at+0x50/0x100
[  303.959061]  [<ffffffff8107ad40>] ? kthread_freezable_should_stop+0x70/0x70
[  303.959061]  [<ffffffff814898ec>] ret_from_fork+0x7c/0xb0
[  303.959061]  [<ffffffff8107ad40>] ? kthread_freezable_should_stop+0x70/0x70
[  303.959061] Code: ff ff 5d c3 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 55 65 48 8b 04 25 f0 c6 00 00 48 89 e5 83 80 44 e0 ff ff 01 b8 00 01 00 00 <3e> 66 0f c1 07 0f b6 d4 38 c2 74 0f 66 0f 1f 44 00 00 f3 90 0f
[  303.959061] RIP  [<ffffffff81481a8e>] _raw_spin_lock+0x1e/0x40
[  303.959061]  RSP <ffff880037877dd8>
[  303.959061] CR2: 0000000000000318
[  304.001218] ---[ end trace 2d809cd4a7931f5a ]---
[  304.001903] note: nfsd[264] exited with preempt_count 2

Reported-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 12 ++++++++----
 fs/nfsd/nfs4xdr.c   |  1 -
 fs/nfsd/state.h     |  2 --
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 2e27430b9070..3e5cbfe8a967 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -864,7 +864,7 @@ static void free_session(struct kref *kref)
 	__free_session(ses);
 }
 
-void nfsd4_put_session(struct nfsd4_session *ses)
+static void nfsd4_put_session(struct nfsd4_session *ses)
 {
 	struct nfsd_net *nn = net_generic(ses->se_client->net, nfsd_net_id);
 
@@ -1057,12 +1057,16 @@ release_session_client(struct nfsd4_session *session)
 	struct nfs4_client *clp = session->se_client;
 	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
 
+	nfsd4_put_session(session);
 	if (!atomic_dec_and_lock(&clp->cl_refcount, &nn->client_lock))
 		return;
-	if (is_client_expired(clp)) {
+	/*
+	 * At this point we also know all sessions have refcnt 1,
+	 * so free_client will delete them all if necessary:
+	 */
+	if (is_client_expired(clp))
 		free_client(clp);
-		session->se_client = NULL;
-	} else
+	else
 		renew_client_locked(clp);
 	spin_unlock(&nn->client_lock);
 }
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 229b3ac246e1..9b02b6652f2b 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3685,7 +3685,6 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
 		}
 		/* Renew the clientid on success and on replay */
 		release_session_client(cs->session);
-		nfsd4_put_session(cs->session);
 	}
 	return 1;
 }
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 1a8c7391f7ae..327552bb6dba 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -209,8 +209,6 @@ struct nfsd4_session {
 	struct nfsd4_slot	*se_slots[];	/* forward channel slots */
 };
 
-extern void nfsd4_put_session(struct nfsd4_session *ses);
-
 /* formatted contents of nfs4_sessionid */
 struct nfsd4_sessionid {
 	clientid_t	clientid;

From 2e4b7239a62a0c58664bf0cf73aea951b7e046fc Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Fri, 8 Mar 2013 09:30:43 -0500
Subject: [PATCH 10/60] nfsd4: fix use-after-free of 4.1 client on connection
 loss

Once we drop the lock here there's nothing keeping the client around:
the only lock still held is the xpt_lock on this socket, but this socket
no longer has any connection with the client so there's no way for other
code to know we're still using the client.

The solution is simple: all nfsd4_probe_callback does is set a few
variables and queue some work, so there's no reason we can't just keep
it under the lock.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3e5cbfe8a967..baf314a950b8 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -761,8 +761,8 @@ static void nfsd4_conn_lost(struct svc_xpt_user *u)
 		list_del(&c->cn_persession);
 		free_conn(c);
 	}
-	spin_unlock(&clp->cl_lock);
 	nfsd4_probe_callback(clp);
+	spin_unlock(&clp->cl_lock);
 }
 
 static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags)

From 9c6bdbb8dd58c8de8f36e1deb6b768918c85c249 Mon Sep 17 00:00:00 2001
From: Yanchuan Nian <ycnian@gmail.com>
Date: Mon, 11 Mar 2013 10:43:26 +0800
Subject: [PATCH 11/60] nfsd: remove unused macro in nfsv4

lk_rflags is never used anywhere, and rflags is not defined in struct
nfsd4_lock.

Signed-off-by: Yanchuan Nian <ycnian@gmail.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/xdr4.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index be0a79d1dbcb..40e05e6d2518 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -184,7 +184,6 @@ struct nfsd4_lock {
 #define lk_old_lock_stateid     v.old.lock_stateid
 #define lk_old_lock_seqid       v.old.lock_seqid
 
-#define lk_rflags       u.ok.rflags
 #define lk_resp_stateid u.ok.stateid
 #define lk_denied       u.denied
 

From 491402a7876e91aa491c33f70ed4e86e59f06c8b Mon Sep 17 00:00:00 2001
From: "ycnian@gmail.com" <ycnian@gmail.com>
Date: Mon, 11 Mar 2013 08:46:14 +0800
Subject: [PATCH 12/60] nfsd: fix bug on nfs4 stateid deallocation

NFS4_OO_PURGE_CLOSE is not handled properly. To avoid memory leak, nfs4
stateid which is pointed by oo_last_closed_stid is freed in nfsd4_close(),
but NFS4_OO_PURGE_CLOSE isn't cleared meanwhile. So the stateid released in
THIS close procedure may be freed immediately in the coming encoding function.
Sorry that Signed-off-by was forgotten in last version.

Signed-off-by: Yanchuan Nian <ycnian@gmail.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index baf314a950b8..aac878ecabc4 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3823,6 +3823,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nfsd4_close_open_stateid(stp);
 	release_last_closed_stateid(oo);
+	oo->oo_flags &= ~NFS4_OO_PURGE_CLOSE;
 	oo->oo_last_closed_stid = stp;
 
 	if (list_empty(&oo->oo_owner.so_stateids)) {

From 78389046f733564d5c2c94f0b8d6ff0cdae951d9 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 12 Mar 2013 10:12:37 -0400
Subject: [PATCH 13/60] nfsd4: warn on odd create_session state

This should never happen.

(Note: the comparable case in setclientid_confirm *can* happen, since
updating a client record can result in both confirmed and unconfirmed
records with the same clientid.)

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index aac878ecabc4..ef7c6222b7c8 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1788,6 +1788,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 	nfs4_lock_state();
 	unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn);
 	conf = find_confirmed_client(&cr_ses->clientid, true, nn);
+	WARN_ON_ONCE(conf && unconf);
 
 	if (conf) {
 		cs_slot = &conf->cl_cs_slot;
@@ -2129,6 +2130,7 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
 	nfs4_lock_state();
 	unconf = find_unconfirmed_client(&dc->clientid, true, nn);
 	conf = find_confirmed_client(&dc->clientid, true, nn);
+	WARN_ON_ONCE(conf && unconf);
 
 	if (conf) {
 		clp = conf;

From 0eb6f20aa532b0c16849d627926c2ad3fe2f1cdf Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 12 Mar 2013 17:36:17 -0400
Subject: [PATCH 14/60] nfsd4: STALE_STATEID cleanup

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index ef7c6222b7c8..a0ab6ad7239d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3282,16 +3282,6 @@ static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *s
 	return nfs_ok;
 }
 
-static int
-STALE_STATEID(stateid_t *stateid, struct nfsd_net *nn)
-{
-	if (stateid->si_opaque.so_clid.cl_boot == nn->boot_time)
-		return 0;
-	dprintk("NFSD: stale stateid " STATEID_FMT "!\n",
-		STATEID_VAL(stateid));
-	return 1;
-}
-
 static inline int
 access_permit_read(struct nfs4_ol_stateid *stp)
 {
@@ -3422,19 +3412,20 @@ static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask,
 				   struct nfsd_net *nn)
 {
 	struct nfs4_client *cl;
+	__be32 status;
 
 	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
 		return nfserr_bad_stateid;
-	if (STALE_STATEID(stateid, nn))
+	status = lookup_clientid(&stateid->si_opaque.so_clid, sessions,
+							nn, &cl);
+	if (status == nfserr_stale_clientid)
 		return nfserr_stale_stateid;
-	cl = find_confirmed_client(&stateid->si_opaque.so_clid, sessions, nn);
-	if (!cl)
-		return nfserr_expired;
+	if (status)
+		return status;
 	*s = find_stateid_by_type(cl, stateid, typemask);
 	if (!*s)
 		return nfserr_bad_stateid;
 	return nfs_ok;
-
 }
 
 /*

From 1ca507920db36aea8b81fe1443f96a1a6a43318f Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 14 Mar 2013 18:12:03 -0400
Subject: [PATCH 15/60] nfsd4: remove some dprintk's

E.g. printk's that just report the return value from an op are
uninteresting as we already do that in the main proc_compound loop.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index a0ab6ad7239d..84dfbdfd2d2c 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1843,15 +1843,13 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 	/* cache solo and embedded create sessions under the state lock */
 	nfsd4_cache_create_session(cr_ses, cs_slot, status);
 	nfs4_unlock_state();
-out:
-	dprintk("%s returns %d\n", __func__, ntohl(status));
 	return status;
 out_free_conn:
 	nfs4_unlock_state();
 	free_conn(conn);
 out_free_session:
 	__free_session(new);
-	goto out;
+	return status;
 }
 
 static __be32 nfsd4_map_bcts_dir(u32 *dir)
@@ -1963,7 +1961,6 @@ nfsd4_destroy_session(struct svc_rqst *r,
 	spin_unlock(&nn->client_lock);
 	status = nfs_ok;
 out:
-	dprintk("%s returns %d\n", __func__, ntohl(status));
 	return status;
 }
 
@@ -2116,7 +2113,6 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	}
 	kfree(conn);
 	spin_unlock(&nn->client_lock);
-	dprintk("%s: return %d\n", __func__, ntohl(status));
 	return status;
 }
 
@@ -2155,7 +2151,6 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
 	expire_client(clp);
 out:
 	nfs4_unlock_state();
-	dprintk("%s return %d\n", __func__, ntohl(status));
 	return status;
 }
 
@@ -2532,8 +2527,6 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
 	struct nfs4_ol_stateid *stp;
 	__be32 ret;
 
-	dprintk("NFSD: nfs4_share_conflict\n");
-
 	fp = find_file(ino);
 	if (!fp)
 		return nfs_ok;

From c0293b0131a8d582af85023c684786f7536f0767 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 14 Mar 2013 18:20:01 -0400
Subject: [PATCH 16/60] nfsd4: destroy_clientid simplification

I'm not sure what the check for clientid expiry was meant to do here.

The check for a matching session is redundant given the previous check
for state: a client without state is, in particular, a client without
sessions.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 84dfbdfd2d2c..905a5b511047 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2131,13 +2131,7 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
 	if (conf) {
 		clp = conf;
 
-		if (!is_client_expired(conf) && client_has_state(conf)) {
-			status = nfserr_clientid_busy;
-			goto out;
-		}
-
-		/* rfc5661 18.50.3 */
-		if (cstate->session && conf == cstate->session->se_client) {
+		if (client_has_state(conf)) {
 			status = nfserr_clientid_busy;
 			goto out;
 		}

From bfa85e83a87aec71cbb231256eaad7341aa8b4fa Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 14 Mar 2013 18:24:52 -0400
Subject: [PATCH 17/60] nfsd4: clientid lookup cleanup

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 905a5b511047..c89bb3c40a0b 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1380,12 +1380,12 @@ move_to_confirmed(struct nfs4_client *clp)
 }
 
 static struct nfs4_client *
-find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
+find_client_in_id_table(struct list_head *tbl, clientid_t *clid, bool sessions)
 {
 	struct nfs4_client *clp;
 	unsigned int idhashval = clientid_hashval(clid->cl_id);
 
-	list_for_each_entry(clp, &nn->conf_id_hashtbl[idhashval], cl_idhash) {
+	list_for_each_entry(clp, &tbl[idhashval], cl_idhash) {
 		if (same_clid(&clp->cl_clientid, clid)) {
 			if ((bool)clp->cl_minorversion != sessions)
 				return NULL;
@@ -1396,20 +1396,20 @@ find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
 	return NULL;
 }
 
+static struct nfs4_client *
+find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
+{
+	struct list_head *tbl = nn->conf_id_hashtbl;
+
+	return find_client_in_id_table(tbl, clid, sessions);
+}
+
 static struct nfs4_client *
 find_unconfirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
 {
-	struct nfs4_client *clp;
-	unsigned int idhashval = clientid_hashval(clid->cl_id);
+	struct list_head *tbl = nn->unconf_id_hashtbl;
 
-	list_for_each_entry(clp, &nn->unconf_id_hashtbl[idhashval], cl_idhash) {
-		if (same_clid(&clp->cl_clientid, clid)) {
-			if ((bool)clp->cl_minorversion != sessions)
-				return NULL;
-			return clp;
-		}
-	}
-	return NULL;
+	return find_client_in_id_table(tbl, clid, sessions);
 }
 
 static bool clp_used_exchangeid(struct nfs4_client *clp)

From abcdff09a05117112aa22cd84939039655bca710 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 14 Mar 2013 19:55:33 -0400
Subject: [PATCH 18/60] nfsd4: fix destroy_session race

destroy_session uses the session and client without continuously holding
any reference or locks.

Put the whole thing under the state lock for now.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c89bb3c40a0b..8cc668dc4997 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1926,41 +1926,35 @@ nfsd4_destroy_session(struct svc_rqst *r,
 		      struct nfsd4_destroy_session *sessionid)
 {
 	struct nfsd4_session *ses;
-	__be32 status = nfserr_badsession;
+	__be32 status;
 	struct nfsd_net *nn = net_generic(SVC_NET(r), nfsd_net_id);
 
-	/* Notes:
-	 * - The confirmed nfs4_client->cl_sessionid holds destroyed sessinid
-	 * - Should we return nfserr_back_chan_busy if waiting for
-	 *   callbacks on to-be-destroyed session?
-	 * - Do we need to clear any callback info from previous session?
-	 */
-
+	nfs4_lock_state();
+	status = nfserr_not_only_op;
 	if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) {
 		if (!nfsd4_last_compound_op(r))
-			return nfserr_not_only_op;
+			goto out;
 	}
 	dump_sessionid(__func__, &sessionid->sessionid);
 	spin_lock(&nn->client_lock);
 	ses = find_in_sessionid_hashtbl(&sessionid->sessionid, SVC_NET(r));
-	if (!ses) {
-		spin_unlock(&nn->client_lock);
-		goto out;
-	}
+	status = nfserr_badsession;
+	if (!ses)
+		goto out_client_lock;
 
 	unhash_session(ses);
 	spin_unlock(&nn->client_lock);
 
-	nfs4_lock_state();
 	nfsd4_probe_callback_sync(ses->se_client);
-	nfs4_unlock_state();
 
 	spin_lock(&nn->client_lock);
 	nfsd4_del_conns(ses);
 	nfsd4_put_session_locked(ses);
-	spin_unlock(&nn->client_lock);
 	status = nfs_ok;
+out_client_lock:
+	spin_unlock(&nn->client_lock);
 out:
+	nfs4_unlock_state();
 	return status;
 }
 

From 4f6e6c17733ecf01c05a693ced8349ccf8101fd8 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 18 Mar 2013 17:31:30 -0400
Subject: [PATCH 19/60] nfsd4: simplify bind_conn_to_session locking

The locking here is very fiddly, and there's no reason for us to be
setting cstate->session, since this is the only op in the compound.
Let's just take the state lock and drop the reference counting.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 8cc668dc4997..8e83cef4d0bd 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1887,30 +1887,30 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
 {
 	__be32 status;
 	struct nfsd4_conn *conn;
+	struct nfsd4_session *session;
 	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 
 	if (!nfsd4_last_compound_op(rqstp))
 		return nfserr_not_only_op;
+	nfs4_lock_state();
 	spin_lock(&nn->client_lock);
-	cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid, SVC_NET(rqstp));
-	/* Sorta weird: we only need the refcnt'ing because new_conn acquires
-	 * client_lock iself: */
-	if (cstate->session) {
-		nfsd4_get_session(cstate->session);
-		atomic_inc(&cstate->session->se_client->cl_refcount);
-	}
+	session = find_in_sessionid_hashtbl(&bcts->sessionid, SVC_NET(rqstp));
 	spin_unlock(&nn->client_lock);
-	if (!cstate->session)
-		return nfserr_badsession;
-
+	status = nfserr_badsession;
+	if (!session)
+		goto out;
 	status = nfsd4_map_bcts_dir(&bcts->dir);
 	if (status)
-		return status;
+		goto out;
 	conn = alloc_conn(rqstp, bcts->dir);
+	status = nfserr_jukebox;
 	if (!conn)
-		return nfserr_jukebox;
-	nfsd4_init_conn(rqstp, conn, cstate->session);
-	return nfs_ok;
+		goto out;
+	nfsd4_init_conn(rqstp, conn, session);
+	status = nfs_ok;
+out:
+	nfs4_unlock_state();
+	return status;
 }
 
 static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)

From 221a68766973d7a3afe40a05abd8258b5de016a0 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 1 Apr 2013 22:23:49 -0400
Subject: [PATCH 20/60] nfsd4: don't destroy in-use clients

When a setclientid_confirm or create_session confirms a client after a
client reboot, it also destroys any previous state held by that client.

The shutdown of that previous state must be careful not to free the
client out from under threads processing other requests that refer to
the client.

This is a particular problem in the NFSv4.1 case when we hold a
reference to a session (hence a client) throughout compound processing.

The server attempts to handle this by unhashing the client at the time
it's destroyed, then delaying the final free to the end.  But this still
leaves some races in the current code.

I believe it's simpler just to fail the attempt to destroy the client by
returning NFS4ERR_DELAY.  This is a case that should never happen
anyway.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 209 ++++++++++++++++++++++++++------------------
 fs/nfsd/nfs4xdr.c   |   3 +-
 fs/nfsd/state.h     |  16 +---
 3 files changed, 131 insertions(+), 97 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 8e83cef4d0bd..3b4ce41c9db8 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -113,6 +113,90 @@ nfs4_unlock_state(void)
 	mutex_unlock(&client_mutex);
 }
 
+static bool is_client_expired(struct nfs4_client *clp)
+{
+	return clp->cl_time == 0;
+}
+
+static __be32 mark_client_expired_locked(struct nfs4_client *clp)
+{
+	if (atomic_read(&clp->cl_refcount))
+		return nfserr_jukebox;
+	clp->cl_time = 0;
+	return nfs_ok;
+}
+
+static __be32 mark_client_expired(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+	__be32 ret;
+
+	spin_lock(&nn->client_lock);
+	ret = mark_client_expired_locked(clp);
+	spin_unlock(&nn->client_lock);
+	return ret;
+}
+
+static __be32 get_client_locked(struct nfs4_client *clp)
+{
+	if (is_client_expired(clp))
+		return nfserr_expired;
+	atomic_inc(&clp->cl_refcount);
+	return nfs_ok;
+}
+
+/* must be called under the client_lock */
+static inline void
+renew_client_locked(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	if (is_client_expired(clp)) {
+		WARN_ON(1);
+		printk("%s: client (clientid %08x/%08x) already expired\n",
+			__func__,
+			clp->cl_clientid.cl_boot,
+			clp->cl_clientid.cl_id);
+		return;
+	}
+
+	dprintk("renewing client (clientid %08x/%08x)\n",
+			clp->cl_clientid.cl_boot,
+			clp->cl_clientid.cl_id);
+	list_move_tail(&clp->cl_lru, &nn->client_lru);
+	clp->cl_time = get_seconds();
+}
+
+static inline void
+renew_client(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	spin_lock(&nn->client_lock);
+	renew_client_locked(clp);
+	spin_unlock(&nn->client_lock);
+}
+
+void put_client_renew_locked(struct nfs4_client *clp)
+{
+	if (!atomic_dec_and_test(&clp->cl_refcount))
+		return;
+	if (!is_client_expired(clp))
+		renew_client_locked(clp);
+}
+
+void put_client_renew(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	if (!atomic_dec_and_lock(&clp->cl_refcount, &nn->client_lock))
+		return;
+	if (!is_client_expired(clp))
+		renew_client_locked(clp);
+	spin_unlock(&nn->client_lock);
+}
+
+
 static inline u32
 opaque_hashval(const void *ptr, int nbytes)
 {
@@ -864,7 +948,7 @@ static void free_session(struct kref *kref)
 	__free_session(ses);
 }
 
-static void nfsd4_put_session(struct nfsd4_session *ses)
+void nfsd4_put_session(struct nfsd4_session *ses)
 {
 	struct nfsd_net *nn = net_generic(ses->se_client->net, nfsd_net_id);
 
@@ -968,38 +1052,6 @@ unhash_session(struct nfsd4_session *ses)
 	spin_unlock(&ses->se_client->cl_lock);
 }
 
-/* must be called under the client_lock */
-static inline void
-renew_client_locked(struct nfs4_client *clp)
-{
-	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
-
-	if (is_client_expired(clp)) {
-		WARN_ON(1);
-		printk("%s: client (clientid %08x/%08x) already expired\n",
-			__func__,
-			clp->cl_clientid.cl_boot,
-			clp->cl_clientid.cl_id);
-		return;
-	}
-
-	dprintk("renewing client (clientid %08x/%08x)\n", 
-			clp->cl_clientid.cl_boot, 
-			clp->cl_clientid.cl_id);
-	list_move_tail(&clp->cl_lru, &nn->client_lru);
-	clp->cl_time = get_seconds();
-}
-
-static inline void
-renew_client(struct nfs4_client *clp)
-{
-	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
-
-	spin_lock(&nn->client_lock);
-	renew_client_locked(clp);
-	spin_unlock(&nn->client_lock);
-}
-
 /* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */
 static int
 STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn)
@@ -1051,33 +1103,12 @@ free_client(struct nfs4_client *clp)
 	kfree(clp);
 }
 
-void
-release_session_client(struct nfsd4_session *session)
-{
-	struct nfs4_client *clp = session->se_client;
-	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
-
-	nfsd4_put_session(session);
-	if (!atomic_dec_and_lock(&clp->cl_refcount, &nn->client_lock))
-		return;
-	/*
-	 * At this point we also know all sessions have refcnt 1,
-	 * so free_client will delete them all if necessary:
-	 */
-	if (is_client_expired(clp))
-		free_client(clp);
-	else
-		renew_client_locked(clp);
-	spin_unlock(&nn->client_lock);
-}
-
 /* must be called under the client_lock */
 static inline void
 unhash_client_locked(struct nfs4_client *clp)
 {
 	struct nfsd4_session *ses;
 
-	mark_client_expired(clp);
 	list_del(&clp->cl_lru);
 	spin_lock(&clp->cl_lock);
 	list_for_each_entry(ses, &clp->cl_sessions, se_perclnt)
@@ -1119,8 +1150,8 @@ destroy_client(struct nfs4_client *clp)
 		rb_erase(&clp->cl_namenode, &nn->unconf_name_tree);
 	spin_lock(&nn->client_lock);
 	unhash_client_locked(clp);
-	if (atomic_read(&clp->cl_refcount) == 0)
-		free_client(clp);
+	WARN_ON_ONCE(atomic_read(&clp->cl_refcount));
+	free_client(clp);
 	spin_unlock(&nn->client_lock);
 }
 
@@ -1815,8 +1846,12 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 			goto out_free_conn;
 		}
 		old = find_confirmed_client_by_name(&unconf->cl_name, nn);
-		if (old)
+		if (old) {
+			status = mark_client_expired(old);
+			if (status)
+				goto out_free_conn;
 			expire_client(old);
+		}
 		move_to_confirmed(unconf);
 		conf = unconf;
 	} else {
@@ -2014,6 +2049,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 {
 	struct nfsd4_compoundres *resp = rqstp->rq_resp;
 	struct nfsd4_session *session;
+	struct nfs4_client *clp;
 	struct nfsd4_slot *slot;
 	struct nfsd4_conn *conn;
 	__be32 status;
@@ -2034,19 +2070,23 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	status = nfserr_badsession;
 	session = find_in_sessionid_hashtbl(&seq->sessionid, SVC_NET(rqstp));
 	if (!session)
-		goto out;
+		goto out_no_session;
+	clp = session->se_client;
+	status = get_client_locked(clp);
+	if (status)
+		goto out_no_session;
 
 	status = nfserr_too_many_ops;
 	if (nfsd4_session_too_many_ops(rqstp, session))
-		goto out;
+		goto out_put_client;
 
 	status = nfserr_req_too_big;
 	if (nfsd4_request_too_big(rqstp, session))
-		goto out;
+		goto out_put_client;
 
 	status = nfserr_badslot;
 	if (seq->slotid >= session->se_fchannel.maxreqs)
-		goto out;
+		goto out_put_client;
 
 	slot = session->se_slots[seq->slotid];
 	dprintk("%s: slotid %d\n", __func__, seq->slotid);
@@ -2061,7 +2101,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	if (status == nfserr_replay_cache) {
 		status = nfserr_seq_misordered;
 		if (!(slot->sl_flags & NFSD4_SLOT_INITIALIZED))
-			goto out;
+			goto out_put_client;
 		cstate->slot = slot;
 		cstate->session = session;
 		/* Return the cached reply status and set cstate->status
@@ -2071,7 +2111,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 		goto out;
 	}
 	if (status)
-		goto out;
+		goto out_put_client;
 
 	nfsd4_sequence_check_conn(conn, session);
 	conn = NULL;
@@ -2088,26 +2128,24 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	cstate->session = session;
 
 out:
-	/* Hold a session reference until done processing the compound. */
-	if (cstate->session) {
-		struct nfs4_client *clp = session->se_client;
-
-		nfsd4_get_session(cstate->session);
-		atomic_inc(&clp->cl_refcount);
-		switch (clp->cl_cb_state) {
-		case NFSD4_CB_DOWN:
-			seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN;
-			break;
-		case NFSD4_CB_FAULT:
-			seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT;
-			break;
-		default:
-			seq->status_flags = 0;
-		}
+	nfsd4_get_session(cstate->session);
+	switch (clp->cl_cb_state) {
+	case NFSD4_CB_DOWN:
+		seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN;
+		break;
+	case NFSD4_CB_FAULT:
+		seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT;
+		break;
+	default:
+		seq->status_flags = 0;
 	}
+out_no_session:
 	kfree(conn);
 	spin_unlock(&nn->client_lock);
 	return status;
+out_put_client:
+	put_client_renew_locked(clp);
+	goto out_no_session;
 }
 
 __be32
@@ -2276,8 +2314,12 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 		expire_client(unconf);
 	} else { /* case 3: normal case; new or rebooted client */
 		conf = find_confirmed_client_by_name(&unconf->cl_name, nn);
-		if (conf)
+		if (conf) {
+			status = mark_client_expired(conf);
+			if (status)
+				goto out;
 			expire_client(conf);
+		}
 		move_to_confirmed(unconf);
 		nfsd4_probe_callback(unconf);
 	}
@@ -3189,13 +3231,12 @@ nfs4_laundromat(struct nfsd_net *nn)
 				clientid_val = t;
 			break;
 		}
-		if (atomic_read(&clp->cl_refcount)) {
+		if (mark_client_expired_locked(clp)) {
 			dprintk("NFSD: client in use (clientid %08x)\n",
 				clp->cl_clientid.cl_id);
 			continue;
 		}
-		unhash_client_locked(clp);
-		list_add(&clp->cl_lru, &reaplist);
+		list_move(&clp->cl_lru, &reaplist);
 	}
 	spin_unlock(&nn->client_lock);
 	list_for_each_safe(pos, next, &reaplist) {
@@ -4581,6 +4622,8 @@ nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn)
 
 u64 nfsd_forget_client(struct nfs4_client *clp, u64 max)
 {
+	if (mark_client_expired(clp))
+		return 0;
 	expire_client(clp);
 	return 1;
 }
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 9b02b6652f2b..700de0192834 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3684,7 +3684,8 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
 			cs->slot->sl_flags &= ~NFSD4_SLOT_INUSE;
 		}
 		/* Renew the clientid on success and on replay */
-		release_session_client(cs->session);
+		put_client_renew(cs->session->se_client);
+		nfsd4_put_session(cs->session);
 	}
 	return 1;
 }
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 327552bb6dba..07f8a822a6ce 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -209,6 +209,8 @@ struct nfsd4_session {
 	struct nfsd4_slot	*se_slots[];	/* forward channel slots */
 };
 
+extern void nfsd4_put_session(struct nfsd4_session *ses);
+
 /* formatted contents of nfs4_sessionid */
 struct nfsd4_sessionid {
 	clientid_t	clientid;
@@ -284,18 +286,6 @@ struct nfs4_client {
 	struct net		*net;
 };
 
-static inline void
-mark_client_expired(struct nfs4_client *clp)
-{
-	clp->cl_time = 0;
-}
-
-static inline bool
-is_client_expired(struct nfs4_client *clp)
-{
-	return clp->cl_time == 0;
-}
-
 /* struct nfs4_client_reset
  * one per old client. Populates reset_str_hashtbl. Filled from conf_id_hashtbl
  * upon lease reset, or from upcall to state_daemon (to read in state
@@ -484,7 +474,7 @@ extern void nfs4_put_delegation(struct nfs4_delegation *dp);
 extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
 							struct nfsd_net *nn);
 extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
-extern void release_session_client(struct nfsd4_session *);
+extern void put_client_renew(struct nfs4_client *clp);
 extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *);
 
 /* nfs4recover operations */

From 66b2b9b2b0e8a9034806293a436628400a44a71d Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 19 Mar 2013 12:05:39 -0400
Subject: [PATCH 21/60] nfsd4: don't destroy in-use session

This changes session destruction to be similar to client destruction in
that attempts to destroy a session while in use (which should be rare
corner cases) result in DELAY.  This simplifies things somewhat and
helps meet a coming 4.2 requirement.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 72 +++++++++++++++++++++++++--------------------
 fs/nfsd/state.h     |  4 ++-
 2 files changed, 43 insertions(+), 33 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3b4ce41c9db8..2fd015587167 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -94,17 +94,32 @@ nfs4_lock_state(void)
 	mutex_lock(&client_mutex);
 }
 
-static void free_session(struct kref *);
+static void free_session(struct nfsd4_session *);
 
-/* Must be called under the client_lock */
-static void nfsd4_put_session_locked(struct nfsd4_session *ses)
+void nfsd4_put_session(struct nfsd4_session *ses)
 {
-	kref_put(&ses->se_ref, free_session);
+	atomic_dec(&ses->se_ref);
 }
 
-static void nfsd4_get_session(struct nfsd4_session *ses)
+static bool is_session_dead(struct nfsd4_session *ses)
 {
-	kref_get(&ses->se_ref);
+	return ses->se_flags & NFS4_SESSION_DEAD;
+}
+
+static __be32 mark_session_dead_locked(struct nfsd4_session *ses)
+{
+	if (atomic_read(&ses->se_ref))
+		return nfserr_jukebox;
+	ses->se_flags |= NFS4_SESSION_DEAD;
+	return nfs_ok;
+}
+
+static __be32 nfsd4_get_session_locked(struct nfsd4_session *ses)
+{
+	if (is_session_dead(ses))
+		return nfserr_badsession;
+	atomic_inc(&ses->se_ref);
+	return nfs_ok;
 }
 
 void
@@ -935,28 +950,15 @@ static void __free_session(struct nfsd4_session *ses)
 	kfree(ses);
 }
 
-static void free_session(struct kref *kref)
+static void free_session(struct nfsd4_session *ses)
 {
-	struct nfsd4_session *ses;
-	struct nfsd_net *nn;
-
-	ses = container_of(kref, struct nfsd4_session, se_ref);
-	nn = net_generic(ses->se_client->net, nfsd_net_id);
+	struct nfsd_net *nn = net_generic(ses->se_client->net, nfsd_net_id);
 
 	lockdep_assert_held(&nn->client_lock);
 	nfsd4_del_conns(ses);
 	__free_session(ses);
 }
 
-void nfsd4_put_session(struct nfsd4_session *ses)
-{
-	struct nfsd_net *nn = net_generic(ses->se_client->net, nfsd_net_id);
-
-	spin_lock(&nn->client_lock);
-	nfsd4_put_session_locked(ses);
-	spin_unlock(&nn->client_lock);
-}
-
 static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan,
 					   struct nfsd_net *nn)
 {
@@ -997,7 +999,7 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
 	new->se_flags = cses->flags;
 	new->se_cb_prog = cses->callback_prog;
 	new->se_cb_sec = cses->cb_sec;
-	kref_init(&new->se_ref);
+	atomic_set(&new->se_ref, 0);
 	idx = hash_sessionid(&new->se_sessionid);
 	spin_lock(&nn->client_lock);
 	list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
@@ -1095,7 +1097,8 @@ free_client(struct nfs4_client *clp)
 		ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
 				se_perclnt);
 		list_del(&ses->se_perclnt);
-		nfsd4_put_session_locked(ses);
+		WARN_ON_ONCE(atomic_read(&ses->se_ref));
+		free_session(ses);
 	}
 	free_svc_cred(&clp->cl_cred);
 	kfree(clp->cl_name.data);
@@ -1976,15 +1979,16 @@ nfsd4_destroy_session(struct svc_rqst *r,
 	status = nfserr_badsession;
 	if (!ses)
 		goto out_client_lock;
-
+	status = mark_session_dead_locked(ses);
+	if (status)
+		goto out_client_lock;
 	unhash_session(ses);
 	spin_unlock(&nn->client_lock);
 
 	nfsd4_probe_callback_sync(ses->se_client);
 
 	spin_lock(&nn->client_lock);
-	nfsd4_del_conns(ses);
-	nfsd4_put_session_locked(ses);
+	free_session(ses);
 	status = nfs_ok;
 out_client_lock:
 	spin_unlock(&nn->client_lock);
@@ -2075,18 +2079,21 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	status = get_client_locked(clp);
 	if (status)
 		goto out_no_session;
+	status = nfsd4_get_session_locked(session);
+	if (status)
+		goto out_put_client;
 
 	status = nfserr_too_many_ops;
 	if (nfsd4_session_too_many_ops(rqstp, session))
-		goto out_put_client;
+		goto out_put_session;
 
 	status = nfserr_req_too_big;
 	if (nfsd4_request_too_big(rqstp, session))
-		goto out_put_client;
+		goto out_put_session;
 
 	status = nfserr_badslot;
 	if (seq->slotid >= session->se_fchannel.maxreqs)
-		goto out_put_client;
+		goto out_put_session;
 
 	slot = session->se_slots[seq->slotid];
 	dprintk("%s: slotid %d\n", __func__, seq->slotid);
@@ -2101,7 +2108,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	if (status == nfserr_replay_cache) {
 		status = nfserr_seq_misordered;
 		if (!(slot->sl_flags & NFSD4_SLOT_INITIALIZED))
-			goto out_put_client;
+			goto out_put_session;
 		cstate->slot = slot;
 		cstate->session = session;
 		/* Return the cached reply status and set cstate->status
@@ -2111,7 +2118,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 		goto out;
 	}
 	if (status)
-		goto out_put_client;
+		goto out_put_session;
 
 	nfsd4_sequence_check_conn(conn, session);
 	conn = NULL;
@@ -2128,7 +2135,6 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	cstate->session = session;
 
 out:
-	nfsd4_get_session(cstate->session);
 	switch (clp->cl_cb_state) {
 	case NFSD4_CB_DOWN:
 		seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN;
@@ -2143,6 +2149,8 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	kfree(conn);
 	spin_unlock(&nn->client_lock);
 	return status;
+out_put_session:
+	nfsd4_put_session(session);
 out_put_client:
 	put_client_renew_locked(clp);
 	goto out_no_session;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 07f8a822a6ce..f6ae4db3efdb 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -194,9 +194,11 @@ struct nfsd4_conn {
 };
 
 struct nfsd4_session {
-	struct kref		se_ref;
+	atomic_t		se_ref;
 	struct list_head	se_hash;	/* hash by sessionid */
 	struct list_head	se_perclnt;
+/* See SESSION4_PERSIST, etc. for standard flags; this is internal-only: */
+#define NFS4_SESSION_DEAD	0x010
 	u32			se_flags;
 	struct nfs4_client	*se_client;
 	struct nfs4_sessionid	se_sessionid;

From 89876f8c0dbcc2947b13b9e22cf28c5308cee3c1 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 2 Apr 2013 09:01:59 -0400
Subject: [PATCH 22/60] nfsd: convert the file_hashtbl to a hlist

We only ever traverse the hash chains in the forward direction, so a
double pointer list head isn't really necessary.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 14 ++++----------
 fs/nfsd/state.h     |  2 +-
 2 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 2fd015587167..7293e298aeed 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -236,7 +236,7 @@ static inline void
 put_nfs4_file(struct nfs4_file *fi)
 {
 	if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) {
-		list_del(&fi->fi_hash);
+		hlist_del(&fi->fi_hash);
 		spin_unlock(&recall_lock);
 		iput(fi->fi_inode);
 		nfsd4_free_file(fi);
@@ -280,7 +280,7 @@ static unsigned int file_hashval(struct inode *ino)
 	return hash_ptr(ino, FILE_HASH_BITS);
 }
 
-static struct list_head file_hashtbl[FILE_HASH_SIZE];
+static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
 
 static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag)
 {
@@ -2347,7 +2347,6 @@ static void nfsd4_init_file(struct nfs4_file *fp, struct inode *ino)
 	unsigned int hashval = file_hashval(ino);
 
 	atomic_set(&fp->fi_ref, 1);
-	INIT_LIST_HEAD(&fp->fi_hash);
 	INIT_LIST_HEAD(&fp->fi_stateids);
 	INIT_LIST_HEAD(&fp->fi_delegations);
 	fp->fi_inode = igrab(ino);
@@ -2356,7 +2355,7 @@ static void nfsd4_init_file(struct nfs4_file *fp, struct inode *ino)
 	memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
 	memset(fp->fi_access, 0, sizeof(fp->fi_access));
 	spin_lock(&recall_lock);
-	list_add(&fp->fi_hash, &file_hashtbl[hashval]);
+	hlist_add_head(&fp->fi_hash, &file_hashtbl[hashval]);
 	spin_unlock(&recall_lock);
 }
 
@@ -2542,7 +2541,7 @@ find_file(struct inode *ino)
 	struct nfs4_file *fp;
 
 	spin_lock(&recall_lock);
-	list_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) {
+	hlist_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) {
 		if (fp->fi_inode == ino) {
 			get_nfs4_file(fp);
 			spin_unlock(&recall_lock);
@@ -4810,11 +4809,6 @@ struct nfs4_client *nfsd_find_client(struct sockaddr_storage *addr, size_t addr_
 void
 nfs4_state_init(void)
 {
-	int i;
-
-	for (i = 0; i < FILE_HASH_SIZE; i++) {
-		INIT_LIST_HEAD(&file_hashtbl[i]);
-	}
 	INIT_LIST_HEAD(&del_recall_lru);
 }
 
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index f6ae4db3efdb..7674bc806200 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -380,7 +380,7 @@ static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so)
 /* nfs4_file: a file opened by some number of (open) nfs4_stateowners. */
 struct nfs4_file {
 	atomic_t		fi_ref;
-	struct list_head        fi_hash;    /* hash by "struct inode *" */
+	struct hlist_node       fi_hash;    /* hash by "struct inode *" */
 	struct list_head        fi_stateids;
 	struct list_head	fi_delegations;
 	/* One each for O_RDONLY, O_WRONLY, O_RDWR: */

From b022032e195ffca83d7002d6b84297d796ed443b Mon Sep 17 00:00:00 2001
From: fanchaoting <fanchaoting@cn.fujitsu.com>
Date: Mon, 1 Apr 2013 21:07:22 +0800
Subject: [PATCH 23/60] nfsd: don't run get_file if nfs4_preprocess_stateid_op
 return error

we should return error status directly when nfs4_preprocess_stateid_op
return error.

Signed-off-by: fanchaoting <fanchaoting@cn.fujitsu.com>
Cc: stable@vger.kernel.org
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4proc.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 42c498ce9f0e..a9b707b23858 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -953,14 +953,14 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	nfs4_lock_state();
 	status = nfs4_preprocess_stateid_op(SVC_NET(rqstp),
 					cstate, stateid, WR_STATE, &filp);
-	if (filp)
-		get_file(filp);
-	nfs4_unlock_state();
-
 	if (status) {
+		nfs4_unlock_state();
 		dprintk("NFSD: nfsd4_write: couldn't process stateid!\n");
 		return status;
 	}
+	if (filp)
+		get_file(filp);
+	nfs4_unlock_state();
 
 	cnt = write->wr_buflen;
 	write->wr_how_written = write->wr_stable_how;

From ff7c4b3693cbc7e938f49ed89e2f649a33f03ed1 Mon Sep 17 00:00:00 2001
From: fanchaoting <fanchaoting@cn.fujitsu.com>
Date: Wed, 27 Mar 2013 16:31:18 +0800
Subject: [PATCH 24/60] nfsd: remove /proc/fs/nfs when create
 /proc/fs/nfs/exports error

when create /proc/fs/nfs/exports error, we should remove /proc/fs/nfs,
if don't do it, it maybe cause Memory leak.

 Signed-off-by: fanchaoting <fanchaoting@cn.fujitsu.com>
 Reviewed-by: chendt.fnst <chendt.fnst@cn.fujitsu.com>

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfsctl.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index a830f33df3ef..68a4d320cd14 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1111,8 +1111,10 @@ static int create_proc_exports_entry(void)
 		return -ENOMEM;
 	entry = proc_create("exports", 0, entry,
 				 &exports_proc_operations);
-	if (!entry)
+	if (!entry) {
+		remove_proc_entry("fs/nfs", NULL);
 		return -ENOMEM;
+	}
 	return 0;
 }
 #else /* CONFIG_PROC_FS */

From a7823c797b4d29727499f0a75ddf3ed07af8ad0a Mon Sep 17 00:00:00 2001
From: Alexey Khoroshilov <khoroshilov@ispras.ru>
Date: Sat, 23 Mar 2013 00:36:44 +0400
Subject: [PATCH 25/60] SUNRPC/cache: add module_put() on error path in
 cache_open()

If kmalloc() fails in cache_open(), module cd->owner left locked.
The patch adds module_put(cd->owner) on this path.

Found by Linux Driver Verification project (linuxtesting.org).

Signed-off-by: Alexey Khoroshilov <khoroshilov@ispras.ru>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 net/sunrpc/cache.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 25d58e766014..1d3c5144a331 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -986,8 +986,10 @@ static int cache_open(struct inode *inode, struct file *filp,
 	nonseekable_open(inode, filp);
 	if (filp->f_mode & FMODE_READ) {
 		rp = kmalloc(sizeof(*rp), GFP_KERNEL);
-		if (!rp)
+		if (!rp) {
+			module_put(cd->owner);
 			return -ENOMEM;
+		}
 		rp->offset = 0;
 		rp->q.reader = 1;
 		atomic_inc(&cd->readers);

From 8be2d2344cc192c20d7b2aa3211a5b74082e47d4 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 21 Mar 2013 10:59:29 -0400
Subject: [PATCH 26/60] nfsd4: minor cb_recall simplification

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4callback.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 99bc85ff0217..be3ff0f3ff68 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -817,8 +817,7 @@ static bool nfsd41_cb_get_slot(struct nfs4_client *clp, struct rpc_task *task)
 static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfsd4_callback *cb = calldata;
-	struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
-	struct nfs4_client *clp = dp->dl_stid.sc_client;
+	struct nfs4_client *clp = cb->cb_clp;
 	u32 minorversion = clp->cl_minorversion;
 
 	cb->cb_minorversion = minorversion;
@@ -839,8 +838,7 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
 static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
 {
 	struct nfsd4_callback *cb = calldata;
-	struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
-	struct nfs4_client *clp = dp->dl_stid.sc_client;
+	struct nfs4_client *clp = cb->cb_clp;
 
 	dprintk("%s: minorversion=%d\n", __func__,
 		clp->cl_minorversion);
@@ -863,7 +861,7 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
 {
 	struct nfsd4_callback *cb = calldata;
 	struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
-	struct nfs4_client *clp = dp->dl_stid.sc_client;
+	struct nfs4_client *clp = cb->cb_clp;
 	struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
 
 	nfsd4_cb_done(task, calldata);

From 68a3396178e6688ad7367202cdf0af8ed03c8727 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 21 Mar 2013 11:21:50 -0400
Subject: [PATCH 27/60] nfsd4: shut down more of delegation earlier

Once we've unhashed the delegation, it's only hanging around for the
benefit of an oustanding recall, which only needs the encoded
filehandle, stateid, and dl_retries counter.  No point keeping the file
around any longer, or keeping it hashed.

This also fixes a race: calls to idr_remove should really be serialized
by the caller, but the nfs4_put_delegation call from the callback code
isn't taking the state lock.

(Better might be to cancel the callback before destroying the
delegation, and remove any need for reference counting--but I don't see
an easy way to cancel an rpc call.)

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 7293e298aeed..26a03fa6840a 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -417,21 +417,18 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv
 	return dp;
 }
 
-static void free_stid(struct nfs4_stid *s, struct kmem_cache *slab)
+static void remove_stid(struct nfs4_stid *s)
 {
 	struct idr *stateids = &s->sc_client->cl_stateids;
 
 	idr_remove(stateids, s->sc_stateid.si_opaque.so_id);
-	kmem_cache_free(slab, s);
 }
 
 void
 nfs4_put_delegation(struct nfs4_delegation *dp)
 {
 	if (atomic_dec_and_test(&dp->dl_count)) {
-		dprintk("NFSD: freeing dp %p\n",dp);
-		put_nfs4_file(dp->dl_file);
-		free_stid(&dp->dl_stid, deleg_slab);
+		kmem_cache_free(deleg_slab, dp);
 		num_delegations--;
 	}
 }
@@ -462,6 +459,9 @@ unhash_delegation(struct nfs4_delegation *dp)
 	list_del_init(&dp->dl_recall_lru);
 	spin_unlock(&recall_lock);
 	nfs4_put_deleg_lease(dp->dl_file);
+	put_nfs4_file(dp->dl_file);
+	dp->dl_file = NULL;
+	remove_stid(&dp->dl_stid);
 	nfs4_put_delegation(dp);
 }
 
@@ -605,7 +605,8 @@ static void close_generic_stateid(struct nfs4_ol_stateid *stp)
 
 static void free_generic_stateid(struct nfs4_ol_stateid *stp)
 {
-	free_stid(&stp->st_stid, stateid_slab);
+	remove_stid(&stp->st_stid);
+	kmem_cache_free(stateid_slab, stp);
 }
 
 static void release_lock_stateid(struct nfs4_ol_stateid *stp)

From e8c69d17d1ef8437aee729322db005573a467fd6 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 21 Mar 2013 15:19:33 -0400
Subject: [PATCH 28/60] nfsd4: make del_recall_lru per-network-namespace

If nothing else this simplifies the nfs4_state_shutdown_net logic a tad.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/netns.h     |  1 +
 fs/nfsd/nfs4state.c | 15 +++++++--------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 1051bebff1b0..849a7c3ced22 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -80,6 +80,7 @@ struct nfsd_net {
 	 */
 	struct list_head client_lru;
 	struct list_head close_lru;
+	struct list_head del_recall_lru;
 
 	struct delayed_work laundromat_work;
 
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 26a03fa6840a..aae93045ce6b 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -225,8 +225,6 @@ opaque_hashval(const void *ptr, int nbytes)
 	return x;
 }
 
-static struct list_head del_recall_lru;
-
 static void nfsd4_free_file(struct nfs4_file *f)
 {
 	kmem_cache_free(file_slab, f);
@@ -2583,6 +2581,9 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
 
 static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
 {
+	struct nfs4_client *clp = dp->dl_stid.sc_client;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
 	/* We're assuming the state code never drops its reference
 	 * without first removing the lease.  Since we're in this lease
 	 * callback (and since the lease code is serialized by the kernel
@@ -2590,7 +2591,7 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
 	 * it's safe to take a reference: */
 	atomic_inc(&dp->dl_count);
 
-	list_add_tail(&dp->dl_recall_lru, &del_recall_lru);
+	list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru);
 
 	/* only place dl_time is set. protected by lock_flocks*/
 	dp->dl_time = get_seconds();
@@ -3254,7 +3255,7 @@ nfs4_laundromat(struct nfsd_net *nn)
 		expire_client(clp);
 	}
 	spin_lock(&recall_lock);
-	list_for_each_safe(pos, next, &del_recall_lru) {
+	list_for_each_safe(pos, next, &nn->del_recall_lru) {
 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
 		if (net_generic(dp->dl_stid.sc_client->net, nfsd_net_id) != nn)
 			continue;
@@ -4810,7 +4811,6 @@ struct nfs4_client *nfsd_find_client(struct sockaddr_storage *addr, size_t addr_
 void
 nfs4_state_init(void)
 {
-	INIT_LIST_HEAD(&del_recall_lru);
 }
 
 /*
@@ -4874,6 +4874,7 @@ static int nfs4_state_create_net(struct net *net)
 	nn->unconf_name_tree = RB_ROOT;
 	INIT_LIST_HEAD(&nn->client_lru);
 	INIT_LIST_HEAD(&nn->close_lru);
+	INIT_LIST_HEAD(&nn->del_recall_lru);
 	spin_lock_init(&nn->client_lock);
 
 	INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main);
@@ -4986,10 +4987,8 @@ nfs4_state_shutdown_net(struct net *net)
 
 	INIT_LIST_HEAD(&reaplist);
 	spin_lock(&recall_lock);
-	list_for_each_safe(pos, next, &del_recall_lru) {
+	list_for_each_safe(pos, next, &nn->del_recall_lru) {
 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
-		if (dp->dl_stid.sc_client->net != net)
-			continue;
 		list_move(&dp->dl_recall_lru, &reaplist);
 	}
 	spin_unlock(&recall_lock);

From 41d22663cb6a4108091c050cba3c470a3e175dd9 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 21 Mar 2013 15:49:47 -0400
Subject: [PATCH 29/60] nfsd4: remove unused nfs4_check_deleg argument

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index aae93045ce6b..795b24d82d18 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2737,7 +2737,7 @@ static bool nfsd4_is_deleg_cur(struct nfsd4_open *open)
 }
 
 static __be32
-nfs4_check_deleg(struct nfs4_client *cl, struct nfs4_file *fp, struct nfsd4_open *open,
+nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open,
 		struct nfs4_delegation **dp)
 {
 	int flags;
@@ -3062,7 +3062,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	if (fp) {
 		if ((status = nfs4_check_open(fp, open, &stp)))
 			goto out;
-		status = nfs4_check_deleg(cl, fp, open, &dp);
+		status = nfs4_check_deleg(cl, open, &dp);
 		if (status)
 			goto out;
 	} else {

From 9411b1d4c7df26dca6bc6261b5dc87a5b4c81e5c Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 1 Apr 2013 16:37:12 -0400
Subject: [PATCH 30/60] nfsd4: cleanup handling of nfsv4.0 closed stateid's

Closed stateid's are kept around a little while to handle close replays
in the 4.0 case.  So we stash them in the last-used stateid in the
oo_last_closed_stateid field of the open owner.  We can free that in
encode_seqid_op_tail once the seqid on the open owner is next
incremented.  But we don't want to do that on the close itself; so we
set NFS4_OO_PURGE_CLOSE flag set on the open owner, skip freeing it the
first time through encode_seqid_op_tail, then when we see that flag set
next time we free it.

This is unnecessarily baroque.

Instead, just move the logic that increments the seqid out of the xdr
code and into the operation code itself.

The justification given for the current placement is that we need to
wait till the last minute to be sure we know whether the status is a
sequence-id-mutating error or not, but examination of the code shows
that can't actually happen.

Reported-by: Yanchuan Nian <ycnian@gmail.com>
Tested-by: Yanchuan Nian <ycnian@gmail.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4proc.c  |  3 ++-
 fs/nfsd/nfs4state.c | 57 ++++++++++++++++++++++++++-------------------
 fs/nfsd/nfs4xdr.c   | 34 +++++----------------------
 fs/nfsd/state.h     |  4 +---
 fs/nfsd/xdr4.h      |  1 +
 5 files changed, 43 insertions(+), 56 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index a9b707b23858..609e1e211330 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -415,7 +415,8 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	nfsd4_cleanup_open_state(open, status);
 	if (open->op_openowner)
 		cstate->replay_owner = &open->op_openowner->oo_owner;
-	else
+	nfsd4_bump_seqid(cstate, status);
+	if (!cstate->replay_owner)
 		nfs4_unlock_state();
 	return status;
 }
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 795b24d82d18..bcd2339ae8c1 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -720,6 +720,28 @@ dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid)
 }
 #endif
 
+/*
+ * Bump the seqid on cstate->replay_owner, and clear replay_owner if it
+ * won't be used for replay.
+ */
+void nfsd4_bump_seqid(struct nfsd4_compound_state *cstate, __be32 nfserr)
+{
+	struct nfs4_stateowner *so = cstate->replay_owner;
+
+	if (nfserr == nfserr_replay_me)
+		return;
+
+	if (!seqid_mutating_err(ntohl(nfserr))) {
+		cstate->replay_owner = NULL;
+		return;
+	}
+	if (!so)
+		return;
+	if (so->so_is_open_owner)
+		release_last_closed_stateid(openowner(so));
+	so->so_seqid++;
+	return;
+}
 
 static void
 gen_sessionid(struct nfsd4_session *ses)
@@ -3702,6 +3724,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	nfsd4_client_record_create(oo->oo_owner.so_client);
 	status = nfs_ok;
 out:
+	nfsd4_bump_seqid(cstate, status);
 	if (!cstate->replay_owner)
 		nfs4_unlock_state();
 	return status;
@@ -3785,31 +3808,12 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
 	memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
 	status = nfs_ok;
 out:
+	nfsd4_bump_seqid(cstate, status);
 	if (!cstate->replay_owner)
 		nfs4_unlock_state();
 	return status;
 }
 
-void nfsd4_purge_closed_stateid(struct nfs4_stateowner *so)
-{
-	struct nfs4_openowner *oo;
-	struct nfs4_ol_stateid *s;
-
-	if (!so->so_is_open_owner)
-		return;
-	oo = openowner(so);
-	s = oo->oo_last_closed_stid;
-	if (!s)
-		return;
-	if (!(oo->oo_flags & NFS4_OO_PURGE_CLOSE)) {
-		/* Release the last_closed_stid on the next seqid bump: */
-		oo->oo_flags |= NFS4_OO_PURGE_CLOSE;
-		return;
-	}
-	oo->oo_flags &= ~NFS4_OO_PURGE_CLOSE;
-	release_last_closed_stateid(oo);
-}
-
 static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
 {
 	unhash_open_stateid(s);
@@ -3838,17 +3842,20 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 					&close->cl_stateid,
 					NFS4_OPEN_STID|NFS4_CLOSED_STID,
 					&stp, nn);
+	nfsd4_bump_seqid(cstate, status);
 	if (status)
 		goto out; 
 	oo = openowner(stp->st_stateowner);
-	status = nfs_ok;
 	update_stateid(&stp->st_stid.sc_stateid);
 	memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
 
 	nfsd4_close_open_stateid(stp);
-	release_last_closed_stateid(oo);
-	oo->oo_flags &= ~NFS4_OO_PURGE_CLOSE;
-	oo->oo_last_closed_stid = stp;
+
+	if (cstate->minorversion) {
+		unhash_stid(&stp->st_stid);
+		free_generic_stateid(stp);
+	} else
+		oo->oo_last_closed_stid = stp;
 
 	if (list_empty(&oo->oo_owner.so_stateids)) {
 		if (cstate->minorversion) {
@@ -4270,6 +4277,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 out:
 	if (status && new_state)
 		release_lockowner(lock_sop);
+	nfsd4_bump_seqid(cstate, status);
 	if (!cstate->replay_owner)
 		nfs4_unlock_state();
 	if (file_lock)
@@ -4439,6 +4447,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
 
 out:
+	nfsd4_bump_seqid(cstate, status);
 	if (!cstate->replay_owner)
 		nfs4_unlock_state();
 	if (file_lock)
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 700de0192834..a5e8a6424843 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1701,28 +1701,6 @@ static void write_cinfo(__be32 **p, struct nfsd4_change_info *c)
 								\
 	save = resp->p;
 
-/*
- * Routine for encoding the result of a "seqid-mutating" NFSv4 operation.  This
- * is where sequence id's are incremented, and the replay cache is filled.
- * Note that we increment sequence id's here, at the last moment, so we're sure
- * we know whether the error to be returned is a sequence id mutating error.
- */
-
-static void encode_seqid_op_tail(struct nfsd4_compoundres *resp, __be32 *save, __be32 nfserr)
-{
-	struct nfs4_stateowner *stateowner = resp->cstate.replay_owner;
-
-	if (seqid_mutating_err(ntohl(nfserr)) && stateowner) {
-		stateowner->so_seqid++;
-		stateowner->so_replay.rp_status = nfserr;
-		stateowner->so_replay.rp_buflen =
-			  (char *)resp->p - (char *)save;
-		memcpy(stateowner->so_replay.rp_buf, save,
-			stateowner->so_replay.rp_buflen);
-		nfsd4_purge_closed_stateid(stateowner);
-	}
-}
-
 /* Encode as an array of strings the string given with components
  * separated @sep, escaped with esc_enter and esc_exit.
  */
@@ -2667,7 +2645,6 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c
 	if (!nfserr)
 		nfsd4_encode_stateid(resp, &close->cl_stateid);
 
-	encode_seqid_op_tail(resp, save, nfserr);
 	return nfserr;
 }
 
@@ -2770,7 +2747,6 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo
 	else if (nfserr == nfserr_denied)
 		nfsd4_encode_lock_denied(resp, &lock->lk_denied);
 
-	encode_seqid_op_tail(resp, save, nfserr);
 	return nfserr;
 }
 
@@ -2790,7 +2766,6 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l
 	if (!nfserr)
 		nfsd4_encode_stateid(resp, &locku->lu_stateid);
 
-	encode_seqid_op_tail(resp, save, nfserr);
 	return nfserr;
 }
 
@@ -2885,7 +2860,6 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
 	}
 	/* XXX save filehandle here */
 out:
-	encode_seqid_op_tail(resp, save, nfserr);
 	return nfserr;
 }
 
@@ -2897,7 +2871,6 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct
 	if (!nfserr)
 		nfsd4_encode_stateid(resp, &oc->oc_resp_stateid);
 
-	encode_seqid_op_tail(resp, save, nfserr);
 	return nfserr;
 }
 
@@ -2909,7 +2882,6 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struc
 	if (!nfserr)
 		nfsd4_encode_stateid(resp, &od->od_stateid);
 
-	encode_seqid_op_tail(resp, save, nfserr);
 	return nfserr;
 }
 
@@ -3567,6 +3539,7 @@ __be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad)
 void
 nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
 {
+	struct nfs4_stateowner *so = resp->cstate.replay_owner;
 	__be32 *statp;
 	__be32 *p;
 
@@ -3583,6 +3556,11 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
 	/* nfsd4_check_drc_limit guarantees enough room for error status */
 	if (!op->status)
 		op->status = nfsd4_check_resp_size(resp, 0);
+	if (so) {
+		so->so_replay.rp_status = op->status;
+		so->so_replay.rp_buflen = (char *)resp->p - (char *)(statp+1);
+		memcpy(so->so_replay.rp_buf, statp+1, so->so_replay.rp_buflen);
+	}
 status:
 	/*
 	 * Note: We write the status directly, instead of using WRITE32(),
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 7674bc806200..13ec4853e9af 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -355,7 +355,6 @@ struct nfs4_openowner {
 	struct nfs4_ol_stateid *oo_last_closed_stid;
 	time_t			oo_time; /* time of placement on so_close_lru */
 #define NFS4_OO_CONFIRMED   1
-#define NFS4_OO_PURGE_CLOSE 2
 #define NFS4_OO_NEW         4
 	unsigned char		oo_flags;
 };
@@ -363,7 +362,7 @@ struct nfs4_openowner {
 struct nfs4_lockowner {
 	struct nfs4_stateowner	lo_owner; /* must be first element */
 	struct list_head	lo_owner_ino_hash; /* hash by owner,file */
-	struct list_head        lo_perstateid; /* for lockowners only */
+	struct list_head        lo_perstateid;
 	struct list_head	lo_list; /* for temporary uses */
 };
 
@@ -477,7 +476,6 @@ extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
 							struct nfsd_net *nn);
 extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
 extern void put_client_renew(struct nfs4_client *clp);
-extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *);
 
 /* nfs4recover operations */
 extern int nfsd4_client_tracking_init(struct net *net);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 40e05e6d2518..3b271d2092b6 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -623,6 +623,7 @@ extern __be32 nfsd4_test_stateid(struct svc_rqst *rqstp,
 		struct nfsd4_compound_state *, struct nfsd4_test_stateid *test_stateid);
 extern __be32 nfsd4_free_stateid(struct svc_rqst *rqstp,
 		struct nfsd4_compound_state *, struct nfsd4_free_stateid *free_stateid);
+extern void nfsd4_bump_seqid(struct nfsd4_compound_state *, __be32 nfserr);
 #endif
 
 /*

From 2c44a23471d048118e49b616d08df0729cdbd9f1 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Tue, 9 Apr 2013 14:15:31 +0800
Subject: [PATCH 31/60] nfsd: use kmem_cache_free() instead of kfree()

memory allocated by kmem_cache_alloc() should be freed using
kmem_cache_free(), not kfree().

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Cc: stable@kernel.org
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index bcd2339ae8c1..9cb9f6e3f5f2 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -364,7 +364,7 @@ kmem_cache *slab)
 		min_stateid = 0;
 	return stid;
 out_free:
-	kfree(stid);
+	kmem_cache_free(slab, stid);
 	return NULL;
 }
 

From c383747ef674467d02dd9c9320a47de2067b0ce3 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Sun, 7 Apr 2013 13:21:08 -0400
Subject: [PATCH 32/60] nfsd4: remove some redundant comments

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 9cb9f6e3f5f2..1226ff6030bd 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4432,17 +4432,11 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 						locku->lu_length);
 	nfs4_transform_lock_offset(file_lock);
 
-	/*
-	*  Try to unlock the file in the VFS.
-	*/
 	err = vfs_lock_file(filp, F_SETLK, file_lock, NULL);
 	if (err) {
 		dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n");
 		goto out_nfserr;
 	}
-	/*
-	* OK, unlock succeeded; the only thing left to do is update the stateid.
-	*/
 	update_stateid(&stp->st_stid.sc_stateid);
 	memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
 

From 3d74e6a5b6b0d1e4786d1596081bed6ab63a4cac Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Fri, 22 Mar 2013 17:44:19 -0400
Subject: [PATCH 33/60] nfsd4: no need for replay_owner in sessions case

The replay_owner will never be used in the sessions case.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4proc.c  | 2 +-
 fs/nfsd/nfs4state.c | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 609e1e211330..c97bb424f55f 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -413,7 +413,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	WARN_ON(status && open->op_created);
 out:
 	nfsd4_cleanup_open_state(open, status);
-	if (open->op_openowner)
+	if (open->op_openowner && !nfsd4_has_session(cstate))
 		cstate->replay_owner = &open->op_openowner->oo_owner;
 	nfsd4_bump_seqid(cstate, status);
 	if (!cstate->replay_owner)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 1226ff6030bd..16db25dc364f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3666,7 +3666,8 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
 	if (status)
 		return status;
 	*stpp = openlockstateid(s);
-	cstate->replay_owner = (*stpp)->st_stateowner;
+	if (!nfsd4_has_session(cstate))
+		cstate->replay_owner = (*stpp)->st_stateowner;
 
 	return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp);
 }
@@ -3858,10 +3859,9 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		oo->oo_last_closed_stid = stp;
 
 	if (list_empty(&oo->oo_owner.so_stateids)) {
-		if (cstate->minorversion) {
+		if (cstate->minorversion)
 			release_openowner(oo);
-			cstate->replay_owner = NULL;
-		} else {
+		else {
 			/*
 			 * In the 4.0 case we need to keep the owners around a
 			 * little while to handle CLOSE replay.

From bbc9c36c31fc5827c22359a8b0ba9dd71b5eecfc Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Fri, 22 Mar 2013 18:03:49 -0400
Subject: [PATCH 34/60] nfsd4: more sessions/open-owner-replay cleanup

More logic that's unnecessary in the 4.1 case.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4proc.c | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index c97bb424f55f..5dee81141ab7 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -191,9 +191,18 @@ static __be32 nfsd_check_obj_isreg(struct svc_fh *fh)
 	return nfserr_symlink;
 }
 
-static __be32
-do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
+static void nfsd4_set_open_owner_reply_cache(struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh *resfh)
 {
+	if (nfsd4_has_session(cstate))
+		return;
+	fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh,
+			&resfh->fh_handle);
+}
+
+static __be32
+do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open)
+{
+	struct svc_fh *current_fh = &cstate->current_fh;
 	struct svc_fh *resfh;
 	int accmode;
 	__be32 status;
@@ -252,9 +261,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
 	if (is_create_with_attrs(open) && open->op_acl != NULL)
 		do_set_nfs4_acl(rqstp, resfh, open->op_acl, open->op_bmval);
 
-	/* set reply cache */
-	fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh,
-			&resfh->fh_handle);
+	nfsd4_set_open_owner_reply_cache(cstate, open, resfh);
 	accmode = NFSD_MAY_NOP;
 	if (open->op_created)
 		accmode |= NFSD_MAY_OWNER_OVERRIDE;
@@ -268,8 +275,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
 }
 
 static __be32
-do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
+do_open_fhandle(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open)
 {
+	struct svc_fh *current_fh = &cstate->current_fh;
 	__be32 status;
 
 	/* We don't know the target directory, and therefore can not
@@ -278,9 +286,7 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
 
 	memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info));
 
-	/* set replay cache */
-	fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh,
-			&current_fh->fh_handle);
+	nfsd4_set_open_owner_reply_cache(cstate, open, current_fh);
 
 	open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) &&
 		(open->op_iattr.ia_size == 0);
@@ -372,8 +378,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	switch (open->op_claim_type) {
 		case NFS4_OPEN_CLAIM_DELEGATE_CUR:
 		case NFS4_OPEN_CLAIM_NULL:
-			status = do_open_lookup(rqstp, &cstate->current_fh,
-						open);
+			status = do_open_lookup(rqstp, cstate, open);
 			if (status)
 				goto out;
 			break;
@@ -386,8 +391,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 				goto out;
 		case NFS4_OPEN_CLAIM_FH:
 		case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
-			status = do_open_fhandle(rqstp, &cstate->current_fh,
-						 open);
+			status = do_open_fhandle(rqstp, cstate, open);
 			if (status)
 				goto out;
 			break;

From eb2099f31b0f090684a64ef8df44a30ff7c45fc2 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Sun, 7 Apr 2013 13:28:16 -0400
Subject: [PATCH 35/60] nfsd4: release lockowners on last unlock in 4.1 case

In the 4.1 case we're supposed to release lockowners as soon as they're
no longer used.

It would probably be more efficient to reference count them, but that's
slightly fiddly due to the need to have callbacks from locks.c to take
into account lock merging and splitting.

For most cases just scanning the inode's lock list on unlock for
matching locks will be sufficient.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 16db25dc364f..ff1577d6df62 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4387,6 +4387,7 @@ __be32
 nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	    struct nfsd4_locku *locku)
 {
+	struct nfs4_lockowner *lo;
 	struct nfs4_ol_stateid *stp;
 	struct file *filp = NULL;
 	struct file_lock *file_lock = NULL;
@@ -4419,9 +4420,10 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		status = nfserr_jukebox;
 		goto out;
 	}
+	lo = lockowner(stp->st_stateowner);
 	locks_init_lock(file_lock);
 	file_lock->fl_type = F_UNLCK;
-	file_lock->fl_owner = (fl_owner_t)lockowner(stp->st_stateowner);
+	file_lock->fl_owner = (fl_owner_t)lo;
 	file_lock->fl_pid = current->tgid;
 	file_lock->fl_file = filp;
 	file_lock->fl_flags = FL_POSIX;
@@ -4440,6 +4442,11 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	update_stateid(&stp->st_stid.sc_stateid);
 	memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
 
+	if (nfsd4_has_session(cstate) && !check_for_locks(stp->st_file, lo)) {
+		WARN_ON_ONCE(cstate->replay_owner);
+		release_lockowner(lo);
+	}
+
 out:
 	nfsd4_bump_seqid(cstate, status);
 	if (!cstate->replay_owner)

From 0c7c3e67ab91ec6caa44bdf1fc89a48012ceb0c5 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 28 Mar 2013 20:37:14 -0400
Subject: [PATCH 36/60] nfsd4: don't close read-write opens too soon
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Don't actually close any opens until we don't need them at all.

This means being left with write access when it's not really necessary,
but that's better than putting a file that might still have posix locks
held on it, as we have been.

Reported-by: Toralf Förster <toralf.foerster@gmx.de>
Cc: stable@kernel.org
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index ff1577d6df62..7d2e3b54b9df 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -307,13 +307,7 @@ static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag)
 {
 	if (atomic_dec_and_test(&fp->fi_access[oflag])) {
 		nfs4_file_put_fd(fp, oflag);
-		/*
-		 * It's also safe to get rid of the RDWR open *if*
-		 * we no longer have need of the other kind of access
-		 * or if we already have the other kind of open:
-		 */
-		if (fp->fi_fds[1-oflag]
-			|| atomic_read(&fp->fi_access[1 - oflag]) == 0)
+		if (atomic_read(&fp->fi_access[1 - oflag]) == 0)
 			nfs4_file_put_fd(fp, O_RDWR);
 	}
 }

From 373cd4098a6392395af63af220d989df00b444f7 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 8 Apr 2013 15:42:12 -0400
Subject: [PATCH 37/60] nfsd4: cleanup check_forechannel_attrs

Pass this struct by reference, not by value, and return an error instead
of a boolean to allow for future additions.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 7d2e3b54b9df..f1262f73f08b 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1803,10 +1803,13 @@ nfsd4_replay_create_session(struct nfsd4_create_session *cr_ses,
 				/* seqid, slotID, slotID, slotID, status */ \
 			5 ) * sizeof(__be32))
 
-static bool check_forechannel_attrs(struct nfsd4_channel_attrs fchannel)
+static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca)
 {
-	return fchannel.maxreq_sz < NFSD_MIN_REQ_HDR_SEQ_SZ
-		|| fchannel.maxresp_sz < NFSD_MIN_RESP_HDR_SEQ_SZ;
+	if (ca->maxreq_sz < NFSD_MIN_REQ_HDR_SEQ_SZ)
+		return nfserr_toosmall;
+	if (ca->maxresp_sz < NFSD_MIN_RESP_HDR_SEQ_SZ)
+		return nfserr_toosmall;
+	return nfs_ok;
 }
 
 __be32
@@ -1824,8 +1827,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 
 	if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
 		return nfserr_inval;
-	if (check_forechannel_attrs(cr_ses->fore_channel))
-		return nfserr_toosmall;
+	status = check_forechannel_attrs(&cr_ses->fore_channel);
+	if (status)
+		return status;
 	new = alloc_session(&cr_ses->fore_channel, nn);
 	if (!new)
 		return nfserr_jukebox;

From 55c760cfc40d75b4d8a17d56580ec306db2ab14f Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 8 Apr 2013 16:44:14 -0400
Subject: [PATCH 38/60] nfsd4: fix forechannel attribute negotiation

Negotiation of the 4.1 session forechannel attributes is a mess.  Fix:

	- Move it all into check_forechannel_attrs instead of spreading
	  it between that, alloc_session, and init_forechannel_attrs.
	- set a minimum "slotsize" so that our drc memory limits apply
	  even for small maxresponsesize_cached.  This also fixes some
	  bugs when slotsize becomes <= 0.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 116 +++++++++++++++++++-------------------------
 1 file changed, 49 insertions(+), 67 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index f1262f73f08b..036d5f16fd7f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -776,17 +776,15 @@ free_session_slots(struct nfsd4_session *ses)
  * We don't actually need to cache the rpc and session headers, so we
  * can allocate a little less for each slot:
  */
-static inline int slot_bytes(struct nfsd4_channel_attrs *ca)
+static inline u32 slot_bytes(struct nfsd4_channel_attrs *ca)
 {
-	return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
-}
+	u32 size;
 
-static int nfsd4_sanitize_slot_size(u32 size)
-{
-	size -= NFSD_MIN_HDR_SEQ_SZ; /* We don't cache the rpc header */
-	size = min_t(u32, size, NFSD_SLOT_CACHE_SIZE);
-
-	return size;
+	if (ca->maxresp_cached < NFSD_MIN_HDR_SEQ_SZ)
+		size = 0;
+	else
+		size = ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
+	return size + sizeof(struct nfsd4_slot);
 }
 
 /*
@@ -794,12 +792,12 @@ static int nfsd4_sanitize_slot_size(u32 size)
  * re-negotiate active sessions and reduce their slot usage to make
  * room for new connections. For now we just fail the create session.
  */
-static int nfsd4_get_drc_mem(int slotsize, u32 num)
+static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca)
 {
+	u32 slotsize = slot_bytes(ca);
+	u32 num = ca->maxreqs;
 	int avail;
 
-	num = min_t(u32, num, NFSD_MAX_SLOTS_PER_SESSION);
-
 	spin_lock(&nfsd_drc_lock);
 	avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION,
 		    nfsd_drc_max_mem - nfsd_drc_mem_used);
@@ -810,15 +808,19 @@ static int nfsd4_get_drc_mem(int slotsize, u32 num)
 	return num;
 }
 
-static void nfsd4_put_drc_mem(int slotsize, int num)
+static void nfsd4_put_drc_mem(struct nfsd4_channel_attrs *ca)
 {
+	int slotsize = slot_bytes(ca);
+
 	spin_lock(&nfsd_drc_lock);
-	nfsd_drc_mem_used -= slotsize * num;
+	nfsd_drc_mem_used -= slotsize * ca->maxreqs;
 	spin_unlock(&nfsd_drc_lock);
 }
 
-static struct nfsd4_session *__alloc_session(int slotsize, int numslots)
+static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *attrs)
 {
+	int numslots = attrs->maxreqs;
+	int slotsize = slot_bytes(attrs);
 	struct nfsd4_session *new;
 	int mem, i;
 
@@ -831,8 +833,7 @@ static struct nfsd4_session *__alloc_session(int slotsize, int numslots)
 		return NULL;
 	/* allocate each struct nfsd4_slot and data cache in one piece */
 	for (i = 0; i < numslots; i++) {
-		mem = sizeof(struct nfsd4_slot) + slotsize;
-		new->se_slots[i] = kzalloc(mem, GFP_KERNEL);
+		new->se_slots[i] = kzalloc(slotsize, GFP_KERNEL);
 		if (!new->se_slots[i])
 			goto out_free;
 	}
@@ -844,21 +845,6 @@ static struct nfsd4_session *__alloc_session(int slotsize, int numslots)
 	return NULL;
 }
 
-static void init_forechannel_attrs(struct nfsd4_channel_attrs *new,
-				   struct nfsd4_channel_attrs *req,
-				   int numslots, int slotsize,
-				   struct nfsd_net *nn)
-{
-	u32 maxrpc = nn->nfsd_serv->sv_max_mesg;
-
-	new->maxreqs = numslots;
-	new->maxresp_cached = min_t(u32, req->maxresp_cached,
-					slotsize + NFSD_MIN_HDR_SEQ_SZ);
-	new->maxreq_sz = min_t(u32, req->maxreq_sz, maxrpc);
-	new->maxresp_sz = min_t(u32, req->maxresp_sz, maxrpc);
-	new->maxops = min_t(u32, req->maxops, NFSD_MAX_OPS_PER_COMPOUND);
-}
-
 static void free_conn(struct nfsd4_conn *c)
 {
 	svc_xprt_put(c->cn_xprt);
@@ -960,7 +946,6 @@ static void nfsd4_del_conns(struct nfsd4_session *s)
 
 static void __free_session(struct nfsd4_session *ses)
 {
-	nfsd4_put_drc_mem(slot_bytes(&ses->se_fchannel), ses->se_fchannel.maxreqs);
 	free_session_slots(ses);
 	kfree(ses);
 }
@@ -971,35 +956,10 @@ static void free_session(struct nfsd4_session *ses)
 
 	lockdep_assert_held(&nn->client_lock);
 	nfsd4_del_conns(ses);
+	nfsd4_put_drc_mem(&ses->se_fchannel);
 	__free_session(ses);
 }
 
-static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan,
-					   struct nfsd_net *nn)
-{
-	struct nfsd4_session *new;
-	int numslots, slotsize;
-	/*
-	 * Note decreasing slot size below client's request may
-	 * make it difficult for client to function correctly, whereas
-	 * decreasing the number of slots will (just?) affect
-	 * performance.  When short on memory we therefore prefer to
-	 * decrease number of slots instead of their size.
-	 */
-	slotsize = nfsd4_sanitize_slot_size(fchan->maxresp_cached);
-	numslots = nfsd4_get_drc_mem(slotsize, fchan->maxreqs);
-	if (numslots < 1)
-		return NULL;
-
-	new = __alloc_session(slotsize, numslots);
-	if (!new) {
-		nfsd4_put_drc_mem(slotsize, numslots);
-		return NULL;
-	}
-	init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize, nn);
-	return new;
-}
-
 static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses)
 {
 	int idx;
@@ -1022,7 +982,8 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
 	list_add(&new->se_perclnt, &clp->cl_sessions);
 	spin_unlock(&clp->cl_lock);
 	spin_unlock(&nn->client_lock);
-
+	memcpy(&new->se_fchannel, &cses->fore_channel,
+			sizeof(struct nfsd4_channel_attrs));
 	if (cses->flags & SESSION4_BACK_CHAN) {
 		struct sockaddr *sa = svc_addr(rqstp);
 		/*
@@ -1803,12 +1764,33 @@ nfsd4_replay_create_session(struct nfsd4_create_session *cr_ses,
 				/* seqid, slotID, slotID, slotID, status */ \
 			5 ) * sizeof(__be32))
 
-static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca)
+static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfsd_net *nn)
 {
+	u32 maxrpc = nn->nfsd_serv->sv_max_mesg;
+
 	if (ca->maxreq_sz < NFSD_MIN_REQ_HDR_SEQ_SZ)
 		return nfserr_toosmall;
 	if (ca->maxresp_sz < NFSD_MIN_RESP_HDR_SEQ_SZ)
 		return nfserr_toosmall;
+	ca->headerpadsz = 0;
+	ca->maxreq_sz = min_t(u32, ca->maxreq_sz, maxrpc);
+	ca->maxresp_sz = min_t(u32, ca->maxresp_sz, maxrpc);
+	ca->maxops = min_t(u32, ca->maxops, NFSD_MAX_OPS_PER_COMPOUND);
+	ca->maxresp_cached = min_t(u32, ca->maxresp_cached,
+			NFSD_SLOT_CACHE_SIZE + NFSD_MIN_HDR_SEQ_SZ);
+	ca->maxreqs = min_t(u32, ca->maxreqs, NFSD_MAX_SLOTS_PER_SESSION);
+	/*
+	 * Note decreasing slot size below client's request may make it
+	 * difficult for client to function correctly, whereas
+	 * decreasing the number of slots will (just?) affect
+	 * performance.  When short on memory we therefore prefer to
+	 * decrease number of slots instead of their size.  Clients that
+	 * request larger slots than they need will get poor results:
+	 */
+	ca->maxreqs = nfsd4_get_drc_mem(ca);
+	if (!ca->maxreqs)
+		return nfserr_jukebox;
+
 	return nfs_ok;
 }
 
@@ -1827,13 +1809,13 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 
 	if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
 		return nfserr_inval;
-	status = check_forechannel_attrs(&cr_ses->fore_channel);
+	status = check_forechannel_attrs(&cr_ses->fore_channel, nn);
 	if (status)
 		return status;
-	new = alloc_session(&cr_ses->fore_channel, nn);
-	if (!new)
-		return nfserr_jukebox;
 	status = nfserr_jukebox;
+	new = alloc_session(&cr_ses->fore_channel);
+	if (!new)
+		goto out_release_drc_mem;
 	conn = alloc_conn_from_crses(rqstp, cr_ses);
 	if (!conn)
 		goto out_free_session;
@@ -1892,8 +1874,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 
 	memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
 	       NFS4_MAX_SESSIONID_LEN);
-	memcpy(&cr_ses->fore_channel, &new->se_fchannel,
-		sizeof(struct nfsd4_channel_attrs));
 	cs_slot->sl_seqid++;
 	cr_ses->seqid = cs_slot->sl_seqid;
 
@@ -1906,6 +1886,8 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 	free_conn(conn);
 out_free_session:
 	__free_session(new);
+out_release_drc_mem:
+	nfsd4_put_drc_mem(&cr_ses->fore_channel);
 	return status;
 }
 

From 06b332a52293a45324320b6b446a7fa677fb6702 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 9 Apr 2013 11:34:36 -0400
Subject: [PATCH 39/60] nfsd4: check backchannel attributes on create_session

Make sure the client gives us an adequate backchannel.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4callback.c          | 25 +------------------------
 fs/nfsd/nfs4state.c             | 25 +++++++++++++++++++++++++
 fs/nfsd/xdr4cb.h                | 23 +++++++++++++++++++++++
 include/linux/sunrpc/msg_prot.h |  3 +++
 4 files changed, 52 insertions(+), 24 deletions(-)
 create mode 100644 fs/nfsd/xdr4cb.h

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index be3ff0f3ff68..7f05cd140de3 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -37,6 +37,7 @@
 #include "nfsd.h"
 #include "state.h"
 #include "netns.h"
+#include "xdr4cb.h"
 
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
@@ -53,30 +54,6 @@ enum {
 	NFSPROC4_CLNT_CB_SEQUENCE,
 };
 
-#define NFS4_MAXTAGLEN		20
-
-#define NFS4_enc_cb_null_sz		0
-#define NFS4_dec_cb_null_sz		0
-#define cb_compound_enc_hdr_sz		4
-#define cb_compound_dec_hdr_sz		(3 + (NFS4_MAXTAGLEN >> 2))
-#define sessionid_sz			(NFS4_MAX_SESSIONID_LEN >> 2)
-#define cb_sequence_enc_sz		(sessionid_sz + 4 +             \
-					1 /* no referring calls list yet */)
-#define cb_sequence_dec_sz		(op_dec_sz + sessionid_sz + 4)
-
-#define op_enc_sz			1
-#define op_dec_sz			2
-#define enc_nfs4_fh_sz			(1 + (NFS4_FHSIZE >> 2))
-#define enc_stateid_sz			(NFS4_STATEID_SIZE >> 2)
-#define NFS4_enc_cb_recall_sz		(cb_compound_enc_hdr_sz +       \
-					cb_sequence_enc_sz +            \
-					1 + enc_stateid_sz +            \
-					enc_nfs4_fh_sz)
-
-#define NFS4_dec_cb_recall_sz		(cb_compound_dec_hdr_sz  +      \
-					cb_sequence_dec_sz +            \
-					op_dec_sz)
-
 struct nfs4_cb_compound_hdr {
 	/* args */
 	u32		ident;	/* minorversion 0 only */
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 036d5f16fd7f..67017fcebb21 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -42,6 +42,7 @@
 #include <linux/sunrpc/svcauth_gss.h>
 #include <linux/sunrpc/addr.h>
 #include "xdr4.h"
+#include "xdr4cb.h"
 #include "vfs.h"
 #include "current_stateid.h"
 
@@ -1794,6 +1795,27 @@ static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfs
 	return nfs_ok;
 }
 
+static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca)
+{
+	ca->headerpadsz = 0;
+
+	/*
+	 * These RPC_MAX_HEADER macros are overkill, especially since we
+	 * don't even do gss on the backchannel yet.  But this is still
+	 * less than 1k.  Tighten up this estimate in the unlikely event
+	 * it turns out to be a problem for some client:
+	 */
+	if (ca->maxreq_sz < NFS4_enc_cb_recall_sz + RPC_MAX_HEADER_WITH_AUTH)
+		return nfserr_toosmall;
+	if (ca->maxresp_sz < NFS4_dec_cb_recall_sz + RPC_MAX_REPHEADER_WITH_AUTH)
+		return nfserr_toosmall;
+	ca->maxresp_cached = 0;
+	if (ca->maxops < 2)
+		return nfserr_toosmall;
+
+	return nfs_ok;
+}
+
 __be32
 nfsd4_create_session(struct svc_rqst *rqstp,
 		     struct nfsd4_compound_state *cstate,
@@ -1810,6 +1832,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 	if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
 		return nfserr_inval;
 	status = check_forechannel_attrs(&cr_ses->fore_channel, nn);
+	if (status)
+		return status;
+	status = check_backchannel_attrs(&cr_ses->back_channel);
 	if (status)
 		return status;
 	status = nfserr_jukebox;
diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
new file mode 100644
index 000000000000..c5c55dfb91a9
--- /dev/null
+++ b/fs/nfsd/xdr4cb.h
@@ -0,0 +1,23 @@
+#define NFS4_MAXTAGLEN		20
+
+#define NFS4_enc_cb_null_sz		0
+#define NFS4_dec_cb_null_sz		0
+#define cb_compound_enc_hdr_sz		4
+#define cb_compound_dec_hdr_sz		(3 + (NFS4_MAXTAGLEN >> 2))
+#define sessionid_sz			(NFS4_MAX_SESSIONID_LEN >> 2)
+#define cb_sequence_enc_sz		(sessionid_sz + 4 +             \
+					1 /* no referring calls list yet */)
+#define cb_sequence_dec_sz		(op_dec_sz + sessionid_sz + 4)
+
+#define op_enc_sz			1
+#define op_dec_sz			2
+#define enc_nfs4_fh_sz			(1 + (NFS4_FHSIZE >> 2))
+#define enc_stateid_sz			(NFS4_STATEID_SIZE >> 2)
+#define NFS4_enc_cb_recall_sz		(cb_compound_enc_hdr_sz +       \
+					cb_sequence_enc_sz +            \
+					1 + enc_stateid_sz +            \
+					enc_nfs4_fh_sz)
+
+#define NFS4_dec_cb_recall_sz		(cb_compound_dec_hdr_sz  +      \
+					cb_sequence_dec_sz +            \
+					op_dec_sz)
diff --git a/include/linux/sunrpc/msg_prot.h b/include/linux/sunrpc/msg_prot.h
index c68a147939a6..aadc6a04e1ac 100644
--- a/include/linux/sunrpc/msg_prot.h
+++ b/include/linux/sunrpc/msg_prot.h
@@ -138,6 +138,9 @@ typedef __be32	rpc_fraghdr;
 #define RPC_MAX_HEADER_WITH_AUTH \
 	(RPC_CALLHDRSIZE + 2*(2+RPC_MAX_AUTH_SIZE/4))
 
+#define RPC_MAX_REPHEADER_WITH_AUTH \
+	(RPC_REPHDRSIZE + (2 + RPC_MAX_AUTH_SIZE/4))
+
 /*
  * RFC1833/RFC3530 rpcbind (v3+) well-known netid's.
  */

From 23340032e64d70ce76817a88e8193c8040b095cf Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 9 Apr 2013 17:42:28 -0400
Subject: [PATCH 40/60] nfsd4: clean up validate_stateid

The logic here is better expressed with a switch statement.

While we're here, CLOSED stateids (or stateids of an unkown type--which
would indicate a server bug) should probably return nfserr_bad_stateid,
though this behavior shouldn't affect any non-buggy client.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 67017fcebb21..add9721ab059 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3456,13 +3456,22 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
 	status = check_stateid_generation(stateid, &s->sc_stateid, 1);
 	if (status)
 		return status;
-	if (!(s->sc_type & (NFS4_OPEN_STID | NFS4_LOCK_STID)))
+	switch (s->sc_type) {
+	case NFS4_DELEG_STID:
 		return nfs_ok;
-	ols = openlockstateid(s);
-	if (ols->st_stateowner->so_is_open_owner
-	    && !(openowner(ols->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED))
+	case NFS4_OPEN_STID:
+	case NFS4_LOCK_STID:
+		ols = openlockstateid(s);
+		if (ols->st_stateowner->so_is_open_owner
+	    			&& !(openowner(ols->st_stateowner)->oo_flags
+						& NFS4_OO_CONFIRMED))
+			return nfserr_bad_stateid;
+		return nfs_ok;
+	default:
+		printk("unknown stateid type %x\n", s->sc_type);
+	case NFS4_CLOSED_STID:
 		return nfserr_bad_stateid;
-	return nfs_ok;
+	}
 }
 
 static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask,

From 3bd64a5ba1719c2bb6cba4493dfd3e23a7653e54 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 9 Apr 2013 17:02:51 -0400
Subject: [PATCH 41/60] nfsd4: implement SEQ4_STATUS_RECALLABLE_STATE_REVOKED

A 4.1 server must notify a client that has had any state revoked using
the SEQ4_STATUS_RECALLABLE_STATE_REVOKED flag.  The client can figure
out exactly which state is the problem using CHECK_STATEID and then free
it using FREE_STATEID.  The status flag will be unset once all such
revoked stateids are freed.

Our server's only recallable state is delegations.  So we keep with each
4.1 client a list of delegations that have timed out and been recalled,
but haven't yet been freed by FREE_STATEID.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 55 ++++++++++++++++++++++++++++++++++++++-------
 fs/nfsd/state.h     |  3 +++
 2 files changed, 50 insertions(+), 8 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index add9721ab059..3b84700d1bd7 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -445,7 +445,6 @@ static void unhash_stid(struct nfs4_stid *s)
 static void
 unhash_delegation(struct nfs4_delegation *dp)
 {
-	unhash_stid(&dp->dl_stid);
 	list_del_init(&dp->dl_perclnt);
 	spin_lock(&recall_lock);
 	list_del_init(&dp->dl_perfile);
@@ -454,10 +453,37 @@ unhash_delegation(struct nfs4_delegation *dp)
 	nfs4_put_deleg_lease(dp->dl_file);
 	put_nfs4_file(dp->dl_file);
 	dp->dl_file = NULL;
+}
+
+
+
+static void destroy_revoked_delegation(struct nfs4_delegation *dp)
+{
+	list_del_init(&dp->dl_recall_lru);
 	remove_stid(&dp->dl_stid);
 	nfs4_put_delegation(dp);
 }
 
+static void destroy_delegation(struct nfs4_delegation *dp)
+{
+	unhash_delegation(dp);
+	remove_stid(&dp->dl_stid);
+	nfs4_put_delegation(dp);
+}
+
+static void revoke_delegation(struct nfs4_delegation *dp)
+{
+	struct nfs4_client *clp = dp->dl_stid.sc_client;
+
+	if (clp->cl_minorversion == 0)
+		destroy_delegation(dp);
+	else {
+		unhash_delegation(dp);
+		dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID;
+		list_add(&dp->dl_recall_lru, &clp->cl_revoked);
+	}
+}
+
 /* 
  * SETCLIENTID state 
  */
@@ -1114,7 +1140,7 @@ destroy_client(struct nfs4_client *clp)
 	spin_unlock(&recall_lock);
 	while (!list_empty(&reaplist)) {
 		dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
-		unhash_delegation(dp);
+		destroy_delegation(dp);
 	}
 	while (!list_empty(&clp->cl_openowners)) {
 		oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient);
@@ -1310,6 +1336,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
 	INIT_LIST_HEAD(&clp->cl_delegations);
 	INIT_LIST_HEAD(&clp->cl_lru);
 	INIT_LIST_HEAD(&clp->cl_callbacks);
+	INIT_LIST_HEAD(&clp->cl_revoked);
 	spin_lock_init(&clp->cl_lock);
 	nfsd4_init_callback(&clp->cl_cb_null);
 	clp->cl_time = get_seconds();
@@ -2171,6 +2198,8 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	default:
 		seq->status_flags = 0;
 	}
+	if (!list_empty(&clp->cl_revoked))
+		seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED;
 out_no_session:
 	kfree(conn);
 	spin_unlock(&nn->client_lock);
@@ -3297,7 +3326,7 @@ nfs4_laundromat(struct nfsd_net *nn)
 	spin_unlock(&recall_lock);
 	list_for_each_safe(pos, next, &reaplist) {
 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
-		unhash_delegation(dp);
+		revoke_delegation(dp);
 	}
 	test_val = nn->nfsd4_lease;
 	list_for_each_safe(pos, next, &nn->close_lru) {
@@ -3459,6 +3488,8 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
 	switch (s->sc_type) {
 	case NFS4_DELEG_STID:
 		return nfs_ok;
+	case NFS4_REVOKED_DELEG_STID:
+		return nfserr_deleg_revoked;
 	case NFS4_OPEN_STID:
 	case NFS4_LOCK_STID:
 		ols = openlockstateid(s);
@@ -3602,6 +3633,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 {
 	stateid_t *stateid = &free_stateid->fr_stateid;
 	struct nfs4_stid *s;
+	struct nfs4_delegation *dp;
 	struct nfs4_client *cl = cstate->session->se_client;
 	__be32 ret = nfserr_bad_stateid;
 
@@ -3623,6 +3655,11 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		else
 			ret = nfserr_locks_held;
 		break;
+	case NFS4_REVOKED_DELEG_STID:
+		dp = delegstateid(s);
+		destroy_revoked_delegation(dp);
+		ret = nfs_ok;
+		break;
 	default:
 		ret = nfserr_bad_stateid;
 	}
@@ -3647,10 +3684,12 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_
 	status = nfsd4_check_seqid(cstate, sop, seqid);
 	if (status)
 		return status;
-	if (stp->st_stid.sc_type == NFS4_CLOSED_STID)
+	if (stp->st_stid.sc_type == NFS4_CLOSED_STID
+		|| stp->st_stid.sc_type == NFS4_REVOKED_DELEG_STID)
 		/*
 		 * "Closed" stateid's exist *only* to return
-		 * nfserr_replay_me from the previous step.
+		 * nfserr_replay_me from the previous step, and
+		 * revoked delegations are kept only for free_stateid.
 		 */
 		return nfserr_bad_stateid;
 	status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate));
@@ -3913,7 +3952,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (status)
 		goto out;
 
-	unhash_delegation(dp);
+	destroy_delegation(dp);
 out:
 	nfs4_unlock_state();
 
@@ -4763,7 +4802,7 @@ u64 nfsd_forget_client_delegations(struct nfs4_client *clp, u64 max)
 	spin_unlock(&recall_lock);
 
 	list_for_each_entry_safe(dp, next, &victims, dl_recall_lru)
-		unhash_delegation(dp);
+		revoke_delegation(dp);
 
 	return count;
 }
@@ -5018,7 +5057,7 @@ nfs4_state_shutdown_net(struct net *net)
 	spin_unlock(&recall_lock);
 	list_for_each_safe(pos, next, &reaplist) {
 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
-		unhash_delegation(dp);
+		destroy_delegation(dp);
 	}
 
 	nfsd4_client_tracking_exit(net);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 13ec4853e9af..274e2a114e05 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -79,6 +79,8 @@ struct nfs4_stid {
 #define NFS4_DELEG_STID 4
 /* For an open stateid kept around *only* to process close replays: */
 #define NFS4_CLOSED_STID 8
+/* For a deleg stateid kept around only to process free_stateid's: */
+#define NFS4_REVOKED_DELEG_STID 16
 	unsigned char sc_type;
 	stateid_t sc_stateid;
 	struct nfs4_client *sc_client;
@@ -238,6 +240,7 @@ struct nfs4_client {
 	struct list_head	cl_openowners;
 	struct idr		cl_stateids;	/* stateid lookup */
 	struct list_head	cl_delegations;
+	struct list_head	cl_revoked;	/* unacknowledged, revoked 4.1 state */
 	struct list_head        cl_lru;         /* tail queue */
 	struct xdr_netobj	cl_name; 	/* id generated by client */
 	nfs4_verifier		cl_verifier; 	/* generated by client */

From 53584f66529d4c5940901e4ffe98ad7012dc6e0c Mon Sep 17 00:00:00 2001
From: fanchaoting <tingchaofan@gmail.com>
Date: Thu, 11 Apr 2013 21:24:13 +0800
Subject: [PATCH 42/60] nfsd4: remove some useless code

The "list_empty(&oo->oo_owner.so_stateids)" is aways true, so remove it.

Signed-off-by: fanchaoting <fanchaoting@cn.fujitsu.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3b84700d1bd7..a7954913b332 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3919,8 +3919,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 			 * In the 4.0 case we need to keep the owners around a
 			 * little while to handle CLOSE replay.
 			 */
-			if (list_empty(&oo->oo_owner.so_stateids))
-				move_to_close_lru(oo, SVC_NET(rqstp));
+			move_to_close_lru(oo, SVC_NET(rqstp));
 		}
 	}
 out:

From 9aeb5aeeb09d59794896ccefd60d58c44987f52f Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 16 Apr 2013 21:29:03 -0400
Subject: [PATCH 43/60] nfsd4: remove unused macro

Cleanup a piece I forgot to remove in
9411b1d4c7df26dca6bc6261b5dc87a5b4c81e5c "nfsd4: cleanup handling of
nfsv4.0 closed stateid's".

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4xdr.c | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index a5e8a6424843..1cf154511dae 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1693,14 +1693,6 @@ static void write_cinfo(__be32 **p, struct nfsd4_change_info *c)
 } while (0)
 #define ADJUST_ARGS()		resp->p = p
 
-/*
- * Header routine to setup seqid operation replay cache
- */
-#define ENCODE_SEQID_OP_HEAD					\
-	__be32 *save;						\
-								\
-	save = resp->p;
-
 /* Encode as an array of strings the string given with components
  * separated @sep, escaped with esc_enter and esc_exit.
  */
@@ -2640,8 +2632,6 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp,
 static __be32
 nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close)
 {
-	ENCODE_SEQID_OP_HEAD;
-
 	if (!nfserr)
 		nfsd4_encode_stateid(resp, &close->cl_stateid);
 
@@ -2740,8 +2730,6 @@ nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denie
 static __be32
 nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock)
 {
-	ENCODE_SEQID_OP_HEAD;
-
 	if (!nfserr)
 		nfsd4_encode_stateid(resp, &lock->lk_resp_stateid);
 	else if (nfserr == nfserr_denied)
@@ -2761,8 +2749,6 @@ nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l
 static __be32
 nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku)
 {
-	ENCODE_SEQID_OP_HEAD;
-
 	if (!nfserr)
 		nfsd4_encode_stateid(resp, &locku->lu_stateid);
 
@@ -2788,7 +2774,6 @@ static __be32
 nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open)
 {
 	__be32 *p;
-	ENCODE_SEQID_OP_HEAD;
 
 	if (nfserr)
 		goto out;
@@ -2866,8 +2851,6 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
 static __be32
 nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc)
 {
-	ENCODE_SEQID_OP_HEAD;
-
 	if (!nfserr)
 		nfsd4_encode_stateid(resp, &oc->oc_resp_stateid);
 
@@ -2877,8 +2860,6 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct
 static __be32
 nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od)
 {
-	ENCODE_SEQID_OP_HEAD;
-
 	if (!nfserr)
 		nfsd4_encode_stateid(resp, &od->od_stateid);
 

From ba138435d1f8b25aa2b787848ee939270a50e34f Mon Sep 17 00:00:00 2001
From: Fengguang Wu <fengguang.wu@intel.com>
Date: Tue, 16 Apr 2013 22:14:15 -0400
Subject: [PATCH 44/60] nfsd4: put_client_renew_locked can be static

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index a7954913b332..e39bf5381bca 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -193,7 +193,7 @@ renew_client(struct nfs4_client *clp)
 	spin_unlock(&nn->client_lock);
 }
 
-void put_client_renew_locked(struct nfs4_client *clp)
+static void put_client_renew_locked(struct nfs4_client *clp)
 {
 	if (!atomic_dec_and_test(&clp->cl_refcount))
 		return;

From bf8d909705e9d9bac31d9b8eac6734d2b51332a7 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Fri, 19 Apr 2013 16:09:38 -0400
Subject: [PATCH 45/60] nfsd: Decode and send 64bit time values

The seconds field of an nfstime4 structure is 64bit, but we are assuming
that the first 32bits are zero-filled.  So if the client tries to set
atime to a value before the epoch (touch -t 196001010101), then the
server will save the wrong value on disk.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Cc: stable@kernel.org
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4xdr.c | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 1cf154511dae..888a600dad8c 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -344,10 +344,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
 			   all 32 bits of 'nseconds'. */
 			READ_BUF(12);
 			len += 12;
-			READ32(dummy32);
-			if (dummy32)
-				return nfserr_inval;
-			READ32(iattr->ia_atime.tv_sec);
+			READ64(iattr->ia_atime.tv_sec);
 			READ32(iattr->ia_atime.tv_nsec);
 			if (iattr->ia_atime.tv_nsec >= (u32)1000000000)
 				return nfserr_inval;
@@ -370,10 +367,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
 			   all 32 bits of 'nseconds'. */
 			READ_BUF(12);
 			len += 12;
-			READ32(dummy32);
-			if (dummy32)
-				return nfserr_inval;
-			READ32(iattr->ia_mtime.tv_sec);
+			READ64(iattr->ia_mtime.tv_sec);
 			READ32(iattr->ia_mtime.tv_nsec);
 			if (iattr->ia_mtime.tv_nsec >= (u32)1000000000)
 				return nfserr_inval;
@@ -2372,8 +2366,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 	if (bmval1 & FATTR4_WORD1_TIME_ACCESS) {
 		if ((buflen -= 12) < 0)
 			goto out_resource;
-		WRITE32(0);
-		WRITE32(stat.atime.tv_sec);
+		WRITE64((s64)stat.atime.tv_sec);
 		WRITE32(stat.atime.tv_nsec);
 	}
 	if (bmval1 & FATTR4_WORD1_TIME_DELTA) {
@@ -2386,15 +2379,13 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 	if (bmval1 & FATTR4_WORD1_TIME_METADATA) {
 		if ((buflen -= 12) < 0)
 			goto out_resource;
-		WRITE32(0);
-		WRITE32(stat.ctime.tv_sec);
+		WRITE64((s64)stat.ctime.tv_sec);
 		WRITE32(stat.ctime.tv_nsec);
 	}
 	if (bmval1 & FATTR4_WORD1_TIME_MODIFY) {
 		if ((buflen -= 12) < 0)
 			goto out_resource;
-		WRITE32(0);
-		WRITE32(stat.mtime.tv_sec);
+		WRITE64((s64)stat.mtime.tv_sec);
 		WRITE32(stat.mtime.tv_nsec);
 	}
 	if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {

From 7073ea8799a8cf73db60270986f14e4aae20fa80 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 21 Feb 2013 10:14:22 -0500
Subject: [PATCH 46/60] SUNRPC: attempt AF_LOCAL connect on setup

In the gss-proxy case, setup time is when I know I'll have the right
namespace for the connect.

In other cases, it might be useful to get any connection errors
earlier--though actually in practice it doesn't make any difference for
rpcbind.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 net/sunrpc/xprtsock.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 9c2825827dec..ffd50348a509 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2655,6 +2655,9 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args)
 		}
 		xprt_set_bound(xprt);
 		xs_format_peer_addresses(xprt, "local", RPCBIND_NETID_LOCAL);
+		ret = ERR_PTR(xs_local_setup_socket(transport));
+		if (ret)
+			goto out_err;
 		break;
 	default:
 		ret = ERR_PTR(-EAFNOSUPPORT);

From 33d90ac0581ce81d1ebfc51918a2757e41a6011c Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 11 Apr 2013 15:06:36 -0400
Subject: [PATCH 47/60] SUNRPC: allow disabling idle timeout

In the gss-proxy case we don't want to have to reconnect at random--we
want to connect only on gss-proxy startup when we can steal gss-proxy's
context to do the connect in the right namespace.

So, provide a flag that allows the rpc_create caller to turn off the
idle timeout.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/clnt.h | 1 +
 include/linux/sunrpc/xprt.h | 1 +
 net/sunrpc/clnt.c           | 2 ++
 net/sunrpc/xprt.c           | 2 ++
 4 files changed, 6 insertions(+)

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index e7d492ce7c18..bfe11be81f6f 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -125,6 +125,7 @@ struct rpc_create_args {
 #define RPC_CLNT_CREATE_DISCRTRY	(1UL << 5)
 #define RPC_CLNT_CREATE_QUIET		(1UL << 6)
 #define RPC_CLNT_CREATE_INFINITE_SLOTS	(1UL << 7)
+#define RPC_CLNT_CREATE_NO_IDLE_TIMEOUT	(1UL << 8)
 
 struct rpc_clnt *rpc_create(struct rpc_create_args *args);
 struct rpc_clnt	*rpc_bind_new_program(struct rpc_clnt *,
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index ff5392421cb2..cec7b9b5e1bf 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -256,6 +256,7 @@ static inline int bc_prealloc(struct rpc_rqst *req)
 #endif /* CONFIG_SUNRPC_BACKCHANNEL */
 
 #define XPRT_CREATE_INFINITE_SLOTS	(1U)
+#define XPRT_CREATE_NO_IDLE_TIMEOUT	(1U << 1)
 
 struct xprt_create {
 	int			ident;		/* XPRT_TRANSPORT identifier */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 651245aa829a..80cf23241da9 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -416,6 +416,8 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
 
 	if (args->flags & RPC_CLNT_CREATE_INFINITE_SLOTS)
 		xprtargs.flags |= XPRT_CREATE_INFINITE_SLOTS;
+	if (args->flags & RPC_CLNT_CREATE_NO_IDLE_TIMEOUT)
+		xprtargs.flags |= XPRT_CREATE_NO_IDLE_TIMEOUT;
 	/*
 	 * If the caller chooses not to specify a hostname, whip
 	 * up a string representation of the passed-in address.
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 745fca3cfd36..095363eee764 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1300,6 +1300,8 @@ struct rpc_xprt *xprt_create_transport(struct xprt_create *args)
 				-PTR_ERR(xprt));
 		goto out;
 	}
+	if (args->flags & XPRT_CREATE_NO_IDLE_TIMEOUT)
+		xprt->idle_timeout = 0;
 	INIT_WORK(&xprt->task_cleanup, xprt_autoclose);
 	if (xprt_has_timer(xprt))
 		setup_timer(&xprt->timer, xprt_init_autodisconnect,

From 400f26b542e86995662a0cc5483656b7b1f42af6 Mon Sep 17 00:00:00 2001
From: Simo Sorce <simo@redhat.com>
Date: Fri, 25 May 2012 18:09:53 -0400
Subject: [PATCH 48/60] SUNRPC: conditionally return endtime from
 import_sec_context

We expose this parameter for a future caller.
It will be used to extract the endtime from the gss-proxy upcall mechanism,
in order to set the rsc cache expiration time.

Signed-off-by: Simo Sorce <simo@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/gss_api.h        | 2 ++
 net/sunrpc/auth_gss/auth_gss.c        | 2 +-
 net/sunrpc/auth_gss/gss_krb5_mech.c   | 7 +++++--
 net/sunrpc/auth_gss/gss_mech_switch.c | 5 +++--
 net/sunrpc/auth_gss/svcauth_gss.c     | 3 ++-
 5 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/include/linux/sunrpc/gss_api.h b/include/linux/sunrpc/gss_api.h
index a19e2547ae6a..04d03bb2de5d 100644
--- a/include/linux/sunrpc/gss_api.h
+++ b/include/linux/sunrpc/gss_api.h
@@ -37,6 +37,7 @@ int gss_import_sec_context(
 		size_t			bufsize,
 		struct gss_api_mech	*mech,
 		struct gss_ctx		**ctx_id,
+		time_t			*endtime,
 		gfp_t			gfp_mask);
 u32 gss_get_mic(
 		struct gss_ctx		*ctx_id,
@@ -92,6 +93,7 @@ struct gss_api_ops {
 			const void		*input_token,
 			size_t			bufsize,
 			struct gss_ctx		*ctx_id,
+			time_t			*endtime,
 			gfp_t			gfp_mask);
 	u32 (*gss_get_mic)(
 			struct gss_ctx		*ctx_id,
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 5257d2982ba5..23563e783ec2 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -238,7 +238,7 @@ gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct
 		p = ERR_PTR(-EFAULT);
 		goto err;
 	}
-	ret = gss_import_sec_context(p, seclen, gm, &ctx->gc_gss_ctx, GFP_NOFS);
+	ret = gss_import_sec_context(p, seclen, gm, &ctx->gc_gss_ctx, NULL, GFP_NOFS);
 	if (ret < 0) {
 		p = ERR_PTR(ret);
 		goto err;
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index d3611f11a8df..3bc4a23938ea 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -679,6 +679,7 @@ gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx,
 static int
 gss_import_sec_context_kerberos(const void *p, size_t len,
 				struct gss_ctx *ctx_id,
+				time_t *endtime,
 				gfp_t gfp_mask)
 {
 	const void *end = (const void *)((const char *)p + len);
@@ -694,9 +695,11 @@ gss_import_sec_context_kerberos(const void *p, size_t len,
 	else
 		ret = gss_import_v2_context(p, end, ctx, gfp_mask);
 
-	if (ret == 0)
+	if (ret == 0) {
 		ctx_id->internal_ctx_id = ctx;
-	else
+		if (endtime)
+			*endtime = ctx->endtime;
+	} else
 		kfree(ctx);
 
 	dprintk("RPC:       %s: returning %d\n", __func__, ret);
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c
index f0f4eee63a35..43fd5bbf92c6 100644
--- a/net/sunrpc/auth_gss/gss_mech_switch.c
+++ b/net/sunrpc/auth_gss/gss_mech_switch.c
@@ -325,14 +325,15 @@ int
 gss_import_sec_context(const void *input_token, size_t bufsize,
 		       struct gss_api_mech	*mech,
 		       struct gss_ctx		**ctx_id,
+		       time_t			*endtime,
 		       gfp_t gfp_mask)
 {
 	if (!(*ctx_id = kzalloc(sizeof(**ctx_id), gfp_mask)))
 		return -ENOMEM;
 	(*ctx_id)->mech_type = gss_mech_get(mech);
 
-	return mech->gm_ops
-		->gss_import_sec_context(input_token, bufsize, *ctx_id, gfp_mask);
+	return mech->gm_ops->gss_import_sec_context(input_token, bufsize,
+						*ctx_id, endtime, gfp_mask);
 }
 
 /* gss_get_mic: compute a mic over message and return mic_token. */
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 5ead60550895..20eedecc35f8 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -497,7 +497,8 @@ static int rsc_parse(struct cache_detail *cd,
 		len = qword_get(&mesg, buf, mlen);
 		if (len < 0)
 			goto out;
-		status = gss_import_sec_context(buf, len, gm, &rsci.mechctx, GFP_KERNEL);
+		status = gss_import_sec_context(buf, len, gm, &rsci.mechctx,
+						NULL, GFP_KERNEL);
 		if (status)
 			goto out;
 

From 1d658336b05f8697d6445834f8867f8ad5e4f735 Mon Sep 17 00:00:00 2001
From: Simo Sorce <simo@redhat.com>
Date: Fri, 25 May 2012 18:09:55 -0400
Subject: [PATCH 49/60] SUNRPC: Add RPC based upcall mechanism for RPCGSS auth

This patch implements a sunrpc client to use the services of the gssproxy
userspace daemon.

In particular it allows to perform calls in user space using an RPC
call instead of custom hand-coded upcall/downcall messages.

Currently only accept_sec_context is implemented as that is all is needed for
the server case.

File server modules like NFS and CIFS can use full gssapi services this way,
once init_sec_context is also implemented.

For the NFS server case this code allow to lift the limit of max 2k krb5
tickets. This limit is prevents legitimate kerberos deployments from using krb5
authentication with the Linux NFS server as they have normally ticket that are
many kilobytes large.

It will also allow to lift the limitation on the size of the credential set
(uid,gid,gids) passed down from user space for users that have very many groups
associated. Currently the downcall mechanism used by rpc.svcgssd is limited
to around 2k secondary groups of the 65k allowed by kernel structures.

Signed-off-by: Simo Sorce <simo@redhat.com>
[bfields: containerization, concurrent upcalls, misc. fixes and cleanup]
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 net/sunrpc/auth_gss/Makefile         |   3 +-
 net/sunrpc/auth_gss/gss_rpc_upcall.c | 355 ++++++++++++
 net/sunrpc/auth_gss/gss_rpc_upcall.h |  47 ++
 net/sunrpc/auth_gss/gss_rpc_xdr.c    | 832 +++++++++++++++++++++++++++
 net/sunrpc/auth_gss/gss_rpc_xdr.h    | 264 +++++++++
 net/sunrpc/clnt.c                    |   1 +
 net/sunrpc/netns.h                   |   3 +
 7 files changed, 1504 insertions(+), 1 deletion(-)
 create mode 100644 net/sunrpc/auth_gss/gss_rpc_upcall.c
 create mode 100644 net/sunrpc/auth_gss/gss_rpc_upcall.h
 create mode 100644 net/sunrpc/auth_gss/gss_rpc_xdr.c
 create mode 100644 net/sunrpc/auth_gss/gss_rpc_xdr.h

diff --git a/net/sunrpc/auth_gss/Makefile b/net/sunrpc/auth_gss/Makefile
index 9e4cb59ef9f0..14e9e53e63d5 100644
--- a/net/sunrpc/auth_gss/Makefile
+++ b/net/sunrpc/auth_gss/Makefile
@@ -5,7 +5,8 @@
 obj-$(CONFIG_SUNRPC_GSS) += auth_rpcgss.o
 
 auth_rpcgss-y := auth_gss.o gss_generic_token.o \
-	gss_mech_switch.o svcauth_gss.o
+	gss_mech_switch.o svcauth_gss.o \
+	gss_rpc_upcall.o gss_rpc_xdr.o
 
 obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o
 
diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c
new file mode 100644
index 000000000000..2d33ddfe74e5
--- /dev/null
+++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c
@@ -0,0 +1,355 @@
+/*
+ *  linux/net/sunrpc/gss_rpc_upcall.c
+ *
+ *  Copyright (C) 2012 Simo Sorce <simo@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/types.h>
+#include <linux/un.h>
+
+#include <linux/sunrpc/svcauth.h>
+#include "gss_rpc_upcall.h"
+
+#define GSSPROXY_SOCK_PATHNAME	"/var/run/gssproxy.sock"
+
+#define GSSPROXY_PROGRAM	(400112u)
+#define GSSPROXY_VERS_1		(1u)
+
+/*
+ * Encoding/Decoding functions
+ */
+
+enum {
+	GSSX_NULL = 0,	/* Unused */
+        GSSX_INDICATE_MECHS = 1,
+        GSSX_GET_CALL_CONTEXT = 2,
+        GSSX_IMPORT_AND_CANON_NAME = 3,
+        GSSX_EXPORT_CRED = 4,
+        GSSX_IMPORT_CRED = 5,
+        GSSX_ACQUIRE_CRED = 6,
+        GSSX_STORE_CRED = 7,
+        GSSX_INIT_SEC_CONTEXT = 8,
+        GSSX_ACCEPT_SEC_CONTEXT = 9,
+        GSSX_RELEASE_HANDLE = 10,
+        GSSX_GET_MIC = 11,
+        GSSX_VERIFY = 12,
+        GSSX_WRAP = 13,
+        GSSX_UNWRAP = 14,
+        GSSX_WRAP_SIZE_LIMIT = 15,
+};
+
+#define PROC(proc, name)				\
+[GSSX_##proc] = {					\
+	.p_proc   = GSSX_##proc,			\
+	.p_encode = (kxdreproc_t)gssx_enc_##name,	\
+	.p_decode = (kxdrdproc_t)gssx_dec_##name,	\
+	.p_arglen = GSSX_ARG_##name##_sz,		\
+	.p_replen = GSSX_RES_##name##_sz, 		\
+	.p_statidx = GSSX_##proc,			\
+	.p_name   = #proc,				\
+}
+
+struct rpc_procinfo gssp_procedures[] = {
+	PROC(INDICATE_MECHS, indicate_mechs),
+        PROC(GET_CALL_CONTEXT, get_call_context),
+        PROC(IMPORT_AND_CANON_NAME, import_and_canon_name),
+        PROC(EXPORT_CRED, export_cred),
+        PROC(IMPORT_CRED, import_cred),
+        PROC(ACQUIRE_CRED, acquire_cred),
+        PROC(STORE_CRED, store_cred),
+        PROC(INIT_SEC_CONTEXT, init_sec_context),
+        PROC(ACCEPT_SEC_CONTEXT, accept_sec_context),
+        PROC(RELEASE_HANDLE, release_handle),
+        PROC(GET_MIC, get_mic),
+        PROC(VERIFY, verify),
+        PROC(WRAP, wrap),
+        PROC(UNWRAP, unwrap),
+        PROC(WRAP_SIZE_LIMIT, wrap_size_limit),
+};
+
+
+
+/*
+ * Common transport functions
+ */
+
+static const struct rpc_program gssp_program;
+
+static int gssp_rpc_create(struct net *net, struct rpc_clnt **_clnt)
+{
+	static const struct sockaddr_un gssp_localaddr = {
+		.sun_family		= AF_LOCAL,
+		.sun_path		= GSSPROXY_SOCK_PATHNAME,
+	};
+	struct rpc_create_args args = {
+		.net		= net,
+		.protocol	= XPRT_TRANSPORT_LOCAL,
+		.address	= (struct sockaddr *)&gssp_localaddr,
+		.addrsize	= sizeof(gssp_localaddr),
+		.servername	= "localhost",
+		.program	= &gssp_program,
+		.version	= GSSPROXY_VERS_1,
+		.authflavor	= RPC_AUTH_NULL,
+		/*
+		 * Note we want connection to be done in the caller's
+		 * filesystem namespace.  We therefore turn off the idle
+		 * timeout, which would result in reconnections being
+		 * done without the correct namespace:
+		 */
+		.flags		= RPC_CLNT_CREATE_NOPING |
+				  RPC_CLNT_CREATE_NO_IDLE_TIMEOUT
+	};
+	struct rpc_clnt *clnt;
+	int result = 0;
+
+	clnt = rpc_create(&args);
+	if (IS_ERR(clnt)) {
+		dprintk("RPC:       failed to create AF_LOCAL gssproxy "
+				"client (errno %ld).\n", PTR_ERR(clnt));
+		result = -PTR_ERR(clnt);
+		*_clnt = NULL;
+		goto out;
+	}
+
+	dprintk("RPC:       created new gssp local client (gssp_local_clnt: "
+			"%p)\n", clnt);
+	*_clnt = clnt;
+
+out:
+	return result;
+}
+
+void init_gssp_clnt(struct sunrpc_net *sn)
+{
+	mutex_init(&sn->gssp_lock);
+	sn->gssp_clnt = NULL;
+}
+
+int set_gssp_clnt(struct net *net)
+{
+	struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+	struct rpc_clnt *clnt;
+	int ret;
+
+	mutex_lock(&sn->gssp_lock);
+	ret = gssp_rpc_create(net, &clnt);
+	if (!ret) {
+		if (sn->gssp_clnt)
+			rpc_shutdown_client(sn->gssp_clnt);
+		sn->gssp_clnt = clnt;
+	}
+	mutex_unlock(&sn->gssp_lock);
+	return ret;
+}
+
+void clear_gssp_clnt(struct sunrpc_net *sn)
+{
+	mutex_lock(&sn->gssp_lock);
+	if (sn->gssp_clnt) {
+		rpc_shutdown_client(sn->gssp_clnt);
+		sn->gssp_clnt = NULL;
+	}
+	mutex_unlock(&sn->gssp_lock);
+}
+
+static struct rpc_clnt *get_gssp_clnt(struct sunrpc_net *sn)
+{
+	struct rpc_clnt *clnt;
+
+	mutex_lock(&sn->gssp_lock);
+	clnt = sn->gssp_clnt;
+	if (clnt)
+		atomic_inc(&clnt->cl_count);
+	mutex_unlock(&sn->gssp_lock);
+	return clnt;
+}
+
+static int gssp_call(struct net *net, struct rpc_message *msg)
+{
+	struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+	struct rpc_clnt *clnt;
+	int status;
+
+	clnt = get_gssp_clnt(sn);
+	if (!clnt)
+		return -EIO;
+	status = rpc_call_sync(clnt, msg, 0);
+	if (status < 0) {
+		dprintk("gssp: rpc_call returned error %d\n", -status);
+		switch (status) {
+		case -EPROTONOSUPPORT:
+			status = -EINVAL;
+			break;
+		case -ECONNREFUSED:
+		case -ETIMEDOUT:
+		case -ENOTCONN:
+			status = -EAGAIN;
+			break;
+		case -ERESTARTSYS:
+			if (signalled ())
+				status = -EINTR;
+			break;
+		default:
+			break;
+		}
+	}
+	rpc_release_client(clnt);
+	return status;
+}
+
+
+/*
+ * Public functions
+ */
+
+/* numbers somewhat arbitrary but large enough for current needs */
+#define GSSX_MAX_OUT_HANDLE	128
+#define GSSX_MAX_MECH_OID	16
+#define GSSX_MAX_SRC_PRINC	256
+#define GSSX_KMEMBUF (GSSX_max_output_handle_sz + \
+			GSSX_max_oid_sz + \
+			GSSX_max_princ_sz + \
+			sizeof(struct svc_cred))
+
+int gssp_accept_sec_context_upcall(struct net *net,
+				struct gssp_upcall_data *data)
+{
+	struct gssx_ctx ctxh = {
+		.state = data->in_handle
+	};
+	struct gssx_arg_accept_sec_context arg = {
+		.input_token = data->in_token,
+	};
+	struct gssx_ctx rctxh = {
+		/*
+		 * pass in the max length we expect for each of these
+		 * buffers but let the xdr code kmalloc them:
+		 */
+		.exported_context_token.len = GSSX_max_output_handle_sz,
+		.mech.len = GSSX_max_oid_sz,
+		.src_name.display_name.len = GSSX_max_princ_sz
+	};
+	struct gssx_res_accept_sec_context res = {
+		.context_handle = &rctxh,
+		.output_token = &data->out_token
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &gssp_procedures[GSSX_ACCEPT_SEC_CONTEXT],
+		.rpc_argp = &arg,
+		.rpc_resp = &res,
+		.rpc_cred = NULL, /* FIXME ? */
+	};
+	struct xdr_netobj client_name = { 0 , NULL };
+	int ret;
+
+	if (data->in_handle.len != 0)
+		arg.context_handle = &ctxh;
+	res.output_token->len = GSSX_max_output_token_sz;
+
+	/* use nfs/ for targ_name ? */
+
+	ret = gssp_call(net, &msg);
+
+	/* we need to fetch all data even in case of error so
+	 * that we can free special strctures is they have been allocated */
+	data->major_status = res.status.major_status;
+	data->minor_status = res.status.minor_status;
+	if (res.context_handle) {
+		data->out_handle = rctxh.exported_context_token;
+		data->mech_oid = rctxh.mech;
+		client_name = rctxh.src_name.display_name;
+	}
+
+	if (res.options.count == 1) {
+		gssx_buffer *value = &res.options.data[0].value;
+		/* Currently we only decode CREDS_VALUE, if we add
+		 * anything else we'll have to loop and match on the
+		 * option name */
+		if (value->len == 1) {
+			/* steal group info from struct svc_cred */
+			data->creds = *(struct svc_cred *)value->data;
+			data->found_creds = 1;
+		}
+		/* whether we use it or not, free data */
+		kfree(value->data);
+	}
+
+	if (res.options.count != 0) {
+		kfree(res.options.data);
+	}
+
+	/* convert to GSS_NT_HOSTBASED_SERVICE form and set into creds */
+	if (data->found_creds && client_name.data != NULL) {
+		char *c;
+
+		data->creds.cr_principal = kstrndup(client_name.data,
+						client_name.len, GFP_KERNEL);
+		if (data->creds.cr_principal) {
+			/* terminate and remove realm part */
+			c = strchr(data->creds.cr_principal, '@');
+			if (c) {
+				*c = '\0';
+
+				/* change service-hostname delimiter */
+				c = strchr(data->creds.cr_principal, '/');
+				if (c) *c = '@';
+			}
+			if (!c) {
+				/* not a service principal */
+				kfree(data->creds.cr_principal);
+				data->creds.cr_principal = NULL;
+			}
+		}
+	}
+	kfree(client_name.data);
+
+	return ret;
+}
+
+void gssp_free_upcall_data(struct gssp_upcall_data *data)
+{
+	kfree(data->in_handle.data);
+	kfree(data->out_handle.data);
+	kfree(data->out_token.data);
+	kfree(data->mech_oid.data);
+	free_svc_cred(&data->creds);
+}
+
+/*
+ * Initialization stuff
+ */
+
+static const struct rpc_version gssp_version1 = {
+	.number		= GSSPROXY_VERS_1,
+	.nrprocs	= ARRAY_SIZE(gssp_procedures),
+	.procs		= gssp_procedures,
+};
+
+static const struct rpc_version *gssp_version[] = {
+	NULL,
+	&gssp_version1,
+};
+
+static struct rpc_stat gssp_stats;
+
+static const struct rpc_program gssp_program = {
+	.name		= "gssproxy",
+	.number		= GSSPROXY_PROGRAM,
+	.nrvers		= ARRAY_SIZE(gssp_version),
+	.version	= gssp_version,
+	.stats		= &gssp_stats,
+};
diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.h b/net/sunrpc/auth_gss/gss_rpc_upcall.h
new file mode 100644
index 000000000000..4c2caaa7e84e
--- /dev/null
+++ b/net/sunrpc/auth_gss/gss_rpc_upcall.h
@@ -0,0 +1,47 @@
+/*
+ *  linux/net/sunrpc/gss_rpc_upcall.h
+ *
+ *  Copyright (C) 2012 Simo Sorce <simo@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _GSS_RPC_UPCALL_H
+#define _GSS_RPC_UPCALL_H
+
+#include <linux/sunrpc/auth_gss.h>
+#include "gss_rpc_xdr.h"
+#include "../netns.h"
+
+struct gssp_upcall_data {
+	struct xdr_netobj in_handle;
+	struct gssp_in_token in_token;
+	struct xdr_netobj out_handle;
+	struct xdr_netobj out_token;
+	struct xdr_netobj mech_oid;
+	struct svc_cred creds;
+	int found_creds;
+	int major_status;
+	int minor_status;
+};
+
+int gssp_accept_sec_context_upcall(struct net *net,
+				struct gssp_upcall_data *data);
+void gssp_free_upcall_data(struct gssp_upcall_data *data);
+
+void init_gssp_clnt(struct sunrpc_net *);
+int set_gssp_clnt(struct net *);
+void clear_gssp_clnt(struct sunrpc_net *);
+#endif /* _GSS_RPC_UPCALL_H */
diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c
new file mode 100644
index 000000000000..d0ccdffa7e54
--- /dev/null
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c
@@ -0,0 +1,832 @@
+/*
+ * GSS Proxy upcall module
+ *
+ *  Copyright (C) 2012 Simo Sorce <simo@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/sunrpc/svcauth.h>
+#include "gss_rpc_xdr.h"
+
+static bool gssx_check_pointer(struct xdr_stream *xdr)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 4);
+	if (unlikely(p == NULL))
+		return -ENOSPC;
+	return *p?true:false;
+}
+
+static int gssx_enc_bool(struct xdr_stream *xdr, int v)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 4);
+	if (unlikely(p == NULL))
+		return -ENOSPC;
+	*p = v ? xdr_one : xdr_zero;
+	return 0;
+}
+
+static int gssx_dec_bool(struct xdr_stream *xdr, u32 *v)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		return -ENOSPC;
+	*v = be32_to_cpu(*p);
+	return 0;
+}
+
+static int gssx_enc_buffer(struct xdr_stream *xdr,
+			   gssx_buffer *buf)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, sizeof(u32) + buf->len);
+	if (!p)
+		return -ENOSPC;
+	xdr_encode_opaque(p, buf->data, buf->len);
+	return 0;
+}
+
+static int gssx_enc_in_token(struct xdr_stream *xdr,
+			     struct gssp_in_token *in)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 4);
+	if (!p)
+		return -ENOSPC;
+	*p = cpu_to_be32(in->page_len);
+
+	/* all we need to do is to write pages */
+	xdr_write_pages(xdr, in->pages, in->page_base, in->page_len);
+
+	return 0;
+}
+
+
+static int gssx_dec_buffer(struct xdr_stream *xdr,
+			   gssx_buffer *buf)
+{
+	u32 length;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		return -ENOSPC;
+
+	length = be32_to_cpup(p);
+	p = xdr_inline_decode(xdr, length);
+	if (unlikely(p == NULL))
+		return -ENOSPC;
+
+	if (buf->len == 0) {
+		/* we intentionally are not interested in this buffer */
+		return 0;
+	}
+	if (length > buf->len)
+		return -ENOSPC;
+
+	if (!buf->data) {
+		buf->data = kmemdup(p, length, GFP_KERNEL);
+		if (!buf->data)
+			return -ENOMEM;
+	} else {
+		memcpy(buf->data, p, length);
+	}
+	buf->len = length;
+	return 0;
+}
+
+static int gssx_enc_option(struct xdr_stream *xdr,
+			   struct gssx_option *opt)
+{
+	int err;
+
+	err = gssx_enc_buffer(xdr, &opt->option);
+	if (err)
+		return err;
+	err = gssx_enc_buffer(xdr, &opt->value);
+	return err;
+}
+
+static int gssx_dec_option(struct xdr_stream *xdr,
+			   struct gssx_option *opt)
+{
+	int err;
+
+	err = gssx_dec_buffer(xdr, &opt->option);
+	if (err)
+		return err;
+	err = gssx_dec_buffer(xdr, &opt->value);
+	return err;
+}
+
+static int dummy_enc_opt_array(struct xdr_stream *xdr,
+				struct gssx_option_array *oa)
+{
+	__be32 *p;
+
+	if (oa->count != 0)
+		return -EINVAL;
+
+	p = xdr_reserve_space(xdr, 4);
+	if (!p)
+		return -ENOSPC;
+	*p = 0;
+
+	return 0;
+}
+
+static int dummy_dec_opt_array(struct xdr_stream *xdr,
+				struct gssx_option_array *oa)
+{
+	struct gssx_option dummy;
+	u32 count, i;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		return -ENOSPC;
+	count = be32_to_cpup(p++);
+	memset(&dummy, 0, sizeof(dummy));
+	for (i = 0; i < count; i++) {
+		gssx_dec_option(xdr, &dummy);
+	}
+
+	oa->count = 0;
+	oa->data = NULL;
+	return 0;
+}
+
+static int get_s32(void **p, void *max, s32 *res)
+{
+	void *base = *p;
+	void *next = (void *)((char *)base + sizeof(s32));
+	if (unlikely(next > max || next < base))
+		return -EINVAL;
+	memcpy(res, base, sizeof(s32));
+	*p = next;
+	return 0;
+}
+
+static int gssx_dec_linux_creds(struct xdr_stream *xdr,
+				struct svc_cred *creds)
+{
+	u32 length;
+	__be32 *p;
+	void *q, *end;
+	s32 tmp;
+	int N, i, err;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		return -ENOSPC;
+
+	length = be32_to_cpup(p);
+
+	/* FIXME: we do not want to use the scratch buffer for this one
+	 * may need to use functions that allows us to access an io vector
+	 * directly */
+	p = xdr_inline_decode(xdr, length);
+	if (unlikely(p == NULL))
+		return -ENOSPC;
+
+	q = p;
+	end = q + length;
+
+	/* uid */
+	err = get_s32(&q, end, &tmp);
+	if (err)
+		return err;
+	creds->cr_uid = tmp;
+
+	/* gid */
+	err = get_s32(&q, end, &tmp);
+	if (err)
+		return err;
+	creds->cr_gid = tmp;
+
+	/* number of additional gid's */
+	err = get_s32(&q, end, &tmp);
+	if (err)
+		return err;
+	N = tmp;
+	creds->cr_group_info = groups_alloc(N);
+	if (creds->cr_group_info == NULL)
+		return -ENOMEM;
+
+	/* gid's */
+	for (i = 0; i < N; i++) {
+		err = get_s32(&q, end, &tmp);
+		if (err) {
+			groups_free(creds->cr_group_info);
+			return err;
+		}
+		GROUP_AT(creds->cr_group_info, i) = tmp;
+	}
+
+	return 0;
+}
+
+static int gssx_dec_option_array(struct xdr_stream *xdr,
+				 struct gssx_option_array *oa)
+{
+	struct svc_cred *creds;
+	u32 count, i;
+	__be32 *p;
+	int err;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		return -ENOSPC;
+	count = be32_to_cpup(p++);
+	if (count != 0) {
+		/* we recognize only 1 currently: CREDS_VALUE */
+		oa->count = 1;
+
+		oa->data = kmalloc(sizeof(struct gssx_option), GFP_KERNEL);
+		if (!oa->data)
+			return -ENOMEM;
+
+		creds = kmalloc(sizeof(struct svc_cred), GFP_KERNEL);
+		if (!creds) {
+			kfree(oa->data);
+			return -ENOMEM;
+		}
+
+		oa->data[0].option.data = CREDS_VALUE;
+		oa->data[0].option.len = sizeof(CREDS_VALUE);
+		oa->data[0].value.data = (void *)creds;
+		oa->data[0].value.len = 0;
+	}
+	for (i = 0; i < count; i++) {
+		gssx_buffer dummy = { 0, NULL };
+		u32 length;
+
+		/* option buffer */
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(p == NULL))
+			return -ENOSPC;
+
+		length = be32_to_cpup(p);
+		p = xdr_inline_decode(xdr, length);
+		if (unlikely(p == NULL))
+			return -ENOSPC;
+
+		if (length == sizeof(CREDS_VALUE) &&
+		    memcmp(p, CREDS_VALUE, sizeof(CREDS_VALUE)) == 0) {
+			/* We have creds here. parse them */
+			err = gssx_dec_linux_creds(xdr, creds);
+			if (err)
+				return err;
+			oa->data[0].value.len = 1; /* presence */
+		} else {
+			/* consume uninteresting buffer */
+			err = gssx_dec_buffer(xdr, &dummy);
+			if (err)
+				return err;
+		}
+	}
+	return 0;
+}
+
+static int gssx_dec_status(struct xdr_stream *xdr,
+			   struct gssx_status *status)
+{
+	__be32 *p;
+	int err;
+
+	/* status->major_status */
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(p == NULL))
+		return -ENOSPC;
+	p = xdr_decode_hyper(p, &status->major_status);
+
+	/* status->mech */
+	err = gssx_dec_buffer(xdr, &status->mech);
+	if (err)
+		return err;
+
+	/* status->minor_status */
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(p == NULL))
+		return -ENOSPC;
+	p = xdr_decode_hyper(p, &status->minor_status);
+
+	/* status->major_status_string */
+	err = gssx_dec_buffer(xdr, &status->major_status_string);
+	if (err)
+		return err;
+
+	/* status->minor_status_string */
+	err = gssx_dec_buffer(xdr, &status->minor_status_string);
+	if (err)
+		return err;
+
+	/* status->server_ctx */
+	err = gssx_dec_buffer(xdr, &status->server_ctx);
+	if (err)
+		return err;
+
+	/* we assume we have no options for now, so simply consume them */
+	/* status->options */
+	err = dummy_dec_opt_array(xdr, &status->options);
+
+	return err;
+}
+
+static int gssx_enc_call_ctx(struct xdr_stream *xdr,
+			     struct gssx_call_ctx *ctx)
+{
+	struct gssx_option opt;
+	__be32 *p;
+	int err;
+
+	/* ctx->locale */
+	err = gssx_enc_buffer(xdr, &ctx->locale);
+	if (err)
+		return err;
+
+	/* ctx->server_ctx */
+	err = gssx_enc_buffer(xdr, &ctx->server_ctx);
+	if (err)
+		return err;
+
+	/* we always want to ask for lucid contexts */
+	/* ctx->options */
+	p = xdr_reserve_space(xdr, 4);
+	*p = cpu_to_be32(2);
+
+	/* we want a lucid_v1 context */
+	opt.option.data = LUCID_OPTION;
+	opt.option.len = sizeof(LUCID_OPTION);
+	opt.value.data = LUCID_VALUE;
+	opt.value.len = sizeof(LUCID_VALUE);
+	err = gssx_enc_option(xdr, &opt);
+
+	/* ..and user creds */
+	opt.option.data = CREDS_OPTION;
+	opt.option.len = sizeof(CREDS_OPTION);
+	opt.value.data = CREDS_VALUE;
+	opt.value.len = sizeof(CREDS_VALUE);
+	err = gssx_enc_option(xdr, &opt);
+
+	return err;
+}
+
+static int gssx_dec_name_attr(struct xdr_stream *xdr,
+			     struct gssx_name_attr *attr)
+{
+	int err;
+
+	/* attr->attr */
+	err = gssx_dec_buffer(xdr, &attr->attr);
+	if (err)
+		return err;
+
+	/* attr->value */
+	err = gssx_dec_buffer(xdr, &attr->value);
+	if (err)
+		return err;
+
+	/* attr->extensions */
+	err = dummy_dec_opt_array(xdr, &attr->extensions);
+
+	return err;
+}
+
+static int dummy_enc_nameattr_array(struct xdr_stream *xdr,
+				    struct gssx_name_attr_array *naa)
+{
+	__be32 *p;
+
+	if (naa->count != 0)
+		return -EINVAL;
+
+	p = xdr_reserve_space(xdr, 4);
+	if (!p)
+		return -ENOSPC;
+	*p = 0;
+
+	return 0;
+}
+
+static int dummy_dec_nameattr_array(struct xdr_stream *xdr,
+				    struct gssx_name_attr_array *naa)
+{
+	struct gssx_name_attr dummy;
+	u32 count, i;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		return -ENOSPC;
+	count = be32_to_cpup(p++);
+	for (i = 0; i < count; i++) {
+		gssx_dec_name_attr(xdr, &dummy);
+	}
+
+	naa->count = 0;
+	naa->data = NULL;
+	return 0;
+}
+
+static struct xdr_netobj zero_netobj = {};
+
+static struct gssx_name_attr_array zero_name_attr_array = {};
+
+static struct gssx_option_array zero_option_array = {};
+
+static int gssx_enc_name(struct xdr_stream *xdr,
+			 struct gssx_name *name)
+{
+	int err;
+
+	/* name->display_name */
+	err = gssx_enc_buffer(xdr, &name->display_name);
+	if (err)
+		return err;
+
+	/* name->name_type */
+	err = gssx_enc_buffer(xdr, &zero_netobj);
+	if (err)
+		return err;
+
+	/* name->exported_name */
+	err = gssx_enc_buffer(xdr, &zero_netobj);
+	if (err)
+		return err;
+
+	/* name->exported_composite_name */
+	err = gssx_enc_buffer(xdr, &zero_netobj);
+	if (err)
+		return err;
+
+	/* leave name_attributes empty for now, will add once we have any
+	 * to pass up at all */
+	/* name->name_attributes */
+	err = dummy_enc_nameattr_array(xdr, &zero_name_attr_array);
+	if (err)
+		return err;
+
+	/* leave options empty for now, will add once we have any options
+	 * to pass up at all */
+	/* name->extensions */
+	err = dummy_enc_opt_array(xdr, &zero_option_array);
+
+	return err;
+}
+
+static int gssx_dec_name(struct xdr_stream *xdr,
+			 struct gssx_name *name)
+{
+	struct xdr_netobj dummy_netobj;
+	struct gssx_name_attr_array dummy_name_attr_array;
+	struct gssx_option_array dummy_option_array;
+	int err;
+
+	/* name->display_name */
+	err = gssx_dec_buffer(xdr, &name->display_name);
+	if (err)
+		return err;
+
+	/* name->name_type */
+	err = gssx_dec_buffer(xdr, &dummy_netobj);
+	if (err)
+		return err;
+
+	/* name->exported_name */
+	err = gssx_dec_buffer(xdr, &dummy_netobj);
+	if (err)
+		return err;
+
+	/* name->exported_composite_name */
+	err = gssx_dec_buffer(xdr, &dummy_netobj);
+	if (err)
+		return err;
+
+	/* we assume we have no attributes for now, so simply consume them */
+	/* name->name_attributes */
+	err = dummy_dec_nameattr_array(xdr, &dummy_name_attr_array);
+	if (err)
+		return err;
+
+	/* we assume we have no options for now, so simply consume them */
+	/* name->extensions */
+	err = dummy_dec_opt_array(xdr, &dummy_option_array);
+
+	return err;
+}
+
+static int dummy_enc_credel_array(struct xdr_stream *xdr,
+				  struct gssx_cred_element_array *cea)
+{
+	__be32 *p;
+
+	if (cea->count != 0)
+		return -EINVAL;
+
+	p = xdr_reserve_space(xdr, 4);
+	if (!p)
+		return -ENOSPC;
+	*p = 0;
+
+	return 0;
+}
+
+static int gssx_enc_cred(struct xdr_stream *xdr,
+			 struct gssx_cred *cred)
+{
+	int err;
+
+	/* cred->desired_name */
+	err = gssx_enc_name(xdr, &cred->desired_name);
+	if (err)
+		return err;
+
+	/* cred->elements */
+	err = dummy_enc_credel_array(xdr, &cred->elements);
+
+	/* cred->cred_handle_reference */
+	err = gssx_enc_buffer(xdr, &cred->cred_handle_reference);
+	if (err)
+		return err;
+
+	/* cred->needs_release */
+	err = gssx_enc_bool(xdr, cred->needs_release);
+
+	return err;
+}
+
+static int gssx_enc_ctx(struct xdr_stream *xdr,
+			struct gssx_ctx *ctx)
+{
+	__be32 *p;
+	int err;
+
+	/* ctx->exported_context_token */
+	err = gssx_enc_buffer(xdr, &ctx->exported_context_token);
+	if (err)
+		return err;
+
+	/* ctx->state */
+	err = gssx_enc_buffer(xdr, &ctx->state);
+	if (err)
+		return err;
+
+	/* ctx->need_release */
+	err = gssx_enc_bool(xdr, ctx->need_release);
+	if (err)
+		return err;
+
+	/* ctx->mech */
+	err = gssx_enc_buffer(xdr, &ctx->mech);
+	if (err)
+		return err;
+
+	/* ctx->src_name */
+	err = gssx_enc_name(xdr, &ctx->src_name);
+	if (err)
+		return err;
+
+	/* ctx->targ_name */
+	err = gssx_enc_name(xdr, &ctx->targ_name);
+	if (err)
+		return err;
+
+	/* ctx->lifetime */
+	p = xdr_reserve_space(xdr, 8+8);
+	if (!p)
+		return -ENOSPC;
+	p = xdr_encode_hyper(p, ctx->lifetime);
+
+	/* ctx->ctx_flags */
+	p = xdr_encode_hyper(p, ctx->ctx_flags);
+
+	/* ctx->locally_initiated */
+	err = gssx_enc_bool(xdr, ctx->locally_initiated);
+	if (err)
+		return err;
+
+	/* ctx->open */
+	err = gssx_enc_bool(xdr, ctx->open);
+	if (err)
+		return err;
+
+	/* leave options empty for now, will add once we have any options
+	 * to pass up at all */
+	/* ctx->options */
+	err = dummy_enc_opt_array(xdr, &ctx->options);
+
+	return err;
+}
+
+static int gssx_dec_ctx(struct xdr_stream *xdr,
+			struct gssx_ctx *ctx)
+{
+	__be32 *p;
+	int err;
+
+	/* ctx->exported_context_token */
+	err = gssx_dec_buffer(xdr, &ctx->exported_context_token);
+	if (err)
+		return err;
+
+	/* ctx->state */
+	err = gssx_dec_buffer(xdr, &ctx->state);
+	if (err)
+		return err;
+
+	/* ctx->need_release */
+	err = gssx_dec_bool(xdr, &ctx->need_release);
+	if (err)
+		return err;
+
+	/* ctx->mech */
+	err = gssx_dec_buffer(xdr, &ctx->mech);
+	if (err)
+		return err;
+
+	/* ctx->src_name */
+	err = gssx_dec_name(xdr, &ctx->src_name);
+	if (err)
+		return err;
+
+	/* ctx->targ_name */
+	err = gssx_dec_name(xdr, &ctx->targ_name);
+	if (err)
+		return err;
+
+	/* ctx->lifetime */
+	p = xdr_inline_decode(xdr, 8+8);
+	if (unlikely(p == NULL))
+		return -ENOSPC;
+	p = xdr_decode_hyper(p, &ctx->lifetime);
+
+	/* ctx->ctx_flags */
+	p = xdr_decode_hyper(p, &ctx->ctx_flags);
+
+	/* ctx->locally_initiated */
+	err = gssx_dec_bool(xdr, &ctx->locally_initiated);
+	if (err)
+		return err;
+
+	/* ctx->open */
+	err = gssx_dec_bool(xdr, &ctx->open);
+	if (err)
+		return err;
+
+	/* we assume we have no options for now, so simply consume them */
+	/* ctx->options */
+	err = dummy_dec_opt_array(xdr, &ctx->options);
+
+	return err;
+}
+
+static int gssx_enc_cb(struct xdr_stream *xdr, struct gssx_cb *cb)
+{
+	__be32 *p;
+	int err;
+
+	/* cb->initiator_addrtype */
+	p = xdr_reserve_space(xdr, 8);
+	if (!p)
+		return -ENOSPC;
+	p = xdr_encode_hyper(p, cb->initiator_addrtype);
+
+	/* cb->initiator_address */
+	err = gssx_enc_buffer(xdr, &cb->initiator_address);
+	if (err)
+		return err;
+
+	/* cb->acceptor_addrtype */
+	p = xdr_reserve_space(xdr, 8);
+	if (!p)
+		return -ENOSPC;
+	p = xdr_encode_hyper(p, cb->acceptor_addrtype);
+
+	/* cb->acceptor_address */
+	err = gssx_enc_buffer(xdr, &cb->acceptor_address);
+	if (err)
+		return err;
+
+	/* cb->application_data */
+	err = gssx_enc_buffer(xdr, &cb->application_data);
+
+	return err;
+}
+
+void gssx_enc_accept_sec_context(struct rpc_rqst *req,
+				 struct xdr_stream *xdr,
+				 struct gssx_arg_accept_sec_context *arg)
+{
+	int err;
+
+	err = gssx_enc_call_ctx(xdr, &arg->call_ctx);
+	if (err)
+		goto done;
+
+	/* arg->context_handle */
+	if (arg->context_handle) {
+		err = gssx_enc_ctx(xdr, arg->context_handle);
+		if (err)
+			goto done;
+	} else {
+		err = gssx_enc_bool(xdr, 0);
+	}
+
+	/* arg->cred_handle */
+	if (arg->cred_handle) {
+		err = gssx_enc_cred(xdr, arg->cred_handle);
+		if (err)
+			goto done;
+	} else {
+		err = gssx_enc_bool(xdr, 0);
+	}
+
+	/* arg->input_token */
+	err = gssx_enc_in_token(xdr, &arg->input_token);
+	if (err)
+		goto done;
+
+	/* arg->input_cb */
+	if (arg->input_cb) {
+		err = gssx_enc_cb(xdr, arg->input_cb);
+		if (err)
+			goto done;
+	} else {
+		err = gssx_enc_bool(xdr, 0);
+	}
+
+	err = gssx_enc_bool(xdr, arg->ret_deleg_cred);
+	if (err)
+		goto done;
+
+	/* leave options empty for now, will add once we have any options
+	 * to pass up at all */
+	/* arg->options */
+	err = dummy_enc_opt_array(xdr, &arg->options);
+
+done:
+	if (err)
+		dprintk("RPC:       gssx_enc_accept_sec_context: %d\n", err);
+}
+
+int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp,
+				struct xdr_stream *xdr,
+				struct gssx_res_accept_sec_context *res)
+{
+	int err;
+
+	/* res->status */
+	err = gssx_dec_status(xdr, &res->status);
+	if (err)
+		return err;
+
+	/* res->context_handle */
+	if (gssx_check_pointer(xdr)) {
+		err = gssx_dec_ctx(xdr, res->context_handle);
+		if (err)
+			return err;
+	} else {
+		res->context_handle = NULL;
+	}
+
+	/* res->output_token */
+	if (gssx_check_pointer(xdr)) {
+		err = gssx_dec_buffer(xdr, res->output_token);
+		if (err)
+			return err;
+	} else {
+		res->output_token = NULL;
+	}
+
+	/* res->delegated_cred_handle */
+	if (gssx_check_pointer(xdr)) {
+		/* we do not support upcall servers sending this data. */
+		return -EINVAL;
+	}
+
+	/* res->options */
+	err = gssx_dec_option_array(xdr, &res->options);
+
+	return err;
+}
diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.h b/net/sunrpc/auth_gss/gss_rpc_xdr.h
new file mode 100644
index 000000000000..1c98b27d870c
--- /dev/null
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.h
@@ -0,0 +1,264 @@
+/*
+ * GSS Proxy upcall module
+ *
+ *  Copyright (C) 2012 Simo Sorce <simo@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _LINUX_GSS_RPC_XDR_H
+#define _LINUX_GSS_RPC_XDR_H
+
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/xprtsock.h>
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY	RPCDBG_AUTH
+#endif
+
+#define LUCID_OPTION "exported_context_type"
+#define LUCID_VALUE  "linux_lucid_v1"
+#define CREDS_OPTION "exported_creds_type"
+#define CREDS_VALUE  "linux_creds_v1"
+
+typedef struct xdr_netobj gssx_buffer;
+typedef struct xdr_netobj utf8string;
+typedef struct xdr_netobj gssx_OID;
+
+enum gssx_cred_usage {
+	GSSX_C_INITIATE = 1,
+	GSSX_C_ACCEPT = 2,
+	GSSX_C_BOTH = 3,
+};
+
+struct gssx_option {
+	gssx_buffer option;
+	gssx_buffer value;
+};
+
+struct gssx_option_array {
+	u32 count;
+	struct gssx_option *data;
+};
+
+struct gssx_status {
+	u64 major_status;
+	gssx_OID mech;
+	u64 minor_status;
+	utf8string major_status_string;
+	utf8string minor_status_string;
+	gssx_buffer server_ctx;
+	struct gssx_option_array options;
+};
+
+struct gssx_call_ctx {
+	utf8string locale;
+	gssx_buffer server_ctx;
+	struct gssx_option_array options;
+};
+
+struct gssx_name_attr {
+	gssx_buffer attr;
+	gssx_buffer value;
+	struct gssx_option_array extensions;
+};
+
+struct gssx_name_attr_array {
+	u32 count;
+	struct gssx_name_attr *data;
+};
+
+struct gssx_name {
+	gssx_buffer display_name;
+};
+typedef struct gssx_name gssx_name;
+
+struct gssx_cred_element {
+	gssx_name MN;
+	gssx_OID mech;
+	u32 cred_usage;
+	u64 initiator_time_rec;
+	u64 acceptor_time_rec;
+	struct gssx_option_array options;
+};
+
+struct gssx_cred_element_array {
+	u32 count;
+	struct gssx_cred_element *data;
+};
+
+struct gssx_cred {
+	gssx_name desired_name;
+	struct gssx_cred_element_array elements;
+	gssx_buffer cred_handle_reference;
+	u32 needs_release;
+};
+
+struct gssx_ctx {
+	gssx_buffer exported_context_token;
+	gssx_buffer state;
+	u32 need_release;
+	gssx_OID mech;
+	gssx_name src_name;
+	gssx_name targ_name;
+	u64 lifetime;
+	u64 ctx_flags;
+	u32 locally_initiated;
+	u32 open;
+	struct gssx_option_array options;
+};
+
+struct gssx_cb {
+	u64 initiator_addrtype;
+	gssx_buffer initiator_address;
+	u64 acceptor_addrtype;
+	gssx_buffer acceptor_address;
+	gssx_buffer application_data;
+};
+
+
+/* This structure is not defined in the protocol.
+ * It is used in the kernel to carry around a big buffer
+ * as a set of pages */
+struct gssp_in_token {
+	struct page **pages;	/* Array of contiguous pages */
+	unsigned int page_base;	/* Start of page data */
+	unsigned int page_len;	/* Length of page data */
+};
+
+struct gssx_arg_accept_sec_context {
+	struct gssx_call_ctx call_ctx;
+	struct gssx_ctx *context_handle;
+	struct gssx_cred *cred_handle;
+	struct gssp_in_token input_token;
+	struct gssx_cb *input_cb;
+	u32 ret_deleg_cred;
+	struct gssx_option_array options;
+};
+
+struct gssx_res_accept_sec_context {
+	struct gssx_status status;
+	struct gssx_ctx *context_handle;
+	gssx_buffer *output_token;
+	/* struct gssx_cred *delegated_cred_handle; not used in kernel */
+	struct gssx_option_array options;
+};
+
+
+
+#define gssx_enc_indicate_mechs NULL
+#define gssx_dec_indicate_mechs NULL
+#define gssx_enc_get_call_context NULL
+#define gssx_dec_get_call_context NULL
+#define gssx_enc_import_and_canon_name NULL
+#define gssx_dec_import_and_canon_name NULL
+#define gssx_enc_export_cred NULL
+#define gssx_dec_export_cred NULL
+#define gssx_enc_import_cred NULL
+#define gssx_dec_import_cred NULL
+#define gssx_enc_acquire_cred NULL
+#define gssx_dec_acquire_cred NULL
+#define gssx_enc_store_cred NULL
+#define gssx_dec_store_cred NULL
+#define gssx_enc_init_sec_context NULL
+#define gssx_dec_init_sec_context NULL
+void gssx_enc_accept_sec_context(struct rpc_rqst *req,
+				 struct xdr_stream *xdr,
+				 struct gssx_arg_accept_sec_context *args);
+int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp,
+				struct xdr_stream *xdr,
+				struct gssx_res_accept_sec_context *res);
+#define gssx_enc_release_handle NULL
+#define gssx_dec_release_handle NULL
+#define gssx_enc_get_mic NULL
+#define gssx_dec_get_mic NULL
+#define gssx_enc_verify NULL
+#define gssx_dec_verify NULL
+#define gssx_enc_wrap NULL
+#define gssx_dec_wrap NULL
+#define gssx_enc_unwrap NULL
+#define gssx_dec_unwrap NULL
+#define gssx_enc_wrap_size_limit NULL
+#define gssx_dec_wrap_size_limit NULL
+
+/* non implemented calls are set to 0 size */
+#define GSSX_ARG_indicate_mechs_sz 0
+#define GSSX_RES_indicate_mechs_sz 0
+#define GSSX_ARG_get_call_context_sz 0
+#define GSSX_RES_get_call_context_sz 0
+#define GSSX_ARG_import_and_canon_name_sz 0
+#define GSSX_RES_import_and_canon_name_sz 0
+#define GSSX_ARG_export_cred_sz 0
+#define GSSX_RES_export_cred_sz 0
+#define GSSX_ARG_import_cred_sz 0
+#define GSSX_RES_import_cred_sz 0
+#define GSSX_ARG_acquire_cred_sz 0
+#define GSSX_RES_acquire_cred_sz 0
+#define GSSX_ARG_store_cred_sz 0
+#define GSSX_RES_store_cred_sz 0
+#define GSSX_ARG_init_sec_context_sz 0
+#define GSSX_RES_init_sec_context_sz 0
+
+#define GSSX_default_in_call_ctx_sz (4 + 4 + 4 + \
+			8 + sizeof(LUCID_OPTION) + sizeof(LUCID_VALUE) + \
+			8 + sizeof(CREDS_OPTION) + sizeof(CREDS_VALUE))
+#define GSSX_default_in_ctx_hndl_sz (4 + 4+8 + 4 + 4 + 6*4 + 6*4 + 8 + 8 + \
+					4 + 4 + 4)
+#define GSSX_default_in_cred_sz 4 /* we send in no cred_handle */
+#define GSSX_default_in_token_sz 4 /* does *not* include token data */
+#define GSSX_default_in_cb_sz 4 /* we do not use channel bindings */
+#define GSSX_ARG_accept_sec_context_sz (GSSX_default_in_call_ctx_sz + \
+					GSSX_default_in_ctx_hndl_sz + \
+					GSSX_default_in_cred_sz + \
+					GSSX_default_in_token_sz + \
+					GSSX_default_in_cb_sz + \
+					4 /* no deleg creds boolean */ + \
+					4) /* empty options */
+
+/* somewhat arbitrary numbers but large enough (we ignore some of the data
+ * sent down, but it is part of the protocol so we need enough space to take
+ * it in) */
+#define GSSX_default_status_sz 8 + 24 + 8 + 256 + 256 + 16 + 4
+#define GSSX_max_output_handle_sz 128
+#define GSSX_max_oid_sz 16
+#define GSSX_max_princ_sz 256
+#define GSSX_default_ctx_sz (GSSX_max_output_handle_sz + \
+			     16 + 4 + GSSX_max_oid_sz + \
+			     2 * GSSX_max_princ_sz + \
+			     8 + 8 + 4 + 4 + 4)
+#define GSSX_max_output_token_sz 1024
+#define GSSX_max_creds_sz (4 + 4 + 4 + NGROUPS_MAX * 4)
+#define GSSX_RES_accept_sec_context_sz (GSSX_default_status_sz + \
+					GSSX_default_ctx_sz + \
+					GSSX_max_output_token_sz + \
+					4 + GSSX_max_creds_sz)
+
+#define GSSX_ARG_release_handle_sz 0
+#define GSSX_RES_release_handle_sz 0
+#define GSSX_ARG_get_mic_sz 0
+#define GSSX_RES_get_mic_sz 0
+#define GSSX_ARG_verify_sz 0
+#define GSSX_RES_verify_sz 0
+#define GSSX_ARG_wrap_sz 0
+#define GSSX_RES_wrap_sz 0
+#define GSSX_ARG_unwrap_sz 0
+#define GSSX_RES_unwrap_sz 0
+#define GSSX_ARG_wrap_size_limit_sz 0
+#define GSSX_RES_wrap_size_limit_sz 0
+
+
+
+#endif /* _LINUX_GSS_RPC_XDR_H */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 80cf23241da9..ac74399e8899 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -688,6 +688,7 @@ rpc_release_client(struct rpc_clnt *clnt)
 	if (atomic_dec_and_test(&clnt->cl_count))
 		rpc_free_auth(clnt);
 }
+EXPORT_SYMBOL_GPL(rpc_release_client);
 
 /**
  * rpc_bind_new_program - bind a new RPC program to an existing client
diff --git a/net/sunrpc/netns.h b/net/sunrpc/netns.h
index ce7bd449173d..e9f8895d70ca 100644
--- a/net/sunrpc/netns.h
+++ b/net/sunrpc/netns.h
@@ -23,6 +23,9 @@ struct sunrpc_net {
 	struct rpc_clnt *rpcb_local_clnt4;
 	spinlock_t rpcb_clnt_lock;
 	unsigned int rpcb_users;
+
+	struct mutex gssp_lock;
+	struct rpc_clnt *gssp_clnt;
 };
 
 extern int sunrpc_net_id;

From 030d794bf49855f5e2a9e8dfbfad34211d1eb08b Mon Sep 17 00:00:00 2001
From: Simo Sorce <simo@redhat.com>
Date: Fri, 25 May 2012 18:09:56 -0400
Subject: [PATCH 50/60] SUNRPC: Use gssproxy upcall for server RPCGSS
 authentication.

The main advantge of this new upcall mechanism is that it can handle
big tickets as seen in Kerberos implementations where tickets carry
authorization data like the MS-PAC buffer with AD or the Posix Authorization
Data being discussed in IETF on the krbwg working group.

The Gssproxy program is used to perform the accept_sec_context call on the
kernel's behalf. The code is changed to also pass the input buffer straight
to upcall mechanism to avoid allocating and copying many pages as tokens can
be as big (potentially more in future) as 64KiB.

Signed-off-by: Simo Sorce <simo@redhat.com>
[bfields: containerization, negotiation api]
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 Documentation/filesystems/nfs/00-INDEX        |   2 +
 .../filesystems/nfs/rpc-server-gss.txt        |  91 +++++
 net/sunrpc/auth_gss/gss_rpc_upcall.c          |   2 +
 net/sunrpc/auth_gss/svcauth_gss.c             | 347 +++++++++++++++++-
 net/sunrpc/netns.h                            |   3 +
 5 files changed, 436 insertions(+), 9 deletions(-)
 create mode 100644 Documentation/filesystems/nfs/rpc-server-gss.txt

diff --git a/Documentation/filesystems/nfs/00-INDEX b/Documentation/filesystems/nfs/00-INDEX
index 1716874a651e..66eb6c8c5334 100644
--- a/Documentation/filesystems/nfs/00-INDEX
+++ b/Documentation/filesystems/nfs/00-INDEX
@@ -20,3 +20,5 @@ rpc-cache.txt
 	- introduction to the caching mechanisms in the sunrpc layer.
 idmapper.txt
 	- information for configuring request-keys to be used by idmapper
+knfsd-rpcgss.txt
+	- Information on GSS authentication support in the NFS Server
diff --git a/Documentation/filesystems/nfs/rpc-server-gss.txt b/Documentation/filesystems/nfs/rpc-server-gss.txt
new file mode 100644
index 000000000000..716f4be8e8b3
--- /dev/null
+++ b/Documentation/filesystems/nfs/rpc-server-gss.txt
@@ -0,0 +1,91 @@
+
+rpcsec_gss support for kernel RPC servers
+=========================================
+
+This document gives references to the standards and protocols used to
+implement RPCGSS authentication in kernel RPC servers such as the NFS
+server and the NFS client's NFSv4.0 callback server.  (But note that
+NFSv4.1 and higher don't require the client to act as a server for the
+purposes of authentication.)
+
+RPCGSS is specified in a few IETF documents:
+ - RFC2203 v1: http://tools.ietf.org/rfc/rfc2203.txt
+ - RFC5403 v2: http://tools.ietf.org/rfc/rfc5403.txt
+and there is a 3rd version  being proposed:
+ - http://tools.ietf.org/id/draft-williams-rpcsecgssv3.txt
+   (At draft n. 02 at the time of writing)
+
+Background
+----------
+
+The RPCGSS Authentication method describes a way to perform GSSAPI
+Authentication for NFS.  Although GSSAPI is itself completely mechanism
+agnostic, in many cases only the KRB5 mechanism is supported by NFS
+implementations.
+
+The Linux kernel, at the moment, supports only the KRB5 mechanism, and
+depends on GSSAPI extensions that are KRB5 specific.
+
+GSSAPI is a complex library, and implementing it completely in kernel is
+unwarranted. However GSSAPI operations are fundementally separable in 2
+parts:
+- initial context establishment
+- integrity/privacy protection (signing and encrypting of individual
+  packets)
+
+The former is more complex and policy-independent, but less
+performance-sensitive.  The latter is simpler and needs to be very fast.
+
+Therefore, we perform per-packet integrity and privacy protection in the
+kernel, but leave the initial context establishment to userspace.  We
+need upcalls to request userspace to perform context establishment.
+
+NFS Server Legacy Upcall Mechanism
+----------------------------------
+
+The classic upcall mechanism uses a custom text based upcall mechanism
+to talk to a custom daemon called rpc.svcgssd that is provide by the
+nfs-utils package.
+
+This upcall mechanism has 2 limitations:
+
+A) It can handle tokens that are no bigger than 2KiB
+
+In some Kerberos deployment GSSAPI tokens can be quite big, up and
+beyond 64KiB in size due to various authorization extensions attacked to
+the Kerberos tickets, that needs to be sent through the GSS layer in
+order to perform context establishment.
+
+B) It does not properly handle creds where the user is member of more
+than a few housand groups (the current hard limit in the kernel is 65K
+groups) due to limitation on the size of the buffer that can be send
+back to the kernel (4KiB).
+
+NFS Server New RPC Upcall Mechanism
+-----------------------------------
+
+The newer upcall mechanism uses RPC over a unix socket to a daemon
+called gss-proxy, implemented by a userspace program called Gssproxy.
+
+The gss_proxy RPC protocol is currently documented here:
+
+	https://fedorahosted.org/gss-proxy/wiki/ProtocolDocumentation
+
+This upcall mechanism uses the kernel rpc client and connects to the gssproxy
+userspace program over a regular unix socket. The gssproxy protocol does not
+suffer from the size limitations of the legacy protocol.
+
+Negotiating Upcall Mechanisms
+-----------------------------
+
+To provide backward compatibility, the kernel defaults to using the
+legacy mechanism.  To switch to the new mechanism, gss-proxy must bind
+to /var/run/gssproxy.sock and then write "1" to
+/proc/net/rpc/use-gss-proxy.  If gss-proxy dies, it must repeat both
+steps.
+
+Once the upcall mechanism is chosen, it cannot be changed.  To prevent
+locking into the legacy mechanisms, the above steps must be performed
+before starting nfsd.  Whoever starts nfsd can guarantee this by reading
+from /proc/net/rpc/use-gss-proxy and checking that it contains a
+"1"--the read will block until gss-proxy has done its write to the file.
diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c
index 2d33ddfe74e5..3f874d704859 100644
--- a/net/sunrpc/auth_gss/gss_rpc_upcall.c
+++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c
@@ -137,6 +137,7 @@ void init_gssp_clnt(struct sunrpc_net *sn)
 {
 	mutex_init(&sn->gssp_lock);
 	sn->gssp_clnt = NULL;
+	init_waitqueue_head(&sn->gssp_wq);
 }
 
 int set_gssp_clnt(struct net *net)
@@ -153,6 +154,7 @@ int set_gssp_clnt(struct net *net)
 		sn->gssp_clnt = clnt;
 	}
 	mutex_unlock(&sn->gssp_lock);
+	wake_up(&sn->gssp_wq);
 	return ret;
 }
 
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 20eedecc35f8..58f5bc329408 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -48,8 +48,8 @@
 #include <linux/sunrpc/svcauth.h>
 #include <linux/sunrpc/svcauth_gss.h>
 #include <linux/sunrpc/cache.h>
+#include "gss_rpc_upcall.h"
 
-#include "../netns.h"
 
 #ifdef RPC_DEBUG
 # define RPCDBG_FACILITY	RPCDBG_AUTH
@@ -988,13 +988,10 @@ gss_write_init_verf(struct cache_detail *cd, struct svc_rqst *rqstp,
 }
 
 static inline int
-gss_read_verf(struct rpc_gss_wire_cred *gc,
-	      struct kvec *argv, __be32 *authp,
-	      struct xdr_netobj *in_handle,
-	      struct xdr_netobj *in_token)
+gss_read_common_verf(struct rpc_gss_wire_cred *gc,
+		     struct kvec *argv, __be32 *authp,
+		     struct xdr_netobj *in_handle)
 {
-	struct xdr_netobj tmpobj;
-
 	/* Read the verifier; should be NULL: */
 	*authp = rpc_autherr_badverf;
 	if (argv->iov_len < 2 * 4)
@@ -1010,6 +1007,23 @@ gss_read_verf(struct rpc_gss_wire_cred *gc,
 	if (dup_netobj(in_handle, &gc->gc_ctx))
 		return SVC_CLOSE;
 	*authp = rpc_autherr_badverf;
+
+	return 0;
+}
+
+static inline int
+gss_read_verf(struct rpc_gss_wire_cred *gc,
+	      struct kvec *argv, __be32 *authp,
+	      struct xdr_netobj *in_handle,
+	      struct xdr_netobj *in_token)
+{
+	struct xdr_netobj tmpobj;
+	int res;
+
+	res = gss_read_common_verf(gc, argv, authp, in_handle);
+	if (res)
+		return res;
+
 	if (svc_safe_getnetobj(argv, &tmpobj)) {
 		kfree(in_handle->data);
 		return SVC_DENIED;
@@ -1022,6 +1036,40 @@ gss_read_verf(struct rpc_gss_wire_cred *gc,
 	return 0;
 }
 
+/* Ok this is really heavily depending on a set of semantics in
+ * how rqstp is set up by svc_recv and pages laid down by the
+ * server when reading a request. We are basically guaranteed that
+ * the token lays all down linearly across a set of pages, starting
+ * at iov_base in rq_arg.head[0] which happens to be the first of a
+ * set of pages stored in rq_pages[].
+ * rq_arg.head[0].iov_base will provide us the page_base to pass
+ * to the upcall.
+ */
+static inline int
+gss_read_proxy_verf(struct svc_rqst *rqstp,
+		    struct rpc_gss_wire_cred *gc, __be32 *authp,
+		    struct xdr_netobj *in_handle,
+		    struct gssp_in_token *in_token)
+{
+	struct kvec *argv = &rqstp->rq_arg.head[0];
+	u32 inlen;
+	int res;
+
+	res = gss_read_common_verf(gc, argv, authp, in_handle);
+	if (res)
+		return res;
+
+	inlen = svc_getnl(argv);
+	if (inlen > (argv->iov_len + rqstp->rq_arg.page_len))
+		return SVC_DENIED;
+
+	in_token->pages = rqstp->rq_pages;
+	in_token->page_base = (ulong)argv->iov_base & ~PAGE_MASK;
+	in_token->page_len = inlen;
+
+	return 0;
+}
+
 static inline int
 gss_write_resv(struct kvec *resv, size_t size_limit,
 	       struct xdr_netobj *out_handle, struct xdr_netobj *out_token,
@@ -1049,7 +1097,7 @@ gss_write_resv(struct kvec *resv, size_t size_limit,
  * the upcall results are available, write the verifier and result.
  * Otherwise, drop the request pending an answer to the upcall.
  */
-static int svcauth_gss_handle_init(struct svc_rqst *rqstp,
+static int svcauth_gss_legacy_init(struct svc_rqst *rqstp,
 			struct rpc_gss_wire_cred *gc, __be32 *authp)
 {
 	struct kvec *argv = &rqstp->rq_arg.head[0];
@@ -1089,6 +1137,278 @@ static int svcauth_gss_handle_init(struct svc_rqst *rqstp,
 	return ret;
 }
 
+static int gss_proxy_save_rsc(struct cache_detail *cd,
+				struct gssp_upcall_data *ud,
+				uint64_t *handle)
+{
+	struct rsc rsci, *rscp = NULL;
+	static atomic64_t ctxhctr;
+	long long ctxh;
+	struct gss_api_mech *gm = NULL;
+	time_t expiry;
+	int status = -EINVAL;
+
+	memset(&rsci, 0, sizeof(rsci));
+	/* context handle */
+	status = -ENOMEM;
+	/* the handle needs to be just a unique id,
+	 * use a static counter */
+	ctxh = atomic64_inc_return(&ctxhctr);
+
+	/* make a copy for the caller */
+	*handle = ctxh;
+
+	/* make a copy for the rsc cache */
+	if (dup_to_netobj(&rsci.handle, (char *)handle, sizeof(uint64_t)))
+		goto out;
+	rscp = rsc_lookup(cd, &rsci);
+	if (!rscp)
+		goto out;
+
+	/* creds */
+	if (!ud->found_creds) {
+		/* userspace seem buggy, we should always get at least a
+		 * mapping to nobody */
+		dprintk("RPC:       No creds found, marking Negative!\n");
+		set_bit(CACHE_NEGATIVE, &rsci.h.flags);
+	} else {
+
+		/* steal creds */
+		rsci.cred = ud->creds;
+		memset(&ud->creds, 0, sizeof(struct svc_cred));
+
+		status = -EOPNOTSUPP;
+		/* get mech handle from OID */
+		gm = gss_mech_get_by_OID(&ud->mech_oid);
+		if (!gm)
+			goto out;
+
+		status = -EINVAL;
+		/* mech-specific data: */
+		status = gss_import_sec_context(ud->out_handle.data,
+						ud->out_handle.len,
+						gm, &rsci.mechctx,
+						&expiry, GFP_KERNEL);
+		if (status)
+			goto out;
+	}
+
+	rsci.h.expiry_time = expiry;
+	rscp = rsc_update(cd, &rsci, rscp);
+	status = 0;
+out:
+	gss_mech_put(gm);
+	rsc_free(&rsci);
+	if (rscp)
+		cache_put(&rscp->h, cd);
+	else
+		status = -ENOMEM;
+	return status;
+}
+
+static int svcauth_gss_proxy_init(struct svc_rqst *rqstp,
+			struct rpc_gss_wire_cred *gc, __be32 *authp)
+{
+	struct kvec *resv = &rqstp->rq_res.head[0];
+	struct xdr_netobj cli_handle;
+	struct gssp_upcall_data ud;
+	uint64_t handle;
+	int status;
+	int ret;
+	struct net *net = rqstp->rq_xprt->xpt_net;
+	struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+	memset(&ud, 0, sizeof(ud));
+	ret = gss_read_proxy_verf(rqstp, gc, authp,
+				  &ud.in_handle, &ud.in_token);
+	if (ret)
+		return ret;
+
+	ret = SVC_CLOSE;
+
+	/* Perform synchronous upcall to gss-proxy */
+	status = gssp_accept_sec_context_upcall(net, &ud);
+	if (status)
+		goto out;
+
+	dprintk("RPC:       svcauth_gss: gss major status = %d\n",
+			ud.major_status);
+
+	switch (ud.major_status) {
+	case GSS_S_CONTINUE_NEEDED:
+		cli_handle = ud.out_handle;
+		break;
+	case GSS_S_COMPLETE:
+		status = gss_proxy_save_rsc(sn->rsc_cache, &ud, &handle);
+		if (status)
+			goto out;
+		cli_handle.data = (u8 *)&handle;
+		cli_handle.len = sizeof(handle);
+		break;
+	default:
+		ret = SVC_CLOSE;
+		goto out;
+	}
+
+	/* Got an answer to the upcall; use it: */
+	if (gss_write_init_verf(sn->rsc_cache, rqstp,
+				&cli_handle, &ud.major_status))
+		goto out;
+	if (gss_write_resv(resv, PAGE_SIZE,
+			   &cli_handle, &ud.out_token,
+			   ud.major_status, ud.minor_status))
+		goto out;
+
+	ret = SVC_COMPLETE;
+out:
+	gssp_free_upcall_data(&ud);
+	return ret;
+}
+
+DEFINE_SPINLOCK(use_gssp_lock);
+
+static bool use_gss_proxy(struct net *net)
+{
+	struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+	if (sn->use_gss_proxy != -1)
+		return sn->use_gss_proxy;
+	spin_lock(&use_gssp_lock);
+	/*
+	 * If you wanted gss-proxy, you should have said so before
+	 * starting to accept requests:
+	 */
+	sn->use_gss_proxy = 0;
+	spin_unlock(&use_gssp_lock);
+	return 0;
+}
+
+static bool set_gss_proxy(struct net *net, int type)
+{
+	struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+	int ret = 0;
+
+	WARN_ON_ONCE(type != 0 && type != 1);
+	spin_lock(&use_gssp_lock);
+	if (sn->use_gss_proxy == -1 || sn->use_gss_proxy == type)
+		sn->use_gss_proxy = type;
+	else
+		ret = -EBUSY;
+	spin_unlock(&use_gssp_lock);
+	wake_up(&sn->gssp_wq);
+	return ret;
+}
+
+static inline bool gssp_ready(struct sunrpc_net *sn)
+{
+	switch (sn->use_gss_proxy) {
+		case -1:
+			return false;
+		case 0:
+			return true;
+		case 1:
+			return sn->gssp_clnt;
+	}
+	WARN_ON_ONCE(1);
+	return false;
+}
+
+static int wait_for_gss_proxy(struct net *net)
+{
+	struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+	return wait_event_interruptible(sn->gssp_wq, gssp_ready(sn));
+}
+
+#ifdef CONFIG_PROC_FS
+
+static ssize_t write_gssp(struct file *file, const char __user *buf,
+			 size_t count, loff_t *ppos)
+{
+	struct net *net = PDE(file->f_path.dentry->d_inode)->data;
+	char tbuf[20];
+	unsigned long i;
+	int res;
+
+	if (*ppos || count > sizeof(tbuf)-1)
+		return -EINVAL;
+	if (copy_from_user(tbuf, buf, count))
+		return -EFAULT;
+
+	tbuf[count] = 0;
+	res = kstrtoul(tbuf, 0, &i);
+	if (res)
+		return res;
+	if (i != 1)
+		return -EINVAL;
+	res = set_gss_proxy(net, 1);
+	if (res)
+		return res;
+	res = set_gssp_clnt(net);
+	if (res)
+		return res;
+	return count;
+}
+
+static ssize_t read_gssp(struct file *file, char __user *buf,
+			 size_t count, loff_t *ppos)
+{
+	struct net *net = PDE(file->f_path.dentry->d_inode)->data;
+	unsigned long p = *ppos;
+	char tbuf[10];
+	size_t len;
+	int ret;
+
+	ret = wait_for_gss_proxy(net);
+	if (ret)
+		return ret;
+
+	snprintf(tbuf, sizeof(tbuf), "%d\n", use_gss_proxy(net));
+	len = strlen(tbuf);
+	if (p >= len)
+		return 0;
+	len -= p;
+	if (len > count)
+		len = count;
+	if (copy_to_user(buf, (void *)(tbuf+p), len))
+		return -EFAULT;
+	*ppos += len;
+	return len;
+}
+
+static const struct file_operations use_gss_proxy_ops = {
+	.open = nonseekable_open,
+	.write = write_gssp,
+	.read = read_gssp,
+};
+
+static int create_use_gss_proxy_proc_entry(struct net *net)
+{
+	struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+	struct proc_dir_entry **p = &sn->use_gssp_proc;
+
+	sn->use_gss_proxy = -1;
+	*p = proc_create_data("use-gss-proxy", S_IFREG|S_IRUSR|S_IWUSR,
+			      sn->proc_net_rpc,
+			      &use_gss_proxy_ops, net);
+	if (!*p)
+		return -ENOMEM;
+	init_gssp_clnt(sn);
+	return 0;
+}
+
+static void destroy_use_gss_proxy_proc_entry(struct net *net)
+{
+	struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+	if (sn->use_gssp_proc) {
+		remove_proc_entry("use-gss-proxy", sn->proc_net_rpc); 
+		clear_gssp_clnt(sn);
+	}
+}
+
+#endif /* CONFIG_PROC_FS */
+
 /*
  * Accept an rpcsec packet.
  * If context establishment, punt to user space
@@ -1155,7 +1475,10 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
 	switch (gc->gc_proc) {
 	case RPC_GSS_PROC_INIT:
 	case RPC_GSS_PROC_CONTINUE_INIT:
-		return svcauth_gss_handle_init(rqstp, gc, authp);
+		if (use_gss_proxy(SVC_NET(rqstp)))
+			return svcauth_gss_proxy_init(rqstp, gc, authp);
+		else
+			return svcauth_gss_legacy_init(rqstp, gc, authp);
 	case RPC_GSS_PROC_DATA:
 	case RPC_GSS_PROC_DESTROY:
 		/* Look up the context, and check the verifier: */
@@ -1530,7 +1853,12 @@ gss_svc_init_net(struct net *net)
 	rv = rsi_cache_create_net(net);
 	if (rv)
 		goto out1;
+	rv = create_use_gss_proxy_proc_entry(net);
+	if (rv)
+		goto out2;
 	return 0;
+out2:
+	destroy_use_gss_proxy_proc_entry(net);
 out1:
 	rsc_cache_destroy_net(net);
 	return rv;
@@ -1539,6 +1867,7 @@ gss_svc_init_net(struct net *net)
 void
 gss_svc_shutdown_net(struct net *net)
 {
+	destroy_use_gss_proxy_proc_entry(net);
 	rsi_cache_destroy_net(net);
 	rsc_cache_destroy_net(net);
 }
diff --git a/net/sunrpc/netns.h b/net/sunrpc/netns.h
index e9f8895d70ca..7111a4c9113b 100644
--- a/net/sunrpc/netns.h
+++ b/net/sunrpc/netns.h
@@ -25,7 +25,10 @@ struct sunrpc_net {
 	unsigned int rpcb_users;
 
 	struct mutex gssp_lock;
+	wait_queue_head_t gssp_wq;
 	struct rpc_clnt *gssp_clnt;
+	int use_gss_proxy;
+	struct proc_dir_entry *use_gssp_proc;
 };
 
 extern int sunrpc_net_id;

From aa387d6ce15330e09037947147c5a5a2ba42a0e8 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 15 Apr 2013 16:03:46 -0400
Subject: [PATCH 51/60] nfsd: fix EXDEV checking in rename

We again check for the EXDEV a little later on, so the first check is
redundant.  This check is also slightly racier, since a badly timed
eviction from the export cache could leave us with the two fh_export
pointers pointing to two different cache entries which each refer to the
same underlying export.

It's better to compare vfsmounts as the later check does, but that
leaves a minor security hole in the case where the two exports refer to
two different directories especially if (for example) they have
different root-squashing options.

So, compare ex_path.dentry too.

Reported-by: Joe Habermann <joe.habermann@gmail.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/vfs.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 2b2e2396a869..84ce601d8063 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1758,10 +1758,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 	tdentry = tfhp->fh_dentry;
 	tdir = tdentry->d_inode;
 
-	err = (rqstp->rq_vers == 2) ? nfserr_acces : nfserr_xdev;
-	if (ffhp->fh_export != tfhp->fh_export)
-		goto out;
-
 	err = nfserr_perm;
 	if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen))
 		goto out;
@@ -1802,6 +1798,8 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 	host_err = -EXDEV;
 	if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
 		goto out_dput_new;
+	if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry)
+		goto out_dput_new;
 
 	host_err = nfsd_break_lease(odentry->d_inode);
 	if (host_err)

From dd30333cf5a2f9dfecda5c6f4523133f13847aae Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Fri, 12 Apr 2013 18:10:56 -0400
Subject: [PATCH 52/60] nfsd4: better error return to indicate SSV non-support

As 4.1 becomes less experimental and SSV still isn't implemented, we
have to admit it's not going to be, and return some sensible error
rather than just saying "our server's broken".  Discussion in the ietf
group hasn't turned up any objections to using NFS4ERR_ENC_ALG_UNSUPP
for that purpose.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index e39bf5381bca..a964a1761077 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1651,6 +1651,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
 	default:				/* checked by xdr code */
 		WARN_ON_ONCE(1);
 	case SP4_SSV:
+		return nfserr_encr_alg_unsupp;
 	case SP4_MACH_CRED:
 		return nfserr_serverfault;	/* no excuse :-/ */
 	}

From 0ff3bab530c35dd6499f7f2d639816df2e1241cd Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 29 Apr 2013 17:03:31 -0400
Subject: [PATCH 53/60] SUNRPC: define
 {create,destroy}_use_gss_proxy_proc_entry in !PROC case

Though I wonder whether we should really just depend on CONFIG_PROC_FS
at some point.

Reported-by: kbuild test robot <fengguang.wu@intel.com>
---
 net/sunrpc/auth_gss/svcauth_gss.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 1c66a3b78329..b70ac1cec8f5 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1283,6 +1283,8 @@ static bool use_gss_proxy(struct net *net)
 	return 0;
 }
 
+#ifdef CONFIG_PROC_FS
+
 static bool set_gss_proxy(struct net *net, int type)
 {
 	struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
@@ -1320,7 +1322,6 @@ static int wait_for_gss_proxy(struct net *net)
 	return wait_event_interruptible(sn->gssp_wq, gssp_ready(sn));
 }
 
-#ifdef CONFIG_PROC_FS
 
 static ssize_t write_gssp(struct file *file, const char __user *buf,
 			 size_t count, loff_t *ppos)
@@ -1406,6 +1407,14 @@ static void destroy_use_gss_proxy_proc_entry(struct net *net)
 		clear_gssp_clnt(sn);
 	}
 }
+#else /* CONFIG_PROC_FS */
+
+static int create_use_gss_proxy_proc_entry(struct net *net)
+{
+	return 0;
+}
+
+static void destroy_use_gss_proxy_proc_entry(struct net *net) {}
 
 #endif /* CONFIG_PROC_FS */
 

From 6278b62aa8f90c668a4e4b94ad9d3952cf4331b7 Mon Sep 17 00:00:00 2001
From: Fengguang Wu <fengguang.wu@intel.com>
Date: Mon, 29 Apr 2013 17:19:48 -0400
Subject: [PATCH 54/60] SUNRPC: gssp_procedures[] can be static

Cc: Simo Sorce <simo@redhat.com>
Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
---
 net/sunrpc/auth_gss/gss_rpc_upcall.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c
index c63273604ddc..d304f41260f2 100644
--- a/net/sunrpc/auth_gss/gss_rpc_upcall.c
+++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c
@@ -63,7 +63,7 @@ enum {
 	.p_name   = #proc,				\
 }
 
-struct rpc_procinfo gssp_procedures[] = {
+static struct rpc_procinfo gssp_procedures[] = {
 	PROC(INDICATE_MECHS, indicate_mechs),
         PROC(GET_CALL_CONTEXT, get_call_context),
         PROC(IMPORT_AND_CANON_NAME, import_and_canon_name),

From d28fcc830c2eadc526e43b0a5f6d2ed04e7421ef Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 29 Apr 2013 18:21:29 -0400
Subject: [PATCH 55/60] svcrpc: fix gss-proxy to respect user namespaces

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 net/sunrpc/auth_gss/gss_rpc_xdr.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c
index d0ccdffa7e54..5c4c61d527e2 100644
--- a/net/sunrpc/auth_gss/gss_rpc_xdr.c
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c
@@ -216,13 +216,13 @@ static int gssx_dec_linux_creds(struct xdr_stream *xdr,
 	err = get_s32(&q, end, &tmp);
 	if (err)
 		return err;
-	creds->cr_uid = tmp;
+	creds->cr_uid = make_kuid(&init_user_ns, tmp);
 
 	/* gid */
 	err = get_s32(&q, end, &tmp);
 	if (err)
 		return err;
-	creds->cr_gid = tmp;
+	creds->cr_gid = make_kgid(&init_user_ns, tmp);
 
 	/* number of additional gid's */
 	err = get_s32(&q, end, &tmp);
@@ -235,15 +235,21 @@ static int gssx_dec_linux_creds(struct xdr_stream *xdr,
 
 	/* gid's */
 	for (i = 0; i < N; i++) {
+		kgid_t kgid;
 		err = get_s32(&q, end, &tmp);
-		if (err) {
-			groups_free(creds->cr_group_info);
-			return err;
-		}
-		GROUP_AT(creds->cr_group_info, i) = tmp;
+		if (err)
+			goto out_free_groups;
+		err = -EINVAL;
+		kgid = make_kgid(&init_user_ns, tmp);
+		if (!gid_valid(kgid))
+			goto out_free_groups;
+		GROUP_AT(creds->cr_group_info, i) = kgid;
 	}
 
 	return 0;
+out_free_groups:
+	groups_free(creds->cr_group_info);
+	return err;
 }
 
 static int gssx_dec_option_array(struct xdr_stream *xdr,

From 2a6cf944c2f8ad5a7ef599ed275b85fa56eba3fc Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 30 Apr 2013 15:28:51 -0400
Subject: [PATCH 56/60] nfsd4: don't remap EISDIR errors in rename

We're going out of our way here to remap an error to make rfc 3530
happy--but the rfc itself (nor rfc 1813, which has similar language)
gives no justification.  And disagrees with local filesystem behavior,
with Linux and posix man pages, and knfsd's implemented behavior for v2
and v3.

And the documented behavior seems better, in that it gives a little more
information--you could implement the 3530 behavior using the posix
behavior, but not the other way around.

Also, the Linux client makes no attempt to remap this error in the v4
case, so it can end up just returning EEXIST to the application in a
case where it should return EISDIR.

So honestly I think the rfc's are just buggy here--or in any case it
doesn't see worth the trouble to remap this error.

Reported-by: Frank S Filz <ffilz@us.ibm.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4proc.c | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 5dee81141ab7..8ae5abfe6ba2 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -813,21 +813,11 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname,
 			     rename->rn_snamelen, &cstate->current_fh,
 			     rename->rn_tname, rename->rn_tnamelen);
-
-	/* the underlying filesystem returns different error's than required
-	 * by NFSv4. both save_fh and current_fh have been verified.. */
-	if (status == nfserr_isdir)
-		status = nfserr_exist;
-	else if ((status == nfserr_notdir) &&
-                  (S_ISDIR(cstate->save_fh.fh_dentry->d_inode->i_mode) &&
-                   S_ISDIR(cstate->current_fh.fh_dentry->d_inode->i_mode)))
-		status = nfserr_exist;
-
-	if (!status) {
-		set_change_info(&rename->rn_sinfo, &cstate->current_fh);
-		set_change_info(&rename->rn_tinfo, &cstate->save_fh);
-	}
-	return status;
+	if (status)
+		return status;
+	set_change_info(&rename->rn_sinfo, &cstate->current_fh);
+	set_change_info(&rename->rn_tinfo, &cstate->save_fh);
+	return nfs_ok;
 }
 
 static __be32

From 1eb6d6223aaac87538a0e9aa0d30980df224914c Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Thu, 18 Apr 2013 10:49:09 +0800
Subject: [PATCH 57/60] svcauth_gss: fix error return code in rsc_parse()

Fix to return a negative error code from the error handling
case instead of 0, as returned elsewhere in this function.

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 net/sunrpc/auth_gss/svcauth_gss.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index b70ac1cec8f5..b1924e53e8c7 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -506,8 +506,10 @@ static int rsc_parse(struct cache_detail *cd,
 		len = qword_get(&mesg, buf, mlen);
 		if (len > 0) {
 			rsci.cred.cr_principal = kstrdup(buf, GFP_KERNEL);
-			if (!rsci.cred.cr_principal)
+			if (!rsci.cred.cr_principal) {
+				status = -ENOMEM;
 				goto out;
+			}
 		}
 
 	}

From c8c797f9fdf655245c0d60667af6efa7b32c98f1 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Fri, 5 Apr 2013 21:22:39 +0800
Subject: [PATCH 58/60] nfsd: make symbol nfsd_reply_cache_shrinker static

symbol 'nfsd_reply_cache_shrinker' only used within this file. It should
be static.

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfscache.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index eb2587745a64..e76244edd748 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -62,7 +62,7 @@ static void	cache_cleaner_func(struct work_struct *unused);
 static int 	nfsd_reply_cache_shrink(struct shrinker *shrink,
 					struct shrink_control *sc);
 
-struct shrinker nfsd_reply_cache_shrinker = {
+static struct shrinker nfsd_reply_cache_shrinker = {
 	.shrink	= nfsd_reply_cache_shrink,
 	.seeks	= 1,
 };

From ed9411a00464860cafe7e07224818cdf04fd9e89 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 30 Apr 2013 18:48:45 -0400
Subject: [PATCH 59/60] NFSD: Simplify GSS flavor encoding in
 nfsd4_do_encode_secinfo()

Clean up.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4xdr.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 78272185a13d..a885e97dc5f4 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3119,17 +3119,11 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp,
 		struct rpcsec_gss_info info;
 
 		if (rpcauth_get_gssinfo(flavs[i].pseudoflavor, &info) == 0) {
-			RESERVE_SPACE(4);
+			RESERVE_SPACE(4 + 4 + info.oid.len + 4 + 4);
 			WRITE32(RPC_AUTH_GSS);
-			ADJUST_ARGS();
-			RESERVE_SPACE(4 + info.oid.len);
 			WRITE32(info.oid.len);
 			WRITEMEM(info.oid.data, info.oid.len);
-			ADJUST_ARGS();
-			RESERVE_SPACE(4);
 			WRITE32(info.qop);
-			ADJUST_ARGS();
-			RESERVE_SPACE(4);
 			WRITE32(info.service);
 			ADJUST_ARGS();
 		} else {

From 676e4ebd5f2c3b4fd1d2bff79b68385c23c5c105 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 30 Apr 2013 18:48:54 -0400
Subject: [PATCH 60/60] NFSD: SECINFO doesn't handle unsupported pseudoflavors
 correctly

If nfsd4_do_encode_secinfo() can't find GSS info that matches an
export security flavor, it assumes the flavor is not a GSS
pseudoflavor, and simply puts it on the wire.

However, if this XDR encoding logic is given a legitimate GSS
pseudoflavor but the RPC layer says it does not support that
pseudoflavor for some reason, then the server leaks GSS pseudoflavor
numbers onto the wire.

I confirmed this happens by blacklisting rpcsec_gss_krb5, then
attempted a client transition from the pseudo-fs to a Kerberos-only
share.  The client received a flavor list containing the Kerberos
pseudoflavor numbers, rather than GSS tuples.

The encoder logic can check that each pseudoflavor in flavs[] is
less than MAXFLAVOR before writing it into the buffer, to prevent
this.  But after "nflavs" is written into the XDR buffer, the
encoder can't skip writing flavor information into the buffer when
it discovers the RPC layer doesn't support that flavor.

So count the number of valid flavors as they are written into the
XDR buffer, then write that count into a placeholder in the XDR
buffer when all recognized flavors have been encoded.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4xdr.c | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index a885e97dc5f4..6cd86e0fe450 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3085,10 +3085,11 @@ static __be32
 nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp,
 			 __be32 nfserr, struct svc_export *exp)
 {
-	u32 i, nflavs;
+	u32 i, nflavs, supported;
 	struct exp_flavor_info *flavs;
 	struct exp_flavor_info def_flavs[2];
-	__be32 *p;
+	__be32 *p, *flavorsp;
+	static bool report = true;
 
 	if (nfserr)
 		goto out;
@@ -3112,13 +3113,17 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp,
 		}
 	}
 
+	supported = 0;
 	RESERVE_SPACE(4);
-	WRITE32(nflavs);
+	flavorsp = p++;		/* to be backfilled later */
 	ADJUST_ARGS();
+
 	for (i = 0; i < nflavs; i++) {
+		rpc_authflavor_t pf = flavs[i].pseudoflavor;
 		struct rpcsec_gss_info info;
 
-		if (rpcauth_get_gssinfo(flavs[i].pseudoflavor, &info) == 0) {
+		if (rpcauth_get_gssinfo(pf, &info) == 0) {
+			supported++;
 			RESERVE_SPACE(4 + 4 + info.oid.len + 4 + 4);
 			WRITE32(RPC_AUTH_GSS);
 			WRITE32(info.oid.len);
@@ -3126,13 +3131,22 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp,
 			WRITE32(info.qop);
 			WRITE32(info.service);
 			ADJUST_ARGS();
-		} else {
+		} else if (pf < RPC_AUTH_MAXFLAVOR) {
+			supported++;
 			RESERVE_SPACE(4);
-			WRITE32(flavs[i].pseudoflavor);
+			WRITE32(pf);
 			ADJUST_ARGS();
+		} else {
+			if (report)
+				pr_warn("NFS: SECINFO: security flavor %u "
+					"is not supported\n", pf);
 		}
 	}
 
+	if (nflavs != supported)
+		report = false;
+	*flavorsp = htonl(supported);
+
 out:
 	if (exp)
 		exp_put(exp);