linux_dsm_epyc7002/fs/afs/volume.c
David Howells c435ee3455 afs: Overhaul the callback handling
Overhaul the AFS callback handling by the following means:

 (1) Don't give up callback promises on vnodes that we are no longer using,
     rather let them just expire on the server or let the server break
     them.  This is actually more efficient for the server as the callback
     lookup is expensive if there are lots of extant callbacks.

 (2) Only give up the callback promises we have from a server when the
     server record is destroyed.  Then we can just give up *all* the
     callback promises on it in one go.

 (3) Servers can end up being shared between cells if cells are aliased, so
     don't add all the vnodes being backed by a particular server into a
     big FID-indexed tree on that server as there may be duplicates.

     Instead have each volume instance (~= superblock) register an interest
     in a server as it starts to make use of it and use this to allow the
     processor for callbacks from the server to find the superblock and
     thence the inode corresponding to the FID being broken by means of
     ilookup_nowait().

 (4) Rather than iterating over the entire callback list when a mass-break
     comes in from the server, maintain a counter of mass-breaks in
     afs_server (cb_seq) and make afs_validate() check it against the copy
     in afs_vnode.

     It would be nice not to have to take a read_lock whilst doing this,
     but that's tricky without using RCU.

 (5) Save a ref on the fileserver we're using for a call in the afs_call
     struct so that we can access its cb_s_break during call decoding.

 (6) Write-lock around callback and status storage in a vnode and read-lock
     around getattr so that we don't see the status mid-update.

This has the following consequences:

 (1) Data invalidation isn't seen until someone calls afs_validate() on a
     vnode.  Unfortunately, we need to use a key to query the server, but
     getting one from a background thread is tricky without caching loads
     of keys all over the place.

 (2) Mass invalidation isn't seen until someone calls afs_validate().

 (3) Callback breaking is going to hit the inode_hash_lock quite a bit.
     Could this be replaced with rcu_read_lock(), since inodes are destroyed
     under RCU conditions?

Signed-off-by: David Howells <dhowells@redhat.com>
2017-11-13 15:38:18 +00:00

405 lines
10 KiB
C

/* AFS volume management
*
* Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/sched.h>
#include "internal.h"
/*
 * Printable names for the volume types, indexed by volume type; used for the
 * debugging message emitted by afs_volume_lookup().  The table is never
 * modified, so both the strings and the pointers to them are const.
 */
static const char *const afs_voltypes[] = { "R/W", "R/O", "BAK" };
/*
 * Look up a volume by name and return a referenced volume record for it.
 *
 * The volume name can take one of the following forms:
 *	"%[cell:]volume[.]"		R/W volume
 *	"#[cell:]volume[.]"		R/O or R/W volume (rwparent=0),
 *					 or R/W (rwparent=1) volume
 *	"%[cell:]volume.readonly"	R/O volume
 *	"#[cell:]volume.readonly"	R/O volume
 *	"%[cell:]volume.backup"		Backup volume
 *	"#[cell:]volume.backup"		Backup volume
 *
 * The cell name is optional, and defaults to the current cell.
 *
 * See "The Rules of Mount Point Traversal" in Chapter 5 of the AFS SysAdmin
 * Guide
 * - Rule 1: Explicit type suffix forces access of that type or nothing
 *           (no suffix, then use Rule 2 & 3)
 * - Rule 2: If parent volume is R/O, then mount R/O volume by preference, R/W
 *           if not available
 * - Rule 3: If parent volume is R/W, then only mount R/W volume unless
 *           explicitly told otherwise
 *
 * When params->force is not set, params->type is overwritten with the type
 * that was selected (R/O in preference to R/W).
 *
 * Returns the volume with a reference held for the caller, or an ERR_PTR():
 * -ENOMEDIUM if no acceptable volume type is available, -ENOMEM if the record
 * cannot be allocated, or whatever error afs_vlocation_lookup() or
 * afs_lookup_server() produced.
 */
struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
{
	struct afs_vlocation *vloc;
	struct afs_volume *vol = NULL;
	struct afs_server *srv;
	char type_mask;
	int ret, i;

	_enter("{%*.*s,%d}",
	       params->volnamesz, params->volnamesz, params->volname,
	       params->rwpath);

	/* find the volume location record for the named volume */
	vloc = afs_vlocation_lookup(params->net, params->cell, params->key,
				    params->volname, params->volnamesz);
	if (IS_ERR(vloc)) {
		ret = PTR_ERR(vloc);
		vloc = NULL;
		goto error;
	}

	/* settle on the volume type to use; every failure from here until the
	 * allocation below means "no such medium" */
	ret = -ENOMEDIUM;

	if (params->force && !(vloc->vldb.vidmask & (1 << params->type)))
		goto error;

	/* gather the set of types offered by any of the listed servers */
	type_mask = 0;
	for (i = 0; i < vloc->vldb.nservers; i++)
		type_mask |= vloc->vldb.srvtmask[i];

	if (params->force) {
		/* a forced type must be served by someone */
		if (!(type_mask & (1 << params->type)))
			goto error;
	} else {
		/* otherwise prefer R/O and fall back to R/W */
		if (type_mask & AFS_VOL_VTM_RO)
			params->type = AFSVL_ROVOL;
		else if (type_mask & AFS_VOL_VTM_RW)
			params->type = AFSVL_RWVOL;
		else
			goto error;
	}

	down_write(&params->cell->vl_sem);

	/* if a volume of this type is already active, just take a ref on it */
	vol = vloc->vols[params->type];
	if (vol) {
		afs_get_volume(vol);
		goto success;
	}

	/* nothing active yet, so build a fresh volume record */
	_debug("creating new volume record");

	ret = -ENOMEM;
	vol = kzalloc(sizeof(*vol), GFP_KERNEL);
	if (!vol)
		goto error_up;

	atomic_set(&vol->usage, 1);
	vol->type	= params->type;
	vol->type_force	= params->force;
	vol->cell	= params->cell;
	vol->vid	= vloc->vldb.vid[params->type];

	init_rwsem(&vol->server_sem);

	/* collect a server record for each slot that serves this type */
	for (i = 0; i < 8; i++) {
		if (!(vloc->vldb.srvtmask[i] & (1 << vol->type)))
			continue;

		srv = afs_lookup_server(vol->cell, &vloc->vldb.servers[i]);
		if (IS_ERR(srv)) {
			ret = PTR_ERR(srv);
			goto error_discard;
		}

		vol->servers[vol->nservers] = srv;
		vol->nservers++;
	}

	/* hook the volume up to the cache and to its location record */
#ifdef CONFIG_AFS_FSCACHE
	vol->cache = fscache_acquire_cookie(vol->cell->cache,
					    &afs_volume_cache_index_def,
					    vol, true);
#endif
	afs_get_vlocation(vloc);
	vol->vlocation = vloc;

	vloc->vols[vol->type] = vol;

success:
	_debug("kAFS selected %s volume %08x",
	       afs_voltypes[vol->type], vol->vid);
	up_write(&params->cell->vl_sem);
	/* drop the ref that afs_vlocation_lookup() gave us */
	afs_put_vlocation(params->net, vloc);
	_leave(" = %p", vol);
	return vol;

	/* failed part-way through building a new record: undo it */
error_discard:
	up_write(&params->cell->vl_sem);

	for (i = vol->nservers; i-- > 0; ) {
		afs_put_cb_interest(params->net, vol->cb_interests[i]);
		afs_put_server(params->net, vol->servers[i]);
	}

	kfree(vol);
	goto error;

error_up:
	up_write(&params->cell->vl_sem);
error:
	afs_put_vlocation(params->net, vloc);
	_leave(" = %d", ret);
	return ERR_PTR(ret);
}
/*
 * Drop a reference on a volume record, destroying it if that was the last one
 * - a NULL volume is ignored
 * - the usage count is decremented with cell->vl_sem held for writing so that
 *   the final decrement and the removal of the volume from its location
 *   record's vols[] table are atomic with respect to afs_volume_lookup()
 * - on destruction, the cache cookie (if caching is configured), the volume
 *   location ref, and each server ref and callback interest are released
 */
void afs_put_volume(struct afs_cell *cell, struct afs_volume *volume)
{
	struct afs_vlocation *vlocation;
	int loop;

	if (!volume)
		return;

	_enter("%p", volume);

	ASSERTCMP(atomic_read(&volume->usage), >, 0);

	vlocation = volume->vlocation;

	/* to prevent a race, the decrement and the dequeue must be effectively
	 * atomic */
	down_write(&cell->vl_sem);

	if (likely(!atomic_dec_and_test(&volume->usage))) {
		/* still in use: release the very semaphore taken above rather
		 * than reaching it through a different pointer chain */
		up_write(&cell->vl_sem);
		_leave("");
		return;
	}

	/* last ref gone: make the volume unfindable before unlocking */
	vlocation->vols[volume->type] = NULL;

	up_write(&cell->vl_sem);

	/* finish cleaning up the volume */
#ifdef CONFIG_AFS_FSCACHE
	fscache_relinquish_cookie(volume->cache, 0);
#endif
	afs_put_vlocation(cell->net, vlocation);

	for (loop = volume->nservers - 1; loop >= 0; loop--) {
		afs_put_cb_interest(cell->net, volume->cb_interests[loop]);
		afs_put_server(cell->net, volume->servers[loop]);
	}

	kfree(volume);

	_leave(" [destroyed]");
}
/*
 * Rank a fileserver state for the purpose of choosing which error to report
 * when no server is usable: the higher the rank, the more it is preferred.
 * A state of 0 (no error) ranks lowest and any error other than the three
 * named here ranks highest.
 */
static int afs_fs_state_rank(int state)
{
	switch (state) {
	case 0:
		return 0;
	case -ENETUNREACH:
		return 1;
	case -EHOSTUNREACH:
		return 2;
	case -ECONNREFUSED:
		return 3;
	default:
		return 4;
	}
}

/*
 * Choose a fileserver through which to try accessing a vnode's volume
 * - the server of the vnode's current callback interest is reused if its
 *   state is 0; otherwise the first server in the volume's list whose state is
 *   0 is taken and a callback interest in it is registered for the vnode
 * - on success the server is returned with an extra ref held for the caller
 * - on failure an ERR_PTR() is returned: -ENOMEDIUM or -ESTALE if the volume
 *   has no servers (depending on whether any have rejected it), the error from
 *   registering the callback interest, or the highest-ranked server state
 */
struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *vnode)
{
	struct afs_volume *volume = vnode->volume;
	struct afs_server *server;
	int ret, state, i;

	_enter("%s", volume->vlocation->vldb.name);

	/* prefer to carry on with the server we are already talking to */
	if (vnode->cb_interest && vnode->cb_interest->server->fs_state == 0) {
		afs_get_server(vnode->cb_interest->server);
		_leave(" = %p [current]", vnode->cb_interest->server);
		return vnode->cb_interest->server;
	}

	down_read(&volume->server_sem);

	/* a volume with no servers left cannot be accessed at all */
	if (volume->nservers == 0) {
		ret = volume->rjservers ? -ENOMEDIUM : -ESTALE;
		up_read(&volume->server_sem);
		_leave(" = %d [no servers]", ret);
		return ERR_PTR(ret);
	}

	/* walk the list looking for the first live server, remembering the
	 * most significant failure state seen along the way */
	ret = 0;
	for (i = 0; i < volume->nservers; i++) {
		server = volume->servers[i];
		state = server->fs_state;

		_debug("consider %d [%d]", i, state);

		if (state == 0)
			goto picked_server;

		if (afs_fs_state_rank(ret) < afs_fs_state_rank(state))
			ret = state;
	}

	/* no available servers
	 * - TODO: handle the no active servers case better
	 */
fail:
	up_read(&volume->server_sem);
	_leave(" = %d", ret);
	return ERR_PTR(ret);

picked_server:
	/* This server looks healthy, but an interest in receiving callbacks
	 * from it has to be registered before it is talked to.
	 */
	ret = afs_register_server_cb_interest(vnode,
					      &volume->cb_interests[i],
					      server);
	if (ret < 0)
		goto fail;

	afs_get_server(server);
	up_read(&volume->server_sem);
	_leave(" = %p (picked %pIS)",
	       server, &server->addr.transport);
	return server;
}
/*
 * Release a server after use and record how the access attempt went
 * - @result is the outcome of the operation that was tried on @server
 * - returns 0 if the caller should loop round and pick another server, or 1
 *   if the caller should stop and accept @result (success or error)
 * - if @result is 0 (success), the ref obtained by picking the server is NOT
 *   dropped here; the caller keeps it and must release it itself.  For every
 *   other @result that ref is dropped before returning.
 * - if @result is -ENOMEDIUM, the server is removed from the volume's server
 *   list together with the volume's callback interest in it
 * - if @result indicates the server could not be reached, the server is
 *   marked dead by storing @result in its fs_state (unless already set)
 */
int afs_volume_release_fileserver(struct afs_vnode *vnode,
				  struct afs_server *server,
				  int result)
{
	struct afs_volume *volume = vnode->volume;
	unsigned loop;

	_enter("%s,%pIS,%d",
	       volume->vlocation->vldb.name, &server->addr.transport, result);

	switch (result) {
		/* success */
	case 0:
		server->fs_state = 0;
		_leave("");
		return 1;

		/* the fileserver denied all knowledge of the volume */
	case -ENOMEDIUM:
		down_write(&volume->server_sem);

		/* firstly, find where the server is in the active list (if it
		 * is) */
		for (loop = 0; loop < volume->nservers; loop++)
			if (volume->servers[loop] == server)
				goto present;

		/* no longer there - may have been discarded by another op */
		goto try_next_server_upw;

	present:
		/* servers[] and cb_interests[] are parallel arrays indexed by
		 * the same slot number, so the slot must be removed from both
		 * or the remaining interests would be paired with the wrong
		 * servers and the removed slot's interest would never be put
		 */
		afs_put_cb_interest(afs_v2net(vnode),
				    volume->cb_interests[loop]);
		volume->nservers--;
		memmove(&volume->servers[loop],
			&volume->servers[loop + 1],
			sizeof(volume->servers[loop]) *
			(volume->nservers - loop));
		memmove(&volume->cb_interests[loop],
			&volume->cb_interests[loop + 1],
			sizeof(volume->cb_interests[loop]) *
			(volume->nservers - loop));
		volume->servers[volume->nservers] = NULL;
		volume->cb_interests[volume->nservers] = NULL;
		/* drop the ref the volume's server list held */
		afs_put_server(afs_v2net(vnode), server);
		volume->rjservers++;

		if (volume->nservers > 0)
			/* another server might acknowledge its existence */
			goto try_next_server_upw;

		/* handle the case where all the fileservers have rejected the
		 * volume
		 * - TODO: try asking the fileservers for volume information
		 * - TODO: contact the VL server again to see if the volume is
		 *         no longer registered
		 */
		up_write(&volume->server_sem);
		/* drop the caller's ref obtained when the server was picked */
		afs_put_server(afs_v2net(vnode), server);
		_leave(" [completely rejected]");
		return 1;

		/* problem reaching the server */
	case -ENETUNREACH:
	case -EHOSTUNREACH:
	case -ECONNREFUSED:
	case -ETIME:
	case -ETIMEDOUT:
	case -EREMOTEIO:
		/* mark the server as dead
		 * TODO: vary dead timeout depending on error
		 */
		spin_lock(&server->fs_lock);
		if (!server->fs_state) {
			server->fs_state = result;
			printk("kAFS: SERVER DEAD state=%d\n", result);
		}
		spin_unlock(&server->fs_lock);
		goto try_next_server;

		/* miscellaneous error */
	default:
	case -ENOMEM:
	case -ENONET:
		/* tell the caller to accept the result */
		afs_put_server(afs_v2net(vnode), server);
		_leave(" [local failure]");
		return 1;
	}

	/* tell the caller to loop around and try the next server */
try_next_server_upw:
	up_write(&volume->server_sem);
try_next_server:
	afs_put_server(afs_v2net(vnode), server);
	_leave(" [try next server]");
	return 0;
}