linux_dsm_epyc7002/fs/cifs/smb2misc.c

825 lines
23 KiB
C
Raw Normal View History

/*
* fs/cifs/smb2misc.c
*
* Copyright (C) International Business Machines Corp., 2002,2011
* Etersoft, 2012
* Author(s): Steve French (sfrench@us.ibm.com)
* Pavel Shilovsky (pshilovsky@samba.org) 2012
*
* This library is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
* the GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/ctype.h>
#include "smb2pdu.h"
#include "cifsglob.h"
#include "cifsproto.h"
#include "smb2proto.h"
#include "cifs_debug.h"
#include "cifs_unicode.h"
#include "smb2status.h"
#include "smb2glob.h"
static int
check_smb2_hdr(struct smb2_sync_hdr *shdr, __u64 mid)
{
__u64 wire_mid = le64_to_cpu(shdr->MessageId);
/*
* Make sure that this really is an SMB, that it is a response,
* and that the message ids match.
*/
if ((shdr->ProtocolId == SMB2_PROTO_NUMBER) &&
(mid == wire_mid)) {
if (shdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR)
return 0;
else {
/* only one valid case where server sends us request */
if (shdr->Command == SMB2_OPLOCK_BREAK)
return 0;
else
cifs_dbg(VFS, "Received Request not response\n");
}
} else { /* bad signature or mid */
if (shdr->ProtocolId != SMB2_PROTO_NUMBER)
cifs_dbg(VFS, "Bad protocol string signature header %x\n",
le32_to_cpu(shdr->ProtocolId));
if (mid != wire_mid)
cifs_dbg(VFS, "Mids do not match: %llu and %llu\n",
mid, wire_mid);
}
cifs_dbg(VFS, "Bad SMB detected. The Mid=%llu\n", wire_mid);
return 1;
}
/*
* The following table defines the expected "StructureSize" of SMB2 responses
* in order by SMB2 command. This is similar to "wct" in SMB/CIFS responses.
*
* Note that commands are defined in smb2pdu.h in le16 but the array below is
* indexed by command in host byte order
*/
static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = {
/* SMB2_NEGOTIATE */ cpu_to_le16(65),
/* SMB2_SESSION_SETUP */ cpu_to_le16(9),
/* SMB2_LOGOFF */ cpu_to_le16(4),
/* SMB2_TREE_CONNECT */ cpu_to_le16(16),
/* SMB2_TREE_DISCONNECT */ cpu_to_le16(4),
/* SMB2_CREATE */ cpu_to_le16(89),
/* SMB2_CLOSE */ cpu_to_le16(60),
/* SMB2_FLUSH */ cpu_to_le16(4),
/* SMB2_READ */ cpu_to_le16(17),
/* SMB2_WRITE */ cpu_to_le16(17),
/* SMB2_LOCK */ cpu_to_le16(4),
/* SMB2_IOCTL */ cpu_to_le16(49),
/* BB CHECK this ... not listed in documentation */
/* SMB2_CANCEL */ cpu_to_le16(0),
/* SMB2_ECHO */ cpu_to_le16(4),
/* SMB2_QUERY_DIRECTORY */ cpu_to_le16(9),
/* SMB2_CHANGE_NOTIFY */ cpu_to_le16(9),
/* SMB2_QUERY_INFO */ cpu_to_le16(9),
/* SMB2_SET_INFO */ cpu_to_le16(2),
/* BB FIXME can also be 44 for lease break */
/* SMB2_OPLOCK_BREAK */ cpu_to_le16(24)
};
static __u32 get_neg_ctxt_len(struct smb2_sync_hdr *hdr, __u32 len,
__u32 non_ctxlen)
{
__u16 neg_count;
__u32 nc_offset, size_of_pad_before_neg_ctxts;
struct smb2_negotiate_rsp *pneg_rsp = (struct smb2_negotiate_rsp *)hdr;
/* Negotiate contexts are only valid for latest dialect SMB3.11 */
neg_count = le16_to_cpu(pneg_rsp->NegotiateContextCount);
if ((neg_count == 0) ||
(pneg_rsp->DialectRevision != cpu_to_le16(SMB311_PROT_ID)))
return 0;
/* Make sure that negotiate contexts start after gss security blob */
nc_offset = le32_to_cpu(pneg_rsp->NegotiateContextOffset);
if (nc_offset < non_ctxlen) {
printk_once(KERN_WARNING "invalid negotiate context offset\n");
return 0;
}
size_of_pad_before_neg_ctxts = nc_offset - non_ctxlen;
/* Verify that at least minimal negotiate contexts fit within frame */
if (len < nc_offset + (neg_count * sizeof(struct smb2_neg_context))) {
printk_once(KERN_WARNING "negotiate context goes beyond end\n");
return 0;
}
cifs_dbg(FYI, "length of negcontexts %d pad %d\n",
len - nc_offset, size_of_pad_before_neg_ctxts);
/* length of negcontexts including pad from end of sec blob to them */
return (len - nc_offset) + size_of_pad_before_neg_ctxts;
}
int
smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr)
{
struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
struct smb2_sync_pdu *pdu = (struct smb2_sync_pdu *)shdr;
__u64 mid;
__u32 clc_len; /* calculated length */
int command;
int pdu_size = sizeof(struct smb2_sync_pdu);
int hdr_size = sizeof(struct smb2_sync_hdr);
/*
* Add function to do table lookup of StructureSize by command
* ie Validate the wct via smb2_struct_sizes table above
*/
if (shdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) {
struct smb2_transform_hdr *thdr =
(struct smb2_transform_hdr *)buf;
struct cifs_ses *ses = NULL;
struct list_head *tmp;
/* decrypt frame now that it is completely read in */
spin_lock(&cifs_tcp_ses_lock);
list_for_each(tmp, &srvr->smb_ses_list) {
ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
if (ses->Suid == thdr->SessionId)
break;
ses = NULL;
}
spin_unlock(&cifs_tcp_ses_lock);
if (ses == NULL) {
cifs_dbg(VFS, "no decryption - session id not found\n");
return 1;
}
}
mid = le64_to_cpu(shdr->MessageId);
if (len < pdu_size) {
if ((len >= hdr_size)
&& (shdr->Status != 0)) {
pdu->StructureSize2 = 0;
/*
* As with SMB/CIFS, on some error cases servers may
* not return wct properly
*/
return 0;
} else {
cifs_dbg(VFS, "Length less than SMB header size\n");
}
return 1;
}
if (len > CIFSMaxBufSize + MAX_SMB2_HDR_SIZE) {
cifs_dbg(VFS, "SMB length greater than maximum, mid=%llu\n",
mid);
return 1;
}
if (check_smb2_hdr(shdr, mid))
return 1;
if (shdr->StructureSize != SMB2_HEADER_STRUCTURE_SIZE) {
cifs_dbg(VFS, "Illegal structure size %u\n",
le16_to_cpu(shdr->StructureSize));
return 1;
}
command = le16_to_cpu(shdr->Command);
if (command >= NUMBER_OF_SMB2_COMMANDS) {
cifs_dbg(VFS, "Illegal SMB2 command %d\n", command);
return 1;
}
if (smb2_rsp_struct_sizes[command] != pdu->StructureSize2) {
if (command != SMB2_OPLOCK_BREAK_HE && (shdr->Status == 0 ||
pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2)) {
/* error packets have 9 byte structure size */
cifs_dbg(VFS, "Illegal response size %u for command %d\n",
le16_to_cpu(pdu->StructureSize2), command);
return 1;
} else if (command == SMB2_OPLOCK_BREAK_HE
&& (shdr->Status == 0)
&& (le16_to_cpu(pdu->StructureSize2) != 44)
&& (le16_to_cpu(pdu->StructureSize2) != 36)) {
/* special case for SMB2.1 lease break message */
cifs_dbg(VFS, "Illegal response size %d for oplock break\n",
le16_to_cpu(pdu->StructureSize2));
return 1;
}
}
clc_len = smb2_calc_size(buf, srvr);
if (shdr->Command == SMB2_NEGOTIATE)
clc_len += get_neg_ctxt_len(shdr, len, clc_len);
if (len != clc_len) {
cifs_dbg(FYI, "Calculated size %u length %u mismatch mid %llu\n",
clc_len, len, mid);
/* create failed on symlink */
if (command == SMB2_CREATE_HE &&
shdr->Status == STATUS_STOPPED_ON_SYMLINK)
return 0;
/* Windows 7 server returns 24 bytes more */
if (clc_len + 24 == len && command == SMB2_OPLOCK_BREAK_HE)
return 0;
/* server can return one byte more due to implied bcc[0] */
if (clc_len == len + 1)
return 0;
/*
* Some windows servers (win2016) will pad also the final
* PDU in a compound to 8 bytes.
*/
if (((clc_len + 7) & ~7) == len)
return 0;
/*
* MacOS server pads after SMB2.1 write response with 3 bytes
* of junk. Other servers match RFC1001 len to actual
* SMB2/SMB3 frame length (header + smb2 response specific data)
* Some windows servers do too when compounding is used.
* Log the server error (once), but allow it and continue
* since the frame is parseable.
*/
if (clc_len < len) {
printk_once(KERN_WARNING
"SMB2 server sent bad RFC1001 len %d not %d\n",
len, clc_len);
return 0;
}
return 1;
}
return 0;
}
/*
* The size of the variable area depends on the offset and length fields
* located in different fields for various SMB2 responses. SMB2 responses
* with no variable length info, show an offset of zero for the offset field.
*/
static const bool has_smb2_data_area[NUMBER_OF_SMB2_COMMANDS] = {
/* SMB2_NEGOTIATE */ true,
/* SMB2_SESSION_SETUP */ true,
/* SMB2_LOGOFF */ false,
/* SMB2_TREE_CONNECT */ false,
/* SMB2_TREE_DISCONNECT */ false,
/* SMB2_CREATE */ true,
/* SMB2_CLOSE */ false,
/* SMB2_FLUSH */ false,
/* SMB2_READ */ true,
/* SMB2_WRITE */ false,
/* SMB2_LOCK */ false,
/* SMB2_IOCTL */ true,
/* SMB2_CANCEL */ false, /* BB CHECK this not listed in documentation */
/* SMB2_ECHO */ false,
/* SMB2_QUERY_DIRECTORY */ true,
/* SMB2_CHANGE_NOTIFY */ true,
/* SMB2_QUERY_INFO */ true,
/* SMB2_SET_INFO */ false,
/* SMB2_OPLOCK_BREAK */ false
};
/*
* Returns the pointer to the beginning of the data area. Length of the data
* area and the offset to it (from the beginning of the smb are also returned.
*/
char *
smb2_get_data_area_len(int *off, int *len, struct smb2_sync_hdr *shdr)
{
*off = 0;
*len = 0;
/* error responses do not have data area */
if (shdr->Status && shdr->Status != STATUS_MORE_PROCESSING_REQUIRED &&
(((struct smb2_err_rsp *)shdr)->StructureSize) ==
SMB2_ERROR_STRUCTURE_SIZE2)
return NULL;
/*
* Following commands have data areas so we have to get the location
* of the data buffer offset and data buffer length for the particular
* command.
*/
switch (shdr->Command) {
case SMB2_NEGOTIATE:
*off = le16_to_cpu(
((struct smb2_negotiate_rsp *)shdr)->SecurityBufferOffset);
*len = le16_to_cpu(
((struct smb2_negotiate_rsp *)shdr)->SecurityBufferLength);
break;
case SMB2_SESSION_SETUP:
*off = le16_to_cpu(
((struct smb2_sess_setup_rsp *)shdr)->SecurityBufferOffset);
*len = le16_to_cpu(
((struct smb2_sess_setup_rsp *)shdr)->SecurityBufferLength);
break;
case SMB2_CREATE:
*off = le32_to_cpu(
((struct smb2_create_rsp *)shdr)->CreateContextsOffset);
*len = le32_to_cpu(
((struct smb2_create_rsp *)shdr)->CreateContextsLength);
break;
case SMB2_QUERY_INFO:
*off = le16_to_cpu(
((struct smb2_query_info_rsp *)shdr)->OutputBufferOffset);
*len = le32_to_cpu(
((struct smb2_query_info_rsp *)shdr)->OutputBufferLength);
break;
case SMB2_READ:
/* TODO: is this a bug ? */
*off = ((struct smb2_read_rsp *)shdr)->DataOffset;
*len = le32_to_cpu(((struct smb2_read_rsp *)shdr)->DataLength);
break;
case SMB2_QUERY_DIRECTORY:
*off = le16_to_cpu(
((struct smb2_query_directory_rsp *)shdr)->OutputBufferOffset);
*len = le32_to_cpu(
((struct smb2_query_directory_rsp *)shdr)->OutputBufferLength);
break;
case SMB2_IOCTL:
*off = le32_to_cpu(
((struct smb2_ioctl_rsp *)shdr)->OutputOffset);
*len = le32_to_cpu(
((struct smb2_ioctl_rsp *)shdr)->OutputCount);
break;
case SMB2_CHANGE_NOTIFY:
default:
/* BB FIXME for unimplemented cases above */
cifs_dbg(VFS, "no length check for command\n");
break;
}
/*
* Invalid length or offset probably means data area is invalid, but
* we have little choice but to ignore the data area in this case.
*/
if (*off > 4096) {
cifs_dbg(VFS, "offset %d too large, data area ignored\n", *off);
*len = 0;
*off = 0;
} else if (*off < 0) {
cifs_dbg(VFS, "negative offset %d to data invalid ignore data area\n",
*off);
*off = 0;
*len = 0;
} else if (*len < 0) {
cifs_dbg(VFS, "negative data length %d invalid, data area ignored\n",
*len);
*len = 0;
} else if (*len > 128 * 1024) {
cifs_dbg(VFS, "data area larger than 128K: %d\n", *len);
*len = 0;
}
/* return pointer to beginning of data area, ie offset from SMB start */
if ((*off != 0) && (*len != 0))
return (char *)shdr + *off;
else
return NULL;
}
/*
* Calculate the size of the SMB message based on the fixed header
* portion, the number of word parameters and the data portion of the message.
*/
unsigned int
smb2_calc_size(void *buf, struct TCP_Server_Info *srvr)
{
struct smb2_sync_pdu *pdu = (struct smb2_sync_pdu *)buf;
struct smb2_sync_hdr *shdr = &pdu->sync_hdr;
int offset; /* the offset from the beginning of SMB to data area */
int data_length; /* the length of the variable length data area */
/* Structure Size has already been checked to make sure it is 64 */
int len = le16_to_cpu(shdr->StructureSize);
/*
* StructureSize2, ie length of fixed parameter area has already
* been checked to make sure it is the correct length.
*/
len += le16_to_cpu(pdu->StructureSize2);
if (has_smb2_data_area[le16_to_cpu(shdr->Command)] == false)
goto calc_size_exit;
smb2_get_data_area_len(&offset, &data_length, shdr);
cifs_dbg(FYI, "SMB2 data length %d offset %d\n", data_length, offset);
if (data_length > 0) {
/*
* Check to make sure that data area begins after fixed area,
* Note that last byte of the fixed area is part of data area
* for some commands, typically those with odd StructureSize,
* so we must add one to the calculation.
*/
if (offset + 1 < len) {
cifs_dbg(VFS, "data area offset %d overlaps SMB2 header %d\n",
offset + 1, len);
data_length = 0;
} else {
len = offset + data_length;
}
}
calc_size_exit:
cifs_dbg(FYI, "SMB2 len %d\n", len);
return len;
}
/* Note: caller must free return buffer */
__le16 *
cifs_convert_path_to_utf16(const char *from, struct cifs_sb_info *cifs_sb)
{
int len;
const char *start_of_path;
__le16 *to;
int map_type;
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SFM_CHR)
map_type = SFM_MAP_UNI_RSVD;
else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR)
map_type = SFU_MAP_UNI_RSVD;
else
map_type = NO_MAP_UNI_RSVD;
/* Windows doesn't allow paths beginning with \ */
if (from[0] == '\\')
start_of_path = from + 1;
/* SMB311 POSIX extensions paths do not include leading slash */
else if (cifs_sb_master_tlink(cifs_sb) &&
cifs_sb_master_tcon(cifs_sb)->posix_extensions &&
(from[0] == '/')) {
start_of_path = from + 1;
} else
start_of_path = from;
to = cifs_strndup_to_utf16(start_of_path, PATH_MAX, &len,
cifs_sb->local_nls, map_type);
return to;
}
__le32
smb2_get_lease_state(struct cifsInodeInfo *cinode)
{
__le32 lease = 0;
if (CIFS_CACHE_WRITE(cinode))
lease |= SMB2_LEASE_WRITE_CACHING;
if (CIFS_CACHE_HANDLE(cinode))
lease |= SMB2_LEASE_HANDLE_CACHING;
if (CIFS_CACHE_READ(cinode))
lease |= SMB2_LEASE_READ_CACHING;
return lease;
}
struct smb2_lease_break_work {
struct work_struct lease_break;
struct tcon_link *tlink;
__u8 lease_key[16];
__le32 lease_state;
};
static void
cifs_ses_oplock_break(struct work_struct *work)
{
struct smb2_lease_break_work *lw = container_of(work,
struct smb2_lease_break_work, lease_break);
int rc = 0;
rc = SMB2_lease_break(0, tlink_tcon(lw->tlink), lw->lease_key,
lw->lease_state);
cifs_dbg(FYI, "Lease release rc %d\n", rc);
cifs_put_tlink(lw->tlink);
kfree(lw);
}
static bool
smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
struct smb2_lease_break_work *lw)
{
bool found;
__u8 lease_state;
struct list_head *tmp;
struct cifsFileInfo *cfile;
struct TCP_Server_Info *server = tcon->ses->server;
struct cifs_pending_open *open;
struct cifsInodeInfo *cinode;
int ack_req = le32_to_cpu(rsp->Flags &
SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED);
lease_state = le32_to_cpu(rsp->NewLeaseState);
list_for_each(tmp, &tcon->openFileList) {
cfile = list_entry(tmp, struct cifsFileInfo, tlist);
cinode = CIFS_I(d_inode(cfile->dentry));
if (memcmp(cinode->lease_key, rsp->LeaseKey,
SMB2_LEASE_KEY_SIZE))
continue;
cifs_dbg(FYI, "found in the open list\n");
cifs_dbg(FYI, "lease key match, lease break 0x%x\n",
le32_to_cpu(rsp->NewLeaseState));
server->ops->set_oplock_level(cinode, lease_state, 0, NULL);
if (ack_req)
cfile->oplock_break_cancelled = false;
else
cfile->oplock_break_cancelled = true;
CIFS: fix oplock break deadlocks When the final cifsFileInfo_put() is called from cifsiod and an oplock break work is queued, lockdep complains loudly: ============================================= [ INFO: possible recursive locking detected ] 4.11.0+ #21 Not tainted --------------------------------------------- kworker/0:2/78 is trying to acquire lock: ("cifsiod"){++++.+}, at: flush_work+0x215/0x350 but task is already holding lock: ("cifsiod"){++++.+}, at: process_one_work+0x255/0x8e0 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock("cifsiod"); lock("cifsiod"); *** DEADLOCK *** May be due to missing lock nesting notation 2 locks held by kworker/0:2/78: #0: ("cifsiod"){++++.+}, at: process_one_work+0x255/0x8e0 #1: ((&wdata->work)){+.+...}, at: process_one_work+0x255/0x8e0 stack backtrace: CPU: 0 PID: 78 Comm: kworker/0:2 Not tainted 4.11.0+ #21 Workqueue: cifsiod cifs_writev_complete Call Trace: dump_stack+0x85/0xc2 __lock_acquire+0x17dd/0x2260 ? match_held_lock+0x20/0x2b0 ? trace_hardirqs_off_caller+0x86/0x130 ? mark_lock+0xa6/0x920 lock_acquire+0xcc/0x260 ? lock_acquire+0xcc/0x260 ? flush_work+0x215/0x350 flush_work+0x236/0x350 ? flush_work+0x215/0x350 ? destroy_worker+0x170/0x170 __cancel_work_timer+0x17d/0x210 ? ___preempt_schedule+0x16/0x18 cancel_work_sync+0x10/0x20 cifsFileInfo_put+0x338/0x7f0 cifs_writedata_release+0x2a/0x40 ? cifs_writedata_release+0x2a/0x40 cifs_writev_complete+0x29d/0x850 ? preempt_count_sub+0x18/0xd0 process_one_work+0x304/0x8e0 worker_thread+0x9b/0x6a0 kthread+0x1b2/0x200 ? process_one_work+0x8e0/0x8e0 ? kthread_create_on_node+0x40/0x40 ret_from_fork+0x31/0x40 This is a real warning. Since the oplock is queued on the same workqueue this can deadlock if there is only one worker thread active for the workqueue (which will be the case during memory pressure when the rescuer thread is handling it). Furthermore, there is at least one other kind of hang possible due to the oplock break handling if there is only worker. (This can be reproduced without introducing memory pressure by having passing 1 for the max_active parameter of cifsiod.) cifs_oplock_break() can wait indefintely in the filemap_fdatawait() while the cifs_writev_complete() work is blocked: sysrq: SysRq : Show Blocked State task PC stack pid father kworker/0:1 D 0 16 2 0x00000000 Workqueue: cifsiod cifs_oplock_break Call Trace: __schedule+0x562/0xf40 ? mark_held_locks+0x4a/0xb0 schedule+0x57/0xe0 io_schedule+0x21/0x50 wait_on_page_bit+0x143/0x190 ? add_to_page_cache_lru+0x150/0x150 __filemap_fdatawait_range+0x134/0x190 ? do_writepages+0x51/0x70 filemap_fdatawait_range+0x14/0x30 filemap_fdatawait+0x3b/0x40 cifs_oplock_break+0x651/0x710 ? preempt_count_sub+0x18/0xd0 process_one_work+0x304/0x8e0 worker_thread+0x9b/0x6a0 kthread+0x1b2/0x200 ? process_one_work+0x8e0/0x8e0 ? kthread_create_on_node+0x40/0x40 ret_from_fork+0x31/0x40 dd D 0 683 171 0x00000000 Call Trace: __schedule+0x562/0xf40 ? mark_held_locks+0x29/0xb0 schedule+0x57/0xe0 io_schedule+0x21/0x50 wait_on_page_bit+0x143/0x190 ? add_to_page_cache_lru+0x150/0x150 __filemap_fdatawait_range+0x134/0x190 ? do_writepages+0x51/0x70 filemap_fdatawait_range+0x14/0x30 filemap_fdatawait+0x3b/0x40 filemap_write_and_wait+0x4e/0x70 cifs_flush+0x6a/0xb0 filp_close+0x52/0xa0 __close_fd+0xdc/0x150 SyS_close+0x33/0x60 entry_SYSCALL_64_fastpath+0x1f/0xbe Showing all locks held in the system: 2 locks held by kworker/0:1/16: #0: ("cifsiod"){.+.+.+}, at: process_one_work+0x255/0x8e0 #1: ((&cfile->oplock_break)){+.+.+.}, at: process_one_work+0x255/0x8e0 Showing busy workqueues and worker pools: workqueue cifsiod: flags=0xc pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/1 in-flight: 16:cifs_oplock_break delayed: cifs_writev_complete, cifs_echo_request pool 0: cpus=0 node=0 flags=0x0 nice=0 hung=0s workers=3 idle: 750 3 Fix these problems by creating a a new workqueue (with a rescuer) for the oplock break work. Signed-off-by: Rabin Vincent <rabinv@axis.com> Signed-off-by: Steve French <smfrench@gmail.com> CC: Stable <stable@vger.kernel.org>
2017-05-03 22:54:01 +07:00
queue_work(cifsoplockd_wq, &cfile->oplock_break);
kfree(lw);
return true;
}
found = false;
list_for_each_entry(open, &tcon->pending_opens, olist) {
if (memcmp(open->lease_key, rsp->LeaseKey,
SMB2_LEASE_KEY_SIZE))
continue;
if (!found && ack_req) {
found = true;
memcpy(lw->lease_key, open->lease_key,
SMB2_LEASE_KEY_SIZE);
lw->tlink = cifs_get_tlink(open->tlink);
queue_work(cifsiod_wq, &lw->lease_break);
}
cifs_dbg(FYI, "found in the pending open list\n");
cifs_dbg(FYI, "lease key match, lease break 0x%x\n",
le32_to_cpu(rsp->NewLeaseState));
open->oplock = lease_state;
}
return found;
}
static bool
smb2_is_valid_lease_break(char *buffer)
{
struct smb2_lease_break *rsp = (struct smb2_lease_break *)buffer;
struct list_head *tmp, *tmp1, *tmp2;
struct TCP_Server_Info *server;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
struct smb2_lease_break_work *lw;
lw = kmalloc(sizeof(struct smb2_lease_break_work), GFP_KERNEL);
if (!lw)
return false;
INIT_WORK(&lw->lease_break, cifs_ses_oplock_break);
lw->lease_state = rsp->NewLeaseState;
cifs_dbg(FYI, "Checking for lease break\n");
/* look up tcon based on tid & uid */
spin_lock(&cifs_tcp_ses_lock);
list_for_each(tmp, &cifs_tcp_ses_list) {
server = list_entry(tmp, struct TCP_Server_Info, tcp_ses_list);
list_for_each(tmp1, &server->smb_ses_list) {
ses = list_entry(tmp1, struct cifs_ses, smb_ses_list);
list_for_each(tmp2, &ses->tcon_list) {
tcon = list_entry(tmp2, struct cifs_tcon,
tcon_list);
spin_lock(&tcon->open_file_lock);
cifs_stats_inc(
&tcon->stats.cifs_stats.num_oplock_brks);
if (smb2_tcon_has_lease(tcon, rsp, lw)) {
spin_unlock(&tcon->open_file_lock);
spin_unlock(&cifs_tcp_ses_lock);
return true;
}
spin_unlock(&tcon->open_file_lock);
if (tcon->crfid.is_valid &&
!memcmp(rsp->LeaseKey,
tcon->crfid.fid->lease_key,
SMB2_LEASE_KEY_SIZE)) {
INIT_WORK(&tcon->crfid.lease_break,
smb2_cached_lease_break);
queue_work(cifsiod_wq,
&tcon->crfid.lease_break);
spin_unlock(&cifs_tcp_ses_lock);
return true;
}
}
}
}
spin_unlock(&cifs_tcp_ses_lock);
kfree(lw);
cifs_dbg(FYI, "Can not process lease break - no lease matched\n");
return false;
}
bool
smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
{
struct smb2_oplock_break *rsp = (struct smb2_oplock_break *)buffer;
struct list_head *tmp, *tmp1, *tmp2;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
struct cifsInodeInfo *cinode;
struct cifsFileInfo *cfile;
cifs_dbg(FYI, "Checking for oplock break\n");
if (rsp->sync_hdr.Command != SMB2_OPLOCK_BREAK)
return false;
if (rsp->StructureSize !=
smb2_rsp_struct_sizes[SMB2_OPLOCK_BREAK_HE]) {
if (le16_to_cpu(rsp->StructureSize) == 44)
return smb2_is_valid_lease_break(buffer);
else
return false;
}
cifs_dbg(FYI, "oplock level 0x%x\n", rsp->OplockLevel);
/* look up tcon based on tid & uid */
spin_lock(&cifs_tcp_ses_lock);
list_for_each(tmp, &server->smb_ses_list) {
ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
list_for_each(tmp1, &ses->tcon_list) {
tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks);
spin_lock(&tcon->open_file_lock);
list_for_each(tmp2, &tcon->openFileList) {
cfile = list_entry(tmp2, struct cifsFileInfo,
tlist);
if (rsp->PersistentFid !=
cfile->fid.persistent_fid ||
rsp->VolatileFid !=
cfile->fid.volatile_fid)
continue;
cifs_dbg(FYI, "file id match, oplock break\n");
cinode = CIFS_I(d_inode(cfile->dentry));
spin_lock(&cfile->file_info_lock);
if (!CIFS_CACHE_WRITE(cinode) &&
rsp->OplockLevel == SMB2_OPLOCK_LEVEL_NONE)
cfile->oplock_break_cancelled = true;
else
cfile->oplock_break_cancelled = false;
cifs: Wait for writebacks to complete before attempting write. Problem reported in Red Hat bz 1040329 for strict writes where we cache only when we hold oplock and write direct to the server when we don't. When we receive an oplock break, we first change the oplock value for the inode in cifsInodeInfo->oplock to indicate that we no longer hold the oplock before we enqueue a task to flush changes to the backing device. Once we have completed flushing the changes, we return the oplock to the server. There are 2 ways here where we can have data corruption 1) While we flush changes to the backing device as part of the oplock break, we can have processes write to the file. These writes check for the oplock, find none and attempt to write directly to the server. These direct writes made while we are flushing from cache could be overwritten by data being flushed from the cache causing data corruption. 2) While a thread runs in cifs_strict_writev, the machine could receive and process an oplock break after the thread has checked the oplock and found that it allows us to cache and before we have made changes to the cache. In that case, we end up with a dirty page in cache when we shouldn't have any. This will be flushed later and will overwrite all subsequent writes to the part of the file represented by this page. Before making any writes to the server, we need to confirm that we are not in the process of flushing data to the server and if we are, we should wait until the process is complete before we attempt the write. We should also wait for existing writes to complete before we process an oplock break request which changes oplock values. We add a version specific downgrade_oplock() operation to allow for differences in the oplock values set for the different smb versions. Cc: stable@vger.kernel.org Signed-off-by: Sachin Prabhu <sprabhu@redhat.com> Reviewed-by: Jeff Layton <jlayton@redhat.com> Reviewed-by: Pavel Shilovsky <piastry@etersoft.ru> Signed-off-by: Steve French <smfrench@gmail.com>
2014-03-11 23:11:47 +07:00
set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK,
&cinode->flags);
/*
* Set flag if the server downgrades the oplock
* to L2 else clear.
*/
if (rsp->OplockLevel)
set_bit(
CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
&cinode->flags);
else
clear_bit(
CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
&cinode->flags);
spin_unlock(&cfile->file_info_lock);
CIFS: fix oplock break deadlocks When the final cifsFileInfo_put() is called from cifsiod and an oplock break work is queued, lockdep complains loudly: ============================================= [ INFO: possible recursive locking detected ] 4.11.0+ #21 Not tainted --------------------------------------------- kworker/0:2/78 is trying to acquire lock: ("cifsiod"){++++.+}, at: flush_work+0x215/0x350 but task is already holding lock: ("cifsiod"){++++.+}, at: process_one_work+0x255/0x8e0 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock("cifsiod"); lock("cifsiod"); *** DEADLOCK *** May be due to missing lock nesting notation 2 locks held by kworker/0:2/78: #0: ("cifsiod"){++++.+}, at: process_one_work+0x255/0x8e0 #1: ((&wdata->work)){+.+...}, at: process_one_work+0x255/0x8e0 stack backtrace: CPU: 0 PID: 78 Comm: kworker/0:2 Not tainted 4.11.0+ #21 Workqueue: cifsiod cifs_writev_complete Call Trace: dump_stack+0x85/0xc2 __lock_acquire+0x17dd/0x2260 ? match_held_lock+0x20/0x2b0 ? trace_hardirqs_off_caller+0x86/0x130 ? mark_lock+0xa6/0x920 lock_acquire+0xcc/0x260 ? lock_acquire+0xcc/0x260 ? flush_work+0x215/0x350 flush_work+0x236/0x350 ? flush_work+0x215/0x350 ? destroy_worker+0x170/0x170 __cancel_work_timer+0x17d/0x210 ? ___preempt_schedule+0x16/0x18 cancel_work_sync+0x10/0x20 cifsFileInfo_put+0x338/0x7f0 cifs_writedata_release+0x2a/0x40 ? cifs_writedata_release+0x2a/0x40 cifs_writev_complete+0x29d/0x850 ? preempt_count_sub+0x18/0xd0 process_one_work+0x304/0x8e0 worker_thread+0x9b/0x6a0 kthread+0x1b2/0x200 ? process_one_work+0x8e0/0x8e0 ? kthread_create_on_node+0x40/0x40 ret_from_fork+0x31/0x40 This is a real warning. Since the oplock is queued on the same workqueue this can deadlock if there is only one worker thread active for the workqueue (which will be the case during memory pressure when the rescuer thread is handling it). Furthermore, there is at least one other kind of hang possible due to the oplock break handling if there is only worker. (This can be reproduced without introducing memory pressure by having passing 1 for the max_active parameter of cifsiod.) cifs_oplock_break() can wait indefintely in the filemap_fdatawait() while the cifs_writev_complete() work is blocked: sysrq: SysRq : Show Blocked State task PC stack pid father kworker/0:1 D 0 16 2 0x00000000 Workqueue: cifsiod cifs_oplock_break Call Trace: __schedule+0x562/0xf40 ? mark_held_locks+0x4a/0xb0 schedule+0x57/0xe0 io_schedule+0x21/0x50 wait_on_page_bit+0x143/0x190 ? add_to_page_cache_lru+0x150/0x150 __filemap_fdatawait_range+0x134/0x190 ? do_writepages+0x51/0x70 filemap_fdatawait_range+0x14/0x30 filemap_fdatawait+0x3b/0x40 cifs_oplock_break+0x651/0x710 ? preempt_count_sub+0x18/0xd0 process_one_work+0x304/0x8e0 worker_thread+0x9b/0x6a0 kthread+0x1b2/0x200 ? process_one_work+0x8e0/0x8e0 ? kthread_create_on_node+0x40/0x40 ret_from_fork+0x31/0x40 dd D 0 683 171 0x00000000 Call Trace: __schedule+0x562/0xf40 ? mark_held_locks+0x29/0xb0 schedule+0x57/0xe0 io_schedule+0x21/0x50 wait_on_page_bit+0x143/0x190 ? add_to_page_cache_lru+0x150/0x150 __filemap_fdatawait_range+0x134/0x190 ? do_writepages+0x51/0x70 filemap_fdatawait_range+0x14/0x30 filemap_fdatawait+0x3b/0x40 filemap_write_and_wait+0x4e/0x70 cifs_flush+0x6a/0xb0 filp_close+0x52/0xa0 __close_fd+0xdc/0x150 SyS_close+0x33/0x60 entry_SYSCALL_64_fastpath+0x1f/0xbe Showing all locks held in the system: 2 locks held by kworker/0:1/16: #0: ("cifsiod"){.+.+.+}, at: process_one_work+0x255/0x8e0 #1: ((&cfile->oplock_break)){+.+.+.}, at: process_one_work+0x255/0x8e0 Showing busy workqueues and worker pools: workqueue cifsiod: flags=0xc pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/1 in-flight: 16:cifs_oplock_break delayed: cifs_writev_complete, cifs_echo_request pool 0: cpus=0 node=0 flags=0x0 nice=0 hung=0s workers=3 idle: 750 3 Fix these problems by creating a a new workqueue (with a rescuer) for the oplock break work. Signed-off-by: Rabin Vincent <rabinv@axis.com> Signed-off-by: Steve French <smfrench@gmail.com> CC: Stable <stable@vger.kernel.org>
2017-05-03 22:54:01 +07:00
queue_work(cifsoplockd_wq,
&cfile->oplock_break);
spin_unlock(&tcon->open_file_lock);
spin_unlock(&cifs_tcp_ses_lock);
return true;
}
spin_unlock(&tcon->open_file_lock);
spin_unlock(&cifs_tcp_ses_lock);
cifs_dbg(FYI, "No matching file for oplock break\n");
return true;
}
}
spin_unlock(&cifs_tcp_ses_lock);
cifs_dbg(FYI, "Can not process oplock break for non-existent connection\n");
return false;
}
void
smb2_cancelled_close_fid(struct work_struct *work)
{
struct close_cancelled_open *cancelled = container_of(work,
struct close_cancelled_open, work);
cifs_dbg(VFS, "Close unmatched open\n");
SMB2_close(0, cancelled->tcon, cancelled->fid.persistent_fid,
cancelled->fid.volatile_fid);
cifs_put_tcon(cancelled->tcon);
kfree(cancelled);
}
int
smb2_handle_cancelled_mid(char *buffer, struct TCP_Server_Info *server)
{
struct smb2_sync_hdr *sync_hdr = (struct smb2_sync_hdr *)buffer;
struct smb2_create_rsp *rsp = (struct smb2_create_rsp *)buffer;
struct cifs_tcon *tcon;
struct close_cancelled_open *cancelled;
if (sync_hdr->Command != SMB2_CREATE ||
sync_hdr->Status != STATUS_SUCCESS)
return 0;
cancelled = kzalloc(sizeof(*cancelled), GFP_KERNEL);
if (!cancelled)
return -ENOMEM;
tcon = smb2_find_smb_tcon(server, sync_hdr->SessionId,
sync_hdr->TreeId);
if (!tcon) {
kfree(cancelled);
return -ENOENT;
}
cancelled->fid.persistent_fid = rsp->PersistentFileId;
cancelled->fid.volatile_fid = rsp->VolatileFileId;
cancelled->tcon = tcon;
INIT_WORK(&cancelled->work, smb2_cancelled_close_fid);
queue_work(cifsiod_wq, &cancelled->work);
return 0;
}
/**
* smb311_update_preauth_hash - update @ses hash with the packet data in @iov
*
* Assumes @iov does not contain the rfc1002 length and iov[0] has the
* SMB2 header.
*/
int
smb311_update_preauth_hash(struct cifs_ses *ses, struct kvec *iov, int nvec)
{
int i, rc;
struct sdesc *d;
struct smb2_sync_hdr *hdr;
if (ses->server->tcpStatus == CifsGood) {
/* skip non smb311 connections */
if (ses->server->dialect != SMB311_PROT_ID)
return 0;
/* skip last sess setup response */
hdr = (struct smb2_sync_hdr *)iov[0].iov_base;
if (hdr->Flags & SMB2_FLAGS_SIGNED)
return 0;
}
rc = smb311_crypto_shash_allocate(ses->server);
if (rc)
return rc;
d = ses->server->secmech.sdescsha512;
rc = crypto_shash_init(&d->shash);
if (rc) {
cifs_dbg(VFS, "%s: could not init sha512 shash\n", __func__);
return rc;
}
rc = crypto_shash_update(&d->shash, ses->preauth_sha_hash,
SMB2_PREAUTH_HASH_SIZE);
if (rc) {
cifs_dbg(VFS, "%s: could not update sha512 shash\n", __func__);
return rc;
}
for (i = 0; i < nvec; i++) {
rc = crypto_shash_update(&d->shash,
iov[i].iov_base, iov[i].iov_len);
if (rc) {
cifs_dbg(VFS, "%s: could not update sha512 shash\n",
__func__);
return rc;
}
}
rc = crypto_shash_final(&d->shash, ses->preauth_sha_hash);
if (rc) {
cifs_dbg(VFS, "%s: could not finalize sha512 shash\n",
__func__);
return rc;
}
return 0;
}