[kernel] r19614 - dists/squeeze/linux-2.6/debian/patches/features/all/hpsa

Sun Dec 16 22:31:40 UTC 2012

Author: benh
Date: Sun Dec 16 22:31:39 2012
New Revision: 19614

Log:
hpsa: Backport changes up to Linux 3.2.35 (Closes: #690100), part 2

Added:
   dists/squeeze/linux-2.6/debian/patches/features/all/hpsa/0137-hpsa-Use-LUN-reset-instead-of-target-reset.patch
   dists/squeeze/linux-2.6/debian/patches/features/all/hpsa/0138-hpsa-dial-down-lockup-detection-during-firmware-flas.patch

Added: dists/squeeze/linux-2.6/debian/patches/features/all/hpsa/0137-hpsa-Use-LUN-reset-instead-of-target-reset.patch
==============================================================================

--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ dists/squeeze/linux-2.6/debian/patches/features/all/hpsa/0137-hpsa-Use-LUN-reset-instead-of-target-reset.patch	Sun Dec 16 22:31:39 2012	(r19614)
@@ -0,0 +1,52 @@
+From 2239182005cc209df395cae2316f4e6688642b8d Mon Sep 17 00:00:00 2001
+From: "Stephen M. Cameron" <scameron at beardog.cce.hp.com>
+Date: Thu, 26 Jul 2012 11:34:10 -0500
+Subject: [PATCH 137/138] hpsa: Use LUN reset instead of target reset
+
+commit 21e89afd325849eb38adccf382df16cc895911f9 upstream.
+
+It turns out Smart Array logical drives do not support target
+reset and when the target reset fails, the logical drive will
+be taken off line.  Symptoms look like this:
+
+hpsa 0000:03:00.0: Abort request on C1:B0:T0:L0
+hpsa 0000:03:00.0: resetting device 1:0:0:0
+hpsa 0000:03:00.0: cp ffff880037c56000 is reported invalid (probably means target device no longer present)
+hpsa 0000:03:00.0: resetting device failed.
+sd 1:0:0:0: Device offlined - not ready after error recovery
+sd 1:0:0:0: rejecting I/O to offline device
+EXT3-fs error (device sdb1): read_block_bitmap:
+
+LUN reset is supported though, and is what we should be using.
+Target reset is also disruptive in shared SAS situations,
+for example, an external MSA1210m which does support target
+reset attached to Smart Arrays in multiple hosts -- a target
+reset from one host is disruptive to other hosts as all LUNs
+on the target will be reset and will abort all outstanding i/os
+back to all the attached hosts.  So we should use LUN reset,
+not target reset.
+
+Tested this with Smart Array logical drives and with tape drives.
+Not sure how this bug survived since 2009, except it must be very
+rare for a Smart Array to require more than 30s to complete a request.
+
+Signed-off-by: Stephen M. Cameron <scameron at beardog.cce.hp.com>
+Signed-off-by: James Bottomley <JBottomley at Parallels.com>
+Signed-off-by: Ben Hutchings <ben at decadent.org.uk>
+---
+ drivers/scsi/hpsa.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c
+index be9aad8..a133724 100644
+--- a/drivers/scsi/hpsa.c
++++ b/drivers/scsi/hpsa.c
+@@ -2926,7 +2926,7 @@ static void fill_cmd(struct CommandList *c, u8 cmd, struct ctlr_info *h,
+ 			c->Request.Timeout = 0; /* Don't time out */
+ 			memset(&c->Request.CDB[0], 0, sizeof(c->Request.CDB));
+ 			c->Request.CDB[0] =  cmd;
+-			c->Request.CDB[1] = 0x03;  /* Reset target above */
++			c->Request.CDB[1] = HPSA_RESET_TYPE_LUN;
+ 			/* If bytes 4-7 are zero, it means reset the */
+ 			/* LunID device */
+ 			c->Request.CDB[4] = 0x00;

Added: dists/squeeze/linux-2.6/debian/patches/features/all/hpsa/0138-hpsa-dial-down-lockup-detection-during-firmware-flas.patch
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ dists/squeeze/linux-2.6/debian/patches/features/all/hpsa/0138-hpsa-dial-down-lockup-detection-during-firmware-flas.patch	Sun Dec 16 22:31:39 2012	(r19614)
@@ -0,0 +1,144 @@
+From def274ac605a69628f7d0d2d666484d209b0b6ee Mon Sep 17 00:00:00 2001
+From: "Stephen M. Cameron" <scameron at beardog.cce.hp.com>
+Date: Tue, 1 May 2012 11:43:42 -0500
+Subject: [PATCH 138/138] hpsa: dial down lockup detection during firmware flash
+
+commit e85c59746957fd6e3595d02cf614370056b5816e upstream.
+
+Dial back the aggressiveness of the controller lockup detection thread.
+Currently it will declare the controller to be locked up if it goes
+for 10 seconds with no interrupts and no change in the heartbeat
+register.  Dial this back to 30 seconds with no heartbeat change, and
+also snoop the ioctl path and if a firmware flash command is detected,
+dial it back further to 4 minutes until the firmware flash command
+completes.  The reason for this is that during the firmware flash
+operation, the controller apparently doesn't update the heartbeat
+register as frequently as it is supposed to, and we can get a false
+positive.
+
+Signed-off-by: Stephen M. Cameron <scameron at beardog.cce.hp.com>
+Signed-off-by: James Bottomley <JBottomley at Parallels.com>
+[bwh: Backported to 3.2: adjust context]
+Signed-off-by: Ben Hutchings <ben at decadent.org.uk>
+---
+ drivers/scsi/hpsa.c     |   39 ++++++++++++++++++++++++++++++++++-----
+ drivers/scsi/hpsa.h     |    2 ++
+ drivers/scsi/hpsa_cmd.h |    1 +
+ 3 files changed, 37 insertions(+), 5 deletions(-)
+
+diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c
+index a133724..22523aa 100644
+--- a/drivers/scsi/hpsa.c
++++ b/drivers/scsi/hpsa.c
+@@ -532,12 +532,42 @@ static void set_performant_mode(struct ctlr_info *h, struct CommandList *c)
+ 		c->busaddr |= 1 | (h->blockFetchTable[c->Header.SGList] << 1);
+ }
+ 
++static int is_firmware_flash_cmd(u8 *cdb)
++{
++	return cdb[0] == BMIC_WRITE && cdb[6] == BMIC_FLASH_FIRMWARE;
++}
++
++/*
++ * During firmware flash, the heartbeat register may not update as frequently
++ * as it should.  So we dial down lockup detection during firmware flash, and
++ * dial it back up when firmware flash completes.
++ */
++#define HEARTBEAT_SAMPLE_INTERVAL_DURING_FLASH (240 * HZ)
++#define HEARTBEAT_SAMPLE_INTERVAL (30 * HZ)
++static void dial_down_lockup_detection_during_fw_flash(struct ctlr_info *h,
++		struct CommandList *c)
++{
++	if (!is_firmware_flash_cmd(c->Request.CDB))
++		return;
++	atomic_inc(&h->firmware_flash_in_progress);
++	h->heartbeat_sample_interval = HEARTBEAT_SAMPLE_INTERVAL_DURING_FLASH;
++}
++
++static void dial_up_lockup_detection_on_fw_flash_complete(struct ctlr_info *h,
++		struct CommandList *c)
++{
++	if (is_firmware_flash_cmd(c->Request.CDB) &&
++		atomic_dec_and_test(&h->firmware_flash_in_progress))
++		h->heartbeat_sample_interval = HEARTBEAT_SAMPLE_INTERVAL;
++}
++
+ static void enqueue_cmd_and_start_io(struct ctlr_info *h,
+ 	struct CommandList *c)
+ {
+ 	unsigned long flags;
+ 
+ 	set_performant_mode(h, c);
++	dial_down_lockup_detection_during_fw_flash(h, c);
+ 	spin_lock_irqsave(&h->lock, flags);
+ 	addQ(&h->reqQ, c);
+ 	h->Qdepth++;
+@@ -3032,6 +3062,7 @@ static inline int bad_tag(struct ctlr_info *h, u32 tag_index,
+ static inline void finish_cmd(struct CommandList *c, u32 raw_tag)
+ {
+ 	removeQ(c);
++	dial_up_lockup_detection_on_fw_flash_complete(c->h, c);
+ 	if (likely(c->cmd_type == CMD_SCSI))
+ 		complete_scsi_command(c);
+ 	else if (c->cmd_type == CMD_IOCTL_PEND)
+@@ -4172,9 +4203,6 @@ static void controller_lockup_detected(struct ctlr_info *h)
+ 	spin_unlock_irqrestore(&h->lock, flags);
+ }
+ 
+-#define HEARTBEAT_SAMPLE_INTERVAL (10 * HZ)
+-#define HEARTBEAT_CHECK_MINIMUM_INTERVAL (HEARTBEAT_SAMPLE_INTERVAL / 2)
+-
+ static void detect_controller_lockup(struct ctlr_info *h)
+ {
+ 	u64 now;
+@@ -4185,7 +4213,7 @@ static void detect_controller_lockup(struct ctlr_info *h)
+ 	now = get_jiffies_64();
+ 	/* If we've received an interrupt recently, we're ok. */
+ 	if (time_after64(h->last_intr_timestamp +
+-				(HEARTBEAT_CHECK_MINIMUM_INTERVAL), now))
++				(h->heartbeat_sample_interval), now))
+ 		return;
+ 
+ 	/*
+@@ -4194,7 +4222,7 @@ static void detect_controller_lockup(struct ctlr_info *h)
+ 	 * otherwise don't care about signals in this thread.
+ 	 */
+ 	if (time_after64(h->last_heartbeat_timestamp +
+-				(HEARTBEAT_CHECK_MINIMUM_INTERVAL), now))
++				(h->heartbeat_sample_interval), now))
+ 		return;
+ 
+ 	/* If heartbeat has not changed since we last looked, we're not ok. */
+@@ -4236,6 +4264,7 @@ static void add_ctlr_to_lockup_detector_list(struct ctlr_info *h)
+ {
+ 	unsigned long flags;
+ 
++	h->heartbeat_sample_interval = HEARTBEAT_SAMPLE_INTERVAL;
+ 	spin_lock_irqsave(&lockup_detector_lock, flags);
+ 	list_add_tail(&h->lockup_list, &hpsa_ctlr_list);
+ 	spin_unlock_irqrestore(&lockup_detector_lock, flags);
+diff --git a/drivers/scsi/hpsa.h b/drivers/scsi/hpsa.h
+index 91edafb..c721509 100644
+--- a/drivers/scsi/hpsa.h
++++ b/drivers/scsi/hpsa.h
+@@ -124,6 +124,8 @@ struct ctlr_info {
+ 	u64 last_intr_timestamp;
+ 	u32 last_heartbeat;
+ 	u64 last_heartbeat_timestamp;
++	u32 heartbeat_sample_interval;
++	atomic_t firmware_flash_in_progress;
+ 	u32 lockup_detected;
+ 	struct list_head lockup_list;
+ };
+diff --git a/drivers/scsi/hpsa_cmd.h b/drivers/scsi/hpsa_cmd.h
+index 3fd4715..e4ea0a3 100644
+--- a/drivers/scsi/hpsa_cmd.h
++++ b/drivers/scsi/hpsa_cmd.h
+@@ -163,6 +163,7 @@ struct SenseSubsystem_info {
+ #define BMIC_WRITE 0x27
+ #define BMIC_CACHE_FLUSH 0xc2
+ #define HPSA_CACHE_FLUSH 0x01	/* C2 was already being used by HPSA */
++#define BMIC_FLASH_FIRMWARE 0xF7
+ 
+ /* Command List Structure */
+ union SCSI3Addr {