[MLX4] on catastrophic error, dump error buffer before reset. [mlnx: 4636]
hw/mlx4/kernel/bus/net/catas.c
/*
 * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
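
/*
 * catas.c - catastrophic error handling for the MLX4 bus driver.
 *
 * A kernel timer periodically polls the HCA's internal error buffer.
 * When a catastrophic error is detected, the buffer is dumped to the
 * log, the HCA is reset, and registered clients are notified through
 * the reset-event machinery below.
 */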

#include "mlx4.h"

enum {
	MLX4_CATAS_POLL_INTERVAL	= 5 * HZ,
};

static DEFINE_SPINLOCK(catas_lock);
static LIST_HEAD(catas_list);

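/*
 * Reset-notification protocol, as encoded in the IB_IVH_* handler flags:
 *
 *   IB_IVH_RESET_CB        - the handler wants reset notifications at all.
 *   IB_IVH_NOTIF_READY     - the handler is currently able to receive one.
 *   IB_IVH_NOTIFIED        - the handler has already been notified; used
 *                            to suppress duplicate delivery.
 *   IB_IVH_RESET_D_PENDING / IB_IVH_RESET_C_PENDING
 *                          - a driver- or client-initiated reset event
 *                            arrived while the handler was not ready; it
 *                            is delivered later from mlx4_reset_ready().
 */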
void mlx4_dispatch_reset_event(struct ib_device *ibdev, enum ib_event_type type)
{
	unsigned long flags;
	struct ib_event event;
	struct ib_event_handler *handler;

	event.device = ibdev;
	event.event = type;

	spin_lock_irqsave(&ibdev->event_handler_lock, &flags);

	list_for_each_entry(handler, &ibdev->event_handler_list, list, struct ib_event_handler)
	{
		// notify only soft reset handlers
		if ( handler->flags & IB_IVH_RESET_CB )
			// notify only those that have not been notified yet
			if ( !(handler->flags & IB_IVH_NOTIFIED) ) {
				// notify only those that are ready to receive the notification
				if ( handler->flags & IB_IVH_NOTIF_READY ) {
					// ensure the handler is not notified twice
					handler->flags |= IB_IVH_NOTIFIED;
					handler->flags &= ~(IB_IVH_NOTIF_READY |
						IB_IVH_RESET_D_PENDING | IB_IVH_RESET_C_PENDING);
					handler->handler(handler, &event);
				}
				else {
					// pend the notification until the handler is ready
					if (type == IB_EVENT_RESET_DRIVER)
						handler->flags |= IB_IVH_RESET_D_PENDING;
					else
						handler->flags |= IB_IVH_RESET_C_PENDING;
				}
			}
	}

	spin_unlock_irqrestore(&ibdev->event_handler_lock, flags);
}

/**
 * get_event_handlers - move the device's reset handlers to a private list
 * @device: the IB device
 * @tlist: list that receives the removed handlers
 *
 * get_event_handlers() removes all reset-callback event handlers from the
 * device and puts them on 'tlist'.
 */
static void get_event_handlers(struct ib_device *device, struct list_head *tlist)
{
	unsigned long flags;
	struct ib_event_handler *handler, *thandler;

	spin_lock_irqsave(&device->event_handler_lock, &flags);

	list_for_each_entry_safe(handler, thandler, &device->event_handler_list,
		list, struct ib_event_handler, struct ib_event_handler)
	{
		// take out only reset callbacks
		if ( handler->flags & IB_IVH_RESET_CB ) {
			list_del( &handler->list );
			list_add_tail( &handler->list, tlist );
		}
	}

	spin_unlock_irqrestore(&device->event_handler_lock, flags);
}

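/*
 * Dump the firmware's internal error buffer. The buffer is mapped in
 * mlx4_start_catas_poll(); priv->fw.catas_size counts 32-bit words
 * (note the "* 4" in the ioremap() call there), and each word is
 * byte-swapped with swab32(), the firmware keeping them big-endian.
 */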
static void dump_err_buf(struct mlx4_dev *dev)
{
	struct mlx4_priv *priv = mlx4_priv(dev);

	u32 i;

	mlx4_err(dev, "Internal error detected:\n");
	for (i = 0; i < priv->fw.catas_size; ++i)
		mlx4_warn(dev, "  buf[%02x]: %08x\n",
			 i, swab32(readl(priv->catas_err.map + i)));
}

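/*
 * Work item that performs the actual recovery: dump the error buffer,
 * reset the HCA exactly once (guarded by an interlocked compare-exchange
 * on dev->reset_pending), bar the device, and notify the clients.
 */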
static void
catas_reset_wi(
	IN				DEVICE_OBJECT*				p_dev_obj,
	IN				struct mlx4_dev *			dev )
{
	NTSTATUS status;
	long do_reset;
	UNUSED_PARAM(p_dev_obj);

	dump_err_buf(dev);

	do_reset = InterlockedCompareExchange(&dev->reset_pending, 1, 0);
	if (do_reset == 0) {
		status = mlx4_reset(dev);
		if ( !NT_SUCCESS( status ) ) {
			mlx4_err(dev, "Failed to reset HCA, aborting (status %#x)\n", status);
		}

		dev->flags |= MLX4_FLAG_RESET_DRIVER;	// bar the device
	}

	mlx4_dispatch_event(dev, MLX4_EVENT_TYPE_LOCAL_CATAS_ERROR, 0, 0);
	if (dev->pdev->ib_dev)
		mlx4_dispatch_reset_event(dev->pdev->ib_dev, IB_EVENT_RESET_DRIVER);
}

/* polling on DISPATCH_LEVEL */
static void poll_catas(struct mlx4_dev *dev)
{
	struct mlx4_priv *priv = mlx4_priv(dev);

	if (readl(priv->catas_err.map)) {
		mlx4_warn(dev, "Detected catastrophic error on mdev %p\n", dev);
		IoQueueWorkItem( priv->catas_err.catas_work, catas_reset_wi, DelayedWorkQueue, dev );
	} else {
		spin_lock_dpc(&catas_lock);
		if (!priv->catas_err.stop) {
			KeSetTimerEx( &priv->catas_err.timer, priv->catas_err.interval,
				0, &priv->catas_err.timer_dpc );
		}
		spin_unlock_dpc(&catas_lock);
	}
}

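/*
 * Timer DPC - runs at DISPATCH_LEVEL every MLX4_CATAS_POLL_INTERVAL.
 * Note that the timer is rearmed only on the no-error path of
 * poll_catas(); once an error is detected, polling stops and recovery
 * proceeds in the work item above.
 */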
static void timer_dpc(
	IN struct _KDPC  *Dpc,
	IN PVOID  DeferredContext,
	IN PVOID  SystemArgument1,
	IN PVOID  SystemArgument2
	)
{
	struct mlx4_dev *dev = (struct mlx4_dev *)DeferredContext;
	UNREFERENCED_PARAMETER(Dpc);
	UNREFERENCED_PARAMETER(SystemArgument1);
	UNREFERENCED_PARAMETER(SystemArgument2);
	poll_catas( dev );
}

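/*
 * The timer due time set below is relative (negative) and expressed in
 * 100-nanosecond units, per KeSetTimerEx(). Multiplying by -10 assumes
 * MLX4_CATAS_POLL_INTERVAL (5 * HZ) is expressed in microseconds in
 * this compatibility layer, which yields a 5-second polling period.
 */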
int mlx4_start_catas_poll(struct mlx4_dev *dev)
{
	struct mlx4_priv *priv = mlx4_priv(dev);
	u64 addr;
	int err;

	priv->catas_err.map = NULL;

	addr = pci_resource_start(dev->pdev, priv->fw.catas_bar) +
		priv->fw.catas_offset;

	priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4);
	if (!priv->catas_err.map) {
		mlx4_warn(dev, "Failed to map internal error buffer at 0x%I64x\n",
			  addr);
		err = -ENOMEM;
		goto err_map;
	}

	priv->catas_err.catas_work = IoAllocateWorkItem( dev->pdev->p_self_do );
	if (!priv->catas_err.catas_work) {
		mlx4_warn(dev, "Failed to allocate work item for error polling\n");
		err = -ENOMEM;
		goto err_alloc;
	}

	priv->catas_err.stop = 0;
	spin_lock_init( &catas_lock );
	KeInitializeDpc( &priv->catas_err.timer_dpc, timer_dpc, dev );
	KeInitializeTimer( &priv->catas_err.timer );
	priv->catas_err.interval.QuadPart = (-10) * (__int64)MLX4_CATAS_POLL_INTERVAL;
	KeSetTimerEx( &priv->catas_err.timer, priv->catas_err.interval,
		0, &priv->catas_err.timer_dpc );
	return 0;

err_alloc:
	iounmap(priv->catas_err.map, priv->fw.catas_size * 4);
err_map:
	return err;
}

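/*
 * Stop polling: mark the poll stopped under catas_lock so poll_catas()
 * will not rearm the timer, cancel the timer, and flush queued DPCs so
 * that a concurrently running timer_dpc() has finished before the error
 * buffer is unmapped and the work item is freed.
 */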
void mlx4_stop_catas_poll(struct mlx4_dev *dev)
{
	struct mlx4_priv *priv = mlx4_priv(dev);

	spin_lock_irq(&catas_lock);
	if (priv->catas_err.stop) {
		spin_unlock_irq(&catas_lock);
		return;
	}
	priv->catas_err.stop = 1;
	spin_unlock_irq(&catas_lock);

	KeCancelTimer(&priv->catas_err.timer);
	KeFlushQueuedDpcs();
	if (priv->catas_err.map)
		iounmap(priv->catas_err.map, priv->fw.catas_size * 4);

	if (priv->catas_err.catas_work)
		IoFreeWorkItem( priv->catas_err.catas_work );
}

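/*
 * Mark the calling handler reset-ready and return the number of
 * reset-callback handlers that are not ready yet. A return value of 0
 * means the caller is the last client and may proceed with the reset.
 */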
static int wait4reset(struct ib_event_handler *event_handler)
{
	int n_not_ready = 0;
	unsigned long flags;
	struct ib_event_handler *handler;
	struct ib_device *ibdev = event_handler->device;

	spin_lock_irqsave(&ibdev->event_handler_lock, &flags);

	// mark this handler (= client) as reset-ready
	event_handler->flags |= IB_IVH_RESET_READY;

	// count the clients that are not ready yet
	list_for_each_entry(handler, &ibdev->event_handler_list, list, struct ib_event_handler)
		if ( handler->flags & IB_IVH_RESET_CB )
			if ( !(handler->flags & IB_IVH_RESET_READY) )
				++n_not_ready;

	spin_unlock_irqrestore(&ibdev->event_handler_lock, flags);

	return n_not_ready;
}

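/*
 * Called by a client (at PASSIVE_LEVEL) when it is ready to receive a
 * reset notification. If a reset event was pended while the client was
 * not ready, it is delivered here.
 */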
int mlx4_reset_ready( struct ib_event_handler *event_handler )
{
	unsigned long flags;
	struct ib_device *ibdev = event_handler->device;

	ASSERT(KeGetCurrentIrql() == PASSIVE_LEVEL);

	spin_lock_irqsave(&ibdev->event_handler_lock, &flags);
	event_handler->flags |= IB_IVH_NOTIF_READY;
	spin_unlock_irqrestore(&ibdev->event_handler_lock, flags);
	if (event_handler->flags & IB_IVH_RESET_D_PENDING)
		mlx4_dispatch_reset_event(ibdev, IB_EVENT_RESET_DRIVER);
	else if (event_handler->flags & IB_IVH_RESET_C_PENDING)
		mlx4_dispatch_reset_event(ibdev, IB_EVENT_RESET_CLIENT);
	return 0;
}

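/*
 * Called by every client once it has quiesced. Only the last client to
 * become reset-ready (wait4reset() == 0) performs the restart; it bars
 * the device, removes all reset handlers from the device list, restarts
 * the HCA, and then notifies the clients with IB_EVENT_RESET_END (or
 * IB_EVENT_RESET_FAILED), after which each client re-registers itself.
 */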
int mlx4_reset_execute( struct ib_event_handler *event_handler )
{
	int err;
	struct ib_event event;
	struct list_head tlist;
	struct ib_event_handler *handler, *thandler;
	struct ib_device *ibdev = event_handler->device;
	struct pci_dev *pdev = ibdev->dma_device->pdev;

	// mark the client as "ready for reset" and check whether we can reset now
	if (wait4reset(event_handler)) {
		return 0;
	}

	// fully bar the device
	ibdev->dma_device->flags |= MLX4_FLAG_RESET_STARTED;

	// get the old handler list
	INIT_LIST_HEAD(&tlist);
	get_event_handlers(ibdev, &tlist);

	// restart the device
	mlx4_info(pdev->dev, "\n Performing HCA restart ... \n\n");
	WriteEventLogEntryData( pdev->p_self_do, (ULONG)EVENT_MLX4_INFO_RESET_START, 0, 0, 0 );
	err = mlx4_restart_one(pdev);
	if (err || mlx4_is_livefish(pdev->dev)) {
		event.event = IB_EVENT_RESET_FAILED;
		mlx4_err(pdev->dev, "\n HCA restart failed. \n\n");
	}
	else {
		// recreate the interfaces
		fix_bus_ifc(pdev);
		event.event = IB_EVENT_RESET_END;
		mlx4_info(pdev->dev, "\n HCA restart finished. Notifying the clients ... \n\n");
		WriteEventLogEntryData( pdev->p_self_do, (ULONG)EVENT_MLX4_INFO_RESET_END, 0, 0, 0 );
	}

	// notify the clients
	list_for_each_entry_safe(handler, thandler, &tlist,
		list, struct ib_event_handler, struct ib_event_handler)
	{
		// remove the handler first: it will re-register itself during the callback
		list_del( &handler->list );
		handler->handler(handler, &event);
	}

	return err;
}

static void
card_reset_wi(
	IN				DEVICE_OBJECT*				p_dev_obj,
	IN				struct ib_event_handler *	event_handler )
{
	struct ib_device *ibdev = event_handler->device;

	UNUSED_PARAM(p_dev_obj);
	IoFreeWorkItem( event_handler->rsrv_ptr );

	// notify the clients
	mlx4_dispatch_reset_event(ibdev, IB_EVENT_RESET_CLIENT);
}

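/*
 * Client-initiated reset request. The device is barred immediately;
 * the notification itself is deferred to a system worker thread so
 * that operations already in progress can complete first.
 */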
int mlx4_reset_request( struct ib_event_handler *event_handler )
{
	struct ib_device *ibdev;
	struct mlx4_dev *dev;

	unsigned long flags;

	ibdev = event_handler->device;
	if (ibdev == NULL)
		return -EFAULT;

	dev = ibdev->dma_device;
	if (dev == NULL)
		return -EFAULT;

	spin_lock_irqsave(&ibdev->event_handler_lock, &flags);

	// set the device to RESET_PENDING mode
	if (!(dev->flags & (MLX4_FLAG_RESET_CLIENT | MLX4_FLAG_RESET_DRIVER))) {
		PIO_WORKITEM reset_work;

		// bar the device
		dev->flags |= MLX4_FLAG_RESET_CLIENT;

		// delay the reset to a system thread
		// to allow operations that are in progress to complete
		reset_work = IoAllocateWorkItem( dev->pdev->p_self_do );
		if (!reset_work) {
			spin_unlock_irqrestore(&ibdev->event_handler_lock, flags);
			mlx4_err(dev, "mlx4_reset_request: IoAllocateWorkItem failed, reset will not be propagated\n");
			return -EFAULT;
		}
		event_handler->rsrv_ptr = reset_work;
		IoQueueWorkItem( reset_work, card_reset_wi, DelayedWorkQueue, event_handler );
	}

	spin_unlock_irqrestore(&ibdev->event_handler_lock, flags);

	return 0;
}

int mlx4_reset_cb_register( struct ib_event_handler *event_handler )
{
	if (mlx4_is_in_reset(event_handler->device->dma_device))
		return -EBUSY;

	return ib_register_event_handler(event_handler);
}

int mlx4_reset_cb_unregister( struct ib_event_handler *event_handler )
{
	return ib_unregister_event_handler(event_handler);
}
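
/*
 * A minimal sketch of the client-side protocol implied by the API above
 * (hypothetical client code; the callback name and the quiescing step
 * are illustrative, not part of this driver):
 *
 *	static void client_reset_cb(struct ib_event_handler *h,
 *				    struct ib_event *event)
 *	{
 *		switch (event->event) {
 *		case IB_EVENT_RESET_DRIVER:
 *		case IB_EVENT_RESET_CLIENT:
 *			// quiesce I/O, then volunteer for the restart;
 *			// the last reset-ready client performs it
 *			mlx4_reset_execute( h );
 *			break;
 *		case IB_EVENT_RESET_END:
 *			// the handler was removed from the device list
 *			// before this callback; re-register and resume
 *			mlx4_reset_cb_register( h );
 *			break;
 *		case IB_EVENT_RESET_FAILED:
 *		default:
 *			break;
 *		}
 *	}
 *
 * A client registers with mlx4_reset_cb_register(), calls
 * mlx4_reset_ready() once it can accept notifications (which also
 * delivers any pended reset event), and may itself trigger a reset
 * with mlx4_reset_request().
 */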