[MLX4] Soft Reset: (bugfix in error flow) handling a case when MLX4_BUS fails during...
[mirror/winof/.git] / hw / mlx4 / kernel / bus / net / catas.c
1 /*
2  * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32
33 #include "mlx4.h"
34
35 enum {
36         MLX4_CATAS_POLL_INTERVAL        = 5 * HZ,
37 };
38
39 static DEFINE_SPINLOCK(catas_lock);
40 static LIST_HEAD(catas_list);
41
// TODO: put into Globals
// Reset device on internal errors if non-zero (disabled here by default,
// i.e. 0; the upstream Linux driver's default is 1)
44 int g_internal_err_reset = 0;
45
46 static void dispatch_event(struct ib_device *ibdev, enum ib_event_type type)
47 {
48         unsigned long flags;
49         struct ib_event event;
50         struct ib_event_handler *handler;
51
52         event.device = ibdev;
53         event.event = type;
54
55         spin_lock_irqsave(&ibdev->event_handler_lock, &flags);
56
57         list_for_each_entry(handler, &ibdev->event_handler_list, list, struct ib_event_handler)
58         {
59                 // notify only those, that are not notified
60                 if ( handler->flags & IB_IVH_RESET_CB )
61                         if ( !(handler->flags & IB_IVH_NOTIFIED) ) {
62                                 handler->flags |= IB_IVH_NOTIFIED;
63                                 handler->handler(handler, &event);
64                         }
65         }
66
67         spin_unlock_irqrestore(&ibdev->event_handler_lock, flags);
68 }
69
70 /**
71  * get_event_handlers - return list of handlers of the device
72  * @device:device
73  * @tlist:list
74  *
75  * get_event_handlers() remove all the device event handlers and put them in 'tlist'
76  */
77 static void get_event_handlers(struct ib_device *device, struct list_head *tlist)
78 {
79         unsigned long flags;
80         struct ib_event_handler *handler, *thandler;
81
82         spin_lock_irqsave(&device->event_handler_lock, &flags);
83
84         list_for_each_entry_safe(handler, thandler, &device->event_handler_list, 
85                 list, struct ib_event_handler, struct ib_event_handler)
86         {
87                 // take out only reset callbacks
88                 if ( handler->flags & IB_IVH_RESET_CB ) {
89                         list_del( &handler->list );
90                         list_add_tail( &handler->list, tlist );
91                 }
92         }
93
94         spin_unlock_irqrestore(&device->event_handler_lock, flags);
95 }
96
97
98 static void dump_err_buf(struct mlx4_dev *dev)
99 {
100         struct mlx4_priv *priv = mlx4_priv(dev);
101
102         u32 i;
103
104         mlx4_err(dev, "Internal error detected:\n");
105         for (i = 0; i < priv->fw.catas_size; ++i)
106                 mlx4_err(dev, "  buf[%02x]: %08x\n",
107                          i, swab32(readl(priv->catas_err.map + i)));
108 }
109
110 static void catas_reset()
111 {
112         struct mlx4_priv *priv, *tmppriv;
113         struct mlx4_dev *dev;
114         struct list_head tlist;
115         int ret;
116
117         INIT_LIST_HEAD(&tlist);
118         spin_lock_irq(&catas_lock);
119         list_splice_init(&catas_list, &tlist);
120         spin_unlock_irq(&catas_lock);
121
122         list_for_each_entry_safe(priv, tmppriv, &tlist, catas_err.list, struct mlx4_priv, struct mlx4_priv) {
123                 ret = mlx4_restart_one(priv->dev.pdev);
124                 dev = &priv->dev;
125                 if (ret)
126                         mlx4_err(dev, "Reset failed (%d)\n", ret);
127                 else
128                         mlx4_dbg(dev, "Reset succeeded\n");
129         }
130 }
131
132 static void
133 catas_reset_wi(
134         IN                              DEVICE_OBJECT*                          p_dev_obj,
135         IN                              void*                                           context )
136 {
137         UNUSED_PARAM(p_dev_obj);
138         IoFreeWorkItem( context );
139         catas_reset();
140 }
141
142 /* polling on DISPATCH_LEVEL */
143 static void poll_catas(struct mlx4_dev *dev)
144 {
145         struct mlx4_priv *priv = mlx4_priv(dev);
146
147         if (readl(priv->catas_err.map)) {
148                 dump_err_buf(dev);
149
150                 mlx4_dispatch_event(dev, MLX4_EVENT_TYPE_LOCAL_CATAS_ERROR, 0, 0);
151
152                 // bar the device
153                 dev->flags |= MLX4_FLAG_RESET_DRIVER;
154
155                 // notify the clients
156                 dispatch_event(dev->pdev->ib_dev, IB_EVENT_RESET_DRIVER);
157
158                 if (g_internal_err_reset) {
159                         PIO_WORKITEM catas_work = IoAllocateWorkItem( dev->pdev->p_self_do );
160
161                         spin_lock_dpc(&catas_lock);
162                         list_add(&priv->catas_err.list, &catas_list);
163                         spin_unlock_dpc(&catas_lock);
164
165                         if (!catas_work)
166                                 IoQueueWorkItem( catas_work, catas_reset_wi, DelayedWorkQueue, catas_work );
167                 }
168         } else {
169                 spin_lock_dpc(&catas_lock);
170                 if (!priv->catas_err.stop) {
171                         KeSetTimerEx( &priv->catas_err.timer, priv->catas_err.interval, 
172                                 0, &priv->catas_err.timer_dpc );
173                 }
174                 spin_unlock_dpc(&catas_lock);
175         }
176 }
177
178 static void  timer_dpc(
179         IN struct _KDPC  *Dpc,
180         IN PVOID  DeferredContext,
181         IN PVOID  SystemArgument1,
182         IN PVOID  SystemArgument2
183         )
184 {
185         struct mlx4_dev *dev = (struct mlx4_dev *)DeferredContext;
186         UNREFERENCED_PARAMETER(Dpc);
187         UNREFERENCED_PARAMETER(SystemArgument1);
188         UNREFERENCED_PARAMETER(SystemArgument2);
189         poll_catas( dev );
190 }
191
/*
 * mlx4_start_catas_poll - begin periodic polling of the device's
 * catastrophic-error buffer.
 *
 * Maps the firmware error buffer (located via fw.catas_bar/offset) and
 * arms a periodic kernel timer whose DPC calls poll_catas().  On mapping
 * failure a warning is logged and polling is simply not started.
 */
void mlx4_start_catas_poll(struct mlx4_dev *dev)
{
	struct mlx4_priv *priv = mlx4_priv(dev);
	u64 addr;

	INIT_LIST_HEAD(&priv->catas_err.list);
	priv->catas_err.map = NULL;

	/* physical address of the error buffer inside its BAR */
	addr = pci_resource_start(dev->pdev, priv->fw.catas_bar) +
		priv->fw.catas_offset;

	priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4);
	if (!priv->catas_err.map) {
		mlx4_warn(dev, "Failed to map internal error buffer at 0x%lx\n",
			  addr);
		return;
	}

	priv->catas_err.stop = 0;
	// NOTE(review): this re-initializes the *global* catas_lock on every
	// device start; with more than one device this may race against a
	// concurrent holder — confirm whether DEFINE_SPINLOCK above already
	// initializes it and this call can be dropped.
	spin_lock_init( &catas_lock );
	KeInitializeDpc(  &priv->catas_err.timer_dpc, timer_dpc, dev );
	KeInitializeTimer( &priv->catas_err.timer );
	// negative due time = relative interval; presumably scaled to the
	// 100-ns units KeSetTimerEx expects via MLX4_CATAS_POLL_INTERVAL/HZ
	// — TODO confirm against the HZ definition in this port
	priv->catas_err.interval.QuadPart  = (-10)* (__int64)MLX4_CATAS_POLL_INTERVAL;
	KeSetTimerEx( &priv->catas_err.timer, priv->catas_err.interval, 
		0, &priv->catas_err.timer_dpc );
}
218
/*
 * mlx4_stop_catas_poll - stop polling and release the error-buffer mapping.
 *
 * Idempotent: a second call returns immediately once the stop flag is set.
 * Teardown order matters: the stop flag prevents the DPC from re-arming
 * the timer, the timer is cancelled, queued DPCs are flushed so no DPC
 * can still be touching the mapping, and only then is the buffer unmapped.
 */
void mlx4_stop_catas_poll(struct mlx4_dev *dev)
{
	struct mlx4_priv *priv = mlx4_priv(dev);

	spin_lock_irq(&catas_lock);
	if (priv->catas_err.stop) {
		/* already stopped */
		spin_unlock_irq(&catas_lock);
		return;
	}
	priv->catas_err.stop = 1;
	spin_unlock_irq(&catas_lock);

	KeCancelTimer(&priv->catas_err.timer);
	/* wait for any in-flight timer DPC to finish before unmapping */
	KeFlushQueuedDpcs();

	if (priv->catas_err.map)
		iounmap(priv->catas_err.map, priv->fw.catas_size * 4);

	/* drop the device from the pending-reset list, if it is on it */
	spin_lock_irq(&catas_lock);
	list_del(&priv->catas_err.list);
	spin_unlock_irq(&catas_lock);
}
241
242 static int wait4reset(struct ib_event_handler *event_handler)
243 {
244         int n_not_ready = 0;
245         unsigned long flags;
246         struct ib_event_handler *handler;
247         struct ib_device *ibdev = event_handler->device;
248
249         spin_lock_irqsave(&ibdev->event_handler_lock, &flags);
250
251         // mark this handler (=client) reset-ready
252         event_handler->flags |= IB_IVH_RESET_READY;
253
254         // check the number of still not ready client
255         
256         list_for_each_entry(handler, &ibdev->event_handler_list, list, struct ib_event_handler)
257                 if ( handler->flags & IB_IVH_RESET_CB )
258                         if ( !(handler->flags & IB_IVH_RESET_READY) ) 
259                                 ++n_not_ready;
260         
261         spin_unlock_irqrestore(&ibdev->event_handler_lock, flags);
262
263         return n_not_ready;
264 }
265
266 int mlx4_reset_execute( struct ib_event_handler *event_handler )
267 {
268         int err;
269         struct ib_event event;
270         struct list_head tlist;
271         struct ib_event_handler *handler, *thandler;
272         struct ib_device *ibdev = event_handler->device;
273         struct pci_dev *pdev = ibdev->dma_device->pdev;
274
275         // mark client as "ready for reset" and check whether we can do reset
276         if (wait4reset(event_handler))
277                 return 0;
278
279         // fully bar the device
280         ibdev->dma_device->flags |= MLX4_FLAG_RESET_STARTED;
281         
282         // get old handler list 
283         INIT_LIST_HEAD(&tlist);
284         get_event_handlers(ibdev, &tlist);
285
286         // restart the device
287         err = mlx4_restart_one(pdev);
288         if (err) {
289                 event.event = IB_EVENT_RESET_FAILED;
290         }
291         else {
292                 // recreate interfaces
293                 fix_bus_ifc(pdev);
294                 event.event = IB_EVENT_RESET_END;
295         }
296
297         // notify the clients
298         list_for_each_entry_safe(handler, thandler, &tlist, 
299                 list, struct ib_event_handler, struct ib_event_handler)
300         {
301                 // because 'handler' will be re-registered during the next call
302                 list_del( &handler->list );
303                 handler->handler(handler, &event);
304         }
305         
306         return err;
307 }
308
309 static void
310 card_reset_wi(
311         IN                              DEVICE_OBJECT*                          p_dev_obj,
312         IN                              struct ib_event_handler *       event_handler )
313 {
314         struct ib_device *ibdev = event_handler->device;
315
316         UNUSED_PARAM(p_dev_obj);
317         IoFreeWorkItem( event_handler->rsrv_ptr );
318
319         // notify the clients
320         dispatch_event(ibdev, IB_EVENT_RESET_CLIENT);
321 }
322
323 int mlx4_reset_request( struct ib_event_handler *event_handler )
324 {
325         struct ib_device *ibdev = event_handler->device;
326         struct mlx4_dev *dev = ibdev->dma_device;
327         
328         // set device to RESET_PENDING mode
329         if (!mlx4_is_barred(dev)) {
330                 PIO_WORKITEM reset_work;
331
332                 // bar the device
333                 dev->flags |= MLX4_FLAG_RESET_CLIENT;
334
335                 // delay reset to a system thread
336                 // to allow for end of operations that are in progress
337                 reset_work = IoAllocateWorkItem( dev->pdev->p_self_do );
338                 if (!reset_work)
339                         return -EFAULT;
340                 event_handler->rsrv_ptr = reset_work;
341                 IoQueueWorkItem( reset_work, card_reset_wi, DelayedWorkQueue, event_handler );
342         }
343
344         return 0;
345 }
346
347 int mlx4_reset_cb_register( struct ib_event_handler *event_handler )
348 {
349         if (mlx4_is_in_reset(event_handler->device->dma_device))
350                 return -EBUSY;
351
352         return ib_register_event_handler(event_handler);
353 }
354
/* Unregister a previously registered reset-callback handler.
 * Thin wrapper around ib_unregister_event_handler(); returns its result. */
int mlx4_reset_cb_unregister( struct ib_event_handler *event_handler )
{
	return ib_unregister_event_handler(event_handler);
}
359
360