- Use vfs_readv/vfs_writev instead of direct calls to VFS functions
[mirror/scst/.git] / scst / src / scst_mem.c
1 /*
2  *  scst_mem.c
3  *
4  *  Copyright (C) 2006 - 2009 Vladislav Bolkhovitin <vst@vlnb.net>
5  *  Copyright (C) 2007 - 2009 ID7 Ltd.
6  *
7  *  This program is free software; you can redistribute it and/or
8  *  modify it under the terms of the GNU General Public License
9  *  as published by the Free Software Foundation, version 2
10  *  of the License.
11  *
12  *  This program is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  *  GNU General Public License for more details.
16  */
17
18 #include <linux/init.h>
19 #include <linux/kernel.h>
20 #include <linux/errno.h>
21 #include <linux/list.h>
22 #include <linux/spinlock.h>
23 #include <linux/slab.h>
24 #include <linux/sched.h>
25 #include <linux/mm.h>
26 #include <linux/unistd.h>
27 #include <linux/string.h>
28
29 #include "scst.h"
30 #include "scst_priv.h"
31 #include "scst_mem.h"
32
33 #define SGV_DEFAULT_PURGE_INTERVAL      (60 * HZ)
34 #define SGV_MIN_SHRINK_INTERVAL         (1 * HZ)
35
36 /* Max pages freed from a pool per shrinking iteration */
37 #define MAX_PAGES_PER_POOL      50
38
39 static struct sgv_pool *sgv_norm_clust_pool, *sgv_norm_pool, *sgv_dma_pool;
40
41 static atomic_t sgv_pages_total = ATOMIC_INIT(0);
42
43 /* Both read-only */
44 static int sgv_hi_wmk;
45 static int sgv_lo_wmk;
46
47 static int sgv_max_local_pages, sgv_max_trans_pages;
48
49 static DEFINE_SPINLOCK(sgv_pools_lock); /* inner lock for sgv_pool_lock! */
50 static DEFINE_MUTEX(sgv_pools_mutex);
51
52 /* Both protected by sgv_pools_lock */
53 static struct sgv_pool *sgv_cur_purge_pool;
54 static LIST_HEAD(sgv_active_pools_list);
55
56 static atomic_t sgv_releases_on_hiwmk = ATOMIC_INIT(0);
57 static atomic_t sgv_releases_on_hiwmk_failed = ATOMIC_INIT(0);
58
59 static atomic_t sgv_other_total_alloc = ATOMIC_INIT(0);
60
61 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 23))
62 static struct shrinker *sgv_shrinker;
63 #else
64 static struct shrinker sgv_shrinker;
65 #endif
66
67 /*
68  * Protected by sgv_pools_mutex AND sgv_pools_lock for writes,
69  * either one for reads.
70  */
71 static LIST_HEAD(sgv_pools_list);
72
73 static inline bool sgv_pool_clustered(const struct sgv_pool *pool)
74 {
75         return pool->clustering_type != sgv_no_clustering;
76 }
77
78 void scst_sgv_pool_use_norm(struct scst_tgt_dev *tgt_dev)
79 {
80         tgt_dev->gfp_mask = __GFP_NOWARN;
81         tgt_dev->pool = sgv_norm_pool;
82         clear_bit(SCST_TGT_DEV_CLUST_POOL, &tgt_dev->tgt_dev_flags);
83 }
84
85 void scst_sgv_pool_use_norm_clust(struct scst_tgt_dev *tgt_dev)
86 {
87         TRACE_MEM("%s", "Use clustering");
88         tgt_dev->gfp_mask = __GFP_NOWARN;
89         tgt_dev->pool = sgv_norm_clust_pool;
90         set_bit(SCST_TGT_DEV_CLUST_POOL, &tgt_dev->tgt_dev_flags);
91 }
92
93 void scst_sgv_pool_use_dma(struct scst_tgt_dev *tgt_dev)
94 {
95         TRACE_MEM("%s", "Use ISA DMA memory");
96         tgt_dev->gfp_mask = __GFP_NOWARN | GFP_DMA;
97         tgt_dev->pool = sgv_dma_pool;
98         clear_bit(SCST_TGT_DEV_CLUST_POOL, &tgt_dev->tgt_dev_flags);
99 }
100
101 /* Must be no locks */
102 static void sgv_dtor_and_free(struct sgv_pool_obj *obj)
103 {
104         struct sgv_pool *pool = obj->owner_pool;
105
106         TRACE_MEM("Destroying sgv obj %p", obj);
107
108         if (obj->sg_count != 0) {
109                 pool->alloc_fns.free_pages_fn(obj->sg_entries,
110                         obj->sg_count, obj->allocator_priv);
111         }
112         if (obj->sg_entries != obj->sg_entries_data) {
113                 if (obj->trans_tbl !=
114                     (struct trans_tbl_ent *)obj->sg_entries_data) {
115                         /* kfree() handles NULL parameter */
116                         kfree(obj->trans_tbl);
117                         obj->trans_tbl = NULL;
118                 }
119                 kfree(obj->sg_entries);
120         }
121
122         kmem_cache_free(pool->caches[obj->cache_num], obj);
123         return;
124 }
125
126 /* Might be called under sgv_pool_lock */
127 static inline void sgv_del_from_active(struct sgv_pool *pool)
128 {
129         struct list_head *next;
130
131         TRACE_MEM("Deleting sgv pool %p from the active list", pool);
132
133         spin_lock_bh(&sgv_pools_lock);
134
135         next = pool->sgv_active_pools_list_entry.next;
136         list_del(&pool->sgv_active_pools_list_entry);
137
138         if (sgv_cur_purge_pool == pool) {
139                 TRACE_MEM("Sgv pool %p is sgv cur purge pool", pool);
140
141                 if (next == &sgv_active_pools_list)
142                         next = next->next;
143
144                 if (next == &sgv_active_pools_list) {
145                         sgv_cur_purge_pool = NULL;
146                         TRACE_MEM("%s", "Sgv active list now empty");
147                 } else {
148                         sgv_cur_purge_pool = list_entry(next, typeof(*pool),
149                                 sgv_active_pools_list_entry);
150                         TRACE_MEM("New sgv cur purge pool %p",
151                                 sgv_cur_purge_pool);
152                 }
153         }
154
155         spin_unlock_bh(&sgv_pools_lock);
156         return;
157 }
158
159 /* Must be called under sgv_pool_lock held */
160 static void sgv_dec_cached_entries(struct sgv_pool *pool, int pages)
161 {
162         pool->cached_entries--;
163         pool->cached_pages -= pages;
164
165         if (pool->cached_entries == 0)
166                 sgv_del_from_active(pool);
167
168         return;
169 }
170
171 /* Must be called under sgv_pool_lock held */
172 static void __sgv_purge_from_cache(struct sgv_pool_obj *obj)
173 {
174         int pages = obj->pages;
175         struct sgv_pool *pool = obj->owner_pool;
176
177         TRACE_MEM("Purging sgv obj %p from pool %p (new cached_entries %d)",
178                 obj, pool, pool->cached_entries-1);
179
180         list_del(&obj->sorted_recycling_list_entry);
181         list_del(&obj->recycling_list_entry);
182
183         pool->inactive_cached_pages -= pages;
184         sgv_dec_cached_entries(pool, pages);
185
186         atomic_sub(pages, &sgv_pages_total);
187
188         return;
189 }
190
191 /* Must be called under sgv_pool_lock held */
192 static bool sgv_purge_from_cache(struct sgv_pool_obj *obj, int min_interval,
193         unsigned long cur_time)
194 {
195         EXTRACHECKS_BUG_ON(min_interval < 0);
196
197         TRACE_MEM("Checking if sgv obj %p should be purged (cur time %ld, "
198                 "obj time %ld, time to purge %ld)", obj, cur_time,
199                 obj->time_stamp, obj->time_stamp + min_interval);
200
201         if (time_after_eq(cur_time, (obj->time_stamp + min_interval))) {
202                 __sgv_purge_from_cache(obj);
203                 return true;
204         }
205         return false;
206 }
207
208 /* No locks */
209 static int sgv_shrink_pool(struct sgv_pool *pool, int nr, int min_interval,
210         unsigned long cur_time)
211 {
212         int freed = 0;
213
214         TRACE_ENTRY();
215
216         TRACE_MEM("Trying to shrink pool %p (nr %d, min_interval %d)",
217                 pool, nr, min_interval);
218
219         if (pool->purge_interval < 0) {
220                 TRACE_MEM("Not shrinkable pool %p, skipping", pool);
221                 goto out;
222         }
223
224         spin_lock_bh(&pool->sgv_pool_lock);
225
226         while (!list_empty(&pool->sorted_recycling_list) &&
227                         (atomic_read(&sgv_pages_total) > sgv_lo_wmk)) {
228                 struct sgv_pool_obj *obj = list_entry(
229                         pool->sorted_recycling_list.next,
230                         struct sgv_pool_obj, sorted_recycling_list_entry);
231
232                 if (sgv_purge_from_cache(obj, min_interval, cur_time)) {
233                         int pages = obj->pages;
234
235                         freed += pages;
236                         nr -= pages;
237
238                         TRACE_MEM("%d pages purged from pool %p (nr left %d, "
239                                 "total freed %d)", pages, pool, nr, freed);
240
241                         spin_unlock_bh(&pool->sgv_pool_lock);
242                         sgv_dtor_and_free(obj);
243                         spin_lock_bh(&pool->sgv_pool_lock);
244                 } else
245                         break;
246
247                 if ((nr <= 0) || (freed >= MAX_PAGES_PER_POOL)) {
248                         if (freed >= MAX_PAGES_PER_POOL)
249                                 TRACE_MEM("%d pages purged from pool %p, "
250                                         "leaving", freed, pool);
251                         break;
252                 }
253         }
254
255         spin_unlock_bh(&pool->sgv_pool_lock);
256
257 out:
258         TRACE_EXIT_RES(nr);
259         return nr;
260 }
261
262 /* No locks */
263 static int __sgv_shrink(int nr, int min_interval)
264 {
265         struct sgv_pool *pool;
266         unsigned long cur_time = jiffies;
267         int prev_nr = nr;
268         bool circle = false;
269
270         TRACE_ENTRY();
271
272         TRACE_MEM("Trying to shrink %d pages from all sgv pools "
273                 "(min_interval %d)", nr, min_interval);
274
275         while (nr > 0) {
276                 struct list_head *next;
277
278                 spin_lock_bh(&sgv_pools_lock);
279
280                 pool = sgv_cur_purge_pool;
281                 if (pool == NULL) {
282                         if (list_empty(&sgv_active_pools_list)) {
283                                 TRACE_MEM("%s", "Active pools list is empty");
284                                 goto out_unlock;
285                         }
286
287                         pool = list_entry(sgv_active_pools_list.next,
288                                         typeof(*pool),
289                                         sgv_active_pools_list_entry);
290                 }
291                 sgv_pool_get(pool);
292
293                 next = pool->sgv_active_pools_list_entry.next;
294                 if (next == &sgv_active_pools_list) {
295                         if (circle && (prev_nr == nr)) {
296                                 TRACE_MEM("Full circle done, but no progress, "
297                                         "leaving (nr %d)", nr);
298                                 goto out_unlock_put;
299                         }
300                         circle = true;
301                         prev_nr = nr;
302
303                         next = next->next;
304                 }
305
306                 sgv_cur_purge_pool = list_entry(next, typeof(*pool),
307                         sgv_active_pools_list_entry);
308                 TRACE_MEM("New cur purge pool %p", sgv_cur_purge_pool);
309
310                 spin_unlock_bh(&sgv_pools_lock);
311
312                 nr = sgv_shrink_pool(pool, nr, min_interval, cur_time);
313
314                 sgv_pool_put(pool);
315         }
316
317 out:
318         TRACE_EXIT_RES(nr);
319         return nr;
320
321 out_unlock:
322         spin_unlock_bh(&sgv_pools_lock);
323         goto out;
324
325 out_unlock_put:
326         spin_unlock_bh(&sgv_pools_lock);
327         sgv_pool_put(pool);
328         goto out;
329 }
330
331 static int sgv_shrink(int nr, gfp_t gfpm)
332 {
333         TRACE_ENTRY();
334
335         if (nr > 0) {
336                 nr = __sgv_shrink(nr, SGV_MIN_SHRINK_INTERVAL);
337                 TRACE_MEM("Left %d", nr);
338         } else {
339                 struct sgv_pool *pool;
340                 int inactive_pages = 0;
341
342                 spin_lock_bh(&sgv_pools_lock);
343                 list_for_each_entry(pool, &sgv_active_pools_list,
344                                 sgv_active_pools_list_entry) {
345                         if (pool->purge_interval > 0)
346                                 inactive_pages += pool->inactive_cached_pages;
347                 }
348                 spin_unlock_bh(&sgv_pools_lock);
349
350                 nr = max((int)0, inactive_pages - sgv_lo_wmk);
351                 TRACE_MEM("Can free %d (total %d)", nr,
352                         atomic_read(&sgv_pages_total));
353         }
354
355         TRACE_EXIT_RES(nr);
356         return nr;
357 }
358
359 #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 20)
360 static void sgv_purge_work_fn(void *p)
361 #else
362 static void sgv_purge_work_fn(struct delayed_work *work)
363 #endif
364 {
365         unsigned long cur_time = jiffies;
366 #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 20)
367         struct sgv_pool *pool = (struct sgv_pool *)p;
368 #else
369         struct sgv_pool *pool = container_of(work, struct sgv_pool,
370                                         sgv_purge_work);
371 #endif
372
373         TRACE_ENTRY();
374
375         TRACE_MEM("Purge work for pool %p", pool);
376
377         spin_lock_bh(&pool->sgv_pool_lock);
378
379         pool->purge_work_scheduled = false;
380
381         while (!list_empty(&pool->sorted_recycling_list)) {
382                 struct sgv_pool_obj *obj = list_entry(
383                         pool->sorted_recycling_list.next,
384                         struct sgv_pool_obj, sorted_recycling_list_entry);
385
386                 if (sgv_purge_from_cache(obj, pool->purge_interval, cur_time)) {
387                         spin_unlock_bh(&pool->sgv_pool_lock);
388                         sgv_dtor_and_free(obj);
389                         spin_lock_bh(&pool->sgv_pool_lock);
390                 } else {
391                         /*
392                          * Let's reschedule it for full period to not get here
393                          * too often. In the worst case we have shrinker
394                          * to reclaim buffers quickier.
395                          */
396                         TRACE_MEM("Rescheduling purge work for pool %p (delay "
397                                 "%d HZ/%d sec)", pool, pool->purge_interval,
398                                 pool->purge_interval/HZ);
399                         schedule_delayed_work(&pool->sgv_purge_work,
400                                 pool->purge_interval);
401                         pool->purge_work_scheduled = true;
402                         break;
403                 }
404         }
405
406         spin_unlock_bh(&pool->sgv_pool_lock);
407
408         TRACE_MEM("Leaving purge work for pool %p", pool);
409
410         TRACE_EXIT();
411         return;
412 }
413
414 static int sgv_check_full_clustering(struct scatterlist *sg, int cur, int hint)
415 {
416         int res = -1;
417         int i = hint;
418         unsigned long pfn_cur = page_to_pfn(sg_page(&sg[cur]));
419         int len_cur = sg[cur].length;
420         unsigned long pfn_cur_next = pfn_cur + (len_cur >> PAGE_SHIFT);
421         int full_page_cur = (len_cur & (PAGE_SIZE - 1)) == 0;
422         unsigned long pfn, pfn_next;
423         bool full_page;
424
425 #if 0
426         TRACE_MEM("pfn_cur %ld, pfn_cur_next %ld, len_cur %d, full_page_cur %d",
427                 pfn_cur, pfn_cur_next, len_cur, full_page_cur);
428 #endif
429
430         /* check the hint first */
431         if (i >= 0) {
432                 pfn = page_to_pfn(sg_page(&sg[i]));
433                 pfn_next = pfn + (sg[i].length >> PAGE_SHIFT);
434                 full_page = (sg[i].length & (PAGE_SIZE - 1)) == 0;
435
436                 if ((pfn == pfn_cur_next) && full_page_cur)
437                         goto out_head;
438
439                 if ((pfn_next == pfn_cur) && full_page)
440                         goto out_tail;
441         }
442
443         /* ToDo: implement more intelligent search */
444         for (i = cur - 1; i >= 0; i--) {
445                 pfn = page_to_pfn(sg_page(&sg[i]));
446                 pfn_next = pfn + (sg[i].length >> PAGE_SHIFT);
447                 full_page = (sg[i].length & (PAGE_SIZE - 1)) == 0;
448
449                 if ((pfn == pfn_cur_next) && full_page_cur)
450                         goto out_head;
451
452                 if ((pfn_next == pfn_cur) && full_page)
453                         goto out_tail;
454         }
455
456 out:
457         return res;
458
459 out_tail:
460         TRACE_MEM("SG segment %d will be tail merged with segment %d", cur, i);
461         sg[i].length += len_cur;
462         sg_clear(&sg[cur]);
463         res = i;
464         goto out;
465
466 out_head:
467         TRACE_MEM("SG segment %d will be head merged with segment %d", cur, i);
468         sg_assign_page(&sg[i], sg_page(&sg[cur]));
469         sg[i].length += len_cur;
470         sg_clear(&sg[cur]);
471         res = i;
472         goto out;
473 }
474
475 static int sgv_check_tail_clustering(struct scatterlist *sg, int cur, int hint)
476 {
477         int res = -1;
478         unsigned long pfn_cur = page_to_pfn(sg_page(&sg[cur]));
479         int len_cur = sg[cur].length;
480         int prev;
481         unsigned long pfn_prev;
482         bool full_page;
483
484 #ifdef SCST_HIGHMEM
485         if (page >= highmem_start_page) {
486                 TRACE_MEM("%s", "HIGHMEM page allocated, no clustering")
487                 goto out;
488         }
489 #endif
490
491 #if 0
492         TRACE_MEM("pfn_cur %ld, pfn_cur_next %ld, len_cur %d, full_page_cur %d",
493                 pfn_cur, pfn_cur_next, len_cur, full_page_cur);
494 #endif
495
496         if (cur == 0)
497                 goto out;
498
499         prev = cur - 1;
500         pfn_prev = page_to_pfn(sg_page(&sg[prev])) +
501                         (sg[prev].length >> PAGE_SHIFT);
502         full_page = (sg[prev].length & (PAGE_SIZE - 1)) == 0;
503
504         if ((pfn_prev == pfn_cur) && full_page) {
505                 TRACE_MEM("SG segment %d will be tail merged with segment %d",
506                         cur, prev);
507                 sg[prev].length += len_cur;
508                 sg_clear(&sg[cur]);
509                 res = prev;
510         }
511
512 out:
513         return res;
514 }
515
516 static void sgv_free_sys_sg_entries(struct scatterlist *sg, int sg_count,
517         void *priv)
518 {
519         int i;
520
521         TRACE_MEM("sg=%p, sg_count=%d", sg, sg_count);
522
523         for (i = 0; i < sg_count; i++) {
524                 struct page *p = sg_page(&sg[i]);
525                 int len = sg[i].length;
526                 int pages =
527                         (len >> PAGE_SHIFT) + ((len & ~PAGE_MASK) != 0);
528
529                 TRACE_MEM("page %lx, len %d, pages %d",
530                         (unsigned long)p, len, pages);
531
532                 while (pages > 0) {
533                         int order = 0;
534
535 /*
536  * __free_pages() doesn't like freeing pages with not that order with
537  * which they were allocated, so disable this small optimization.
538  */
539 #if 0
540                         if (len > 0) {
541                                 while (((1 << order) << PAGE_SHIFT) < len)
542                                         order++;
543                                 len = 0;
544                         }
545 #endif
546                         TRACE_MEM("free_pages(): order %d, page %lx",
547                                 order, (unsigned long)p);
548
549                         __free_pages(p, order);
550
551                         pages -= 1 << order;
552                         p += 1 << order;
553                 }
554         }
555 }
556
557 static struct page *sgv_alloc_sys_pages(struct scatterlist *sg,
558         gfp_t gfp_mask, void *priv)
559 {
560         struct page *page = alloc_pages(gfp_mask, 0);
561
562         sg_set_page(sg, page, PAGE_SIZE, 0);
563         TRACE_MEM("page=%p, sg=%p, priv=%p", page, sg, priv);
564         if (page == NULL) {
565                 TRACE(TRACE_OUT_OF_MEM, "%s", "Allocation of "
566                         "sg page failed");
567         }
568         return page;
569 }
570
571 static int sgv_alloc_sg_entries(struct scatterlist *sg, int pages,
572         gfp_t gfp_mask, enum sgv_clustering_types clustering_type,
573         struct trans_tbl_ent *trans_tbl,
574         const struct sgv_pool_alloc_fns *alloc_fns, void *priv)
575 {
576         int sg_count = 0;
577         int pg, i, j;
578         int merged = -1;
579
580         TRACE_MEM("pages=%d, clustering_type=%d", pages, clustering_type);
581
582 #if 0
583         gfp_mask |= __GFP_COLD;
584 #endif
585 #ifdef CONFIG_SCST_STRICT_SECURITY
586         gfp_mask |= __GFP_ZERO;
587 #endif
588
589         for (pg = 0; pg < pages; pg++) {
590                 void *rc;
591 #ifdef CONFIG_SCST_DEBUG_OOM
592                 if (((gfp_mask & __GFP_NOFAIL) != __GFP_NOFAIL) &&
593                     ((scst_random() % 10000) == 55))
594                         rc = NULL;
595                 else
596 #endif
597                         rc = alloc_fns->alloc_pages_fn(&sg[sg_count], gfp_mask,
598                                 priv);
599                 if (rc == NULL)
600                         goto out_no_mem;
601
602                 /*
603                  * This code allows compiler to see full body of the clustering
604                  * functions and gives it a chance to generate better code.
605                  * At least, the resulting code is smaller, comparing to
606                  * calling them using a function pointer.
607                  */
608                 if (clustering_type == sgv_full_clustering)
609                         merged = sgv_check_full_clustering(sg, sg_count, merged);
610                 else if (clustering_type == sgv_tail_clustering)
611                         merged = sgv_check_tail_clustering(sg, sg_count, merged);
612                 else
613                         merged = -1;
614
615                 if (merged == -1)
616                         sg_count++;
617
618                 TRACE_MEM("pg=%d, merged=%d, sg_count=%d", pg, merged,
619                         sg_count);
620         }
621
622         if ((clustering_type != sgv_no_clustering) && (trans_tbl != NULL)) {
623                 pg = 0;
624                 for (i = 0; i < pages; i++) {
625                         int n = (sg[i].length >> PAGE_SHIFT) +
626                                 ((sg[i].length & ~PAGE_MASK) != 0);
627                         trans_tbl[i].pg_count = pg;
628                         for (j = 0; j < n; j++)
629                                 trans_tbl[pg++].sg_num = i+1;
630                         TRACE_MEM("i=%d, n=%d, pg_count=%d", i, n,
631                                 trans_tbl[i].pg_count);
632                 }
633         }
634
635 out:
636         TRACE_MEM("sg_count=%d", sg_count);
637         return sg_count;
638
639 out_no_mem:
640         alloc_fns->free_pages_fn(sg, sg_count, priv);
641         sg_count = 0;
642         goto out;
643 }
644
645 static int sgv_alloc_arrays(struct sgv_pool_obj *obj,
646         int pages_to_alloc, gfp_t gfp_mask)
647 {
648         int sz, tsz = 0;
649         int res = 0;
650
651         TRACE_ENTRY();
652
653         sz = pages_to_alloc * sizeof(obj->sg_entries[0]);
654
655         obj->sg_entries = kmalloc(sz, gfp_mask);
656         if (unlikely(obj->sg_entries == NULL)) {
657                 TRACE(TRACE_OUT_OF_MEM, "Allocation of sgv_pool_obj "
658                         "SG vector failed (size %d)", sz);
659                 res = -ENOMEM;
660                 goto out;
661         }
662
663         sg_init_table(obj->sg_entries, pages_to_alloc);
664
665         if (sgv_pool_clustered(obj->owner_pool)) {
666                 if (pages_to_alloc <= sgv_max_trans_pages) {
667                         obj->trans_tbl =
668                                 (struct trans_tbl_ent *)obj->sg_entries_data;
669                         /*
670                          * No need to clear trans_tbl, if needed, it will be
671                          * fully rewritten in sgv_alloc_sg_entries()
672                          */
673                 } else {
674                         tsz = pages_to_alloc * sizeof(obj->trans_tbl[0]);
675                         obj->trans_tbl = kzalloc(tsz, gfp_mask);
676                         if (unlikely(obj->trans_tbl == NULL)) {
677                                 TRACE(TRACE_OUT_OF_MEM, "Allocation of "
678                                         "trans_tbl failed (size %d)", tsz);
679                                 res = -ENOMEM;
680                                 goto out_free;
681                         }
682                 }
683         }
684
685         TRACE_MEM("pages_to_alloc %d, sz %d, tsz %d, obj %p, sg_entries %p, "
686                 "trans_tbl %p", pages_to_alloc, sz, tsz, obj, obj->sg_entries,
687                 obj->trans_tbl);
688
689 out:
690         TRACE_EXIT_RES(res);
691         return res;
692
693 out_free:
694         kfree(obj->sg_entries);
695         obj->sg_entries = NULL;
696         goto out;
697 }
698
699 static struct sgv_pool_obj *sgv_get_obj(struct sgv_pool *pool, int cache_num,
700         int pages, gfp_t gfp_mask, bool get_new)
701 {
702         struct sgv_pool_obj *obj;
703
704         spin_lock_bh(&pool->sgv_pool_lock);
705
706         if (unlikely(get_new)) {
707                 /* Used only for buffers preallocation */
708                 goto get_new;
709         }
710
711         if (likely(!list_empty(&pool->recycling_lists[cache_num]))) {
712                 obj = list_entry(pool->recycling_lists[cache_num].next,
713                          struct sgv_pool_obj, recycling_list_entry);
714
715                 list_del(&obj->sorted_recycling_list_entry);
716                 list_del(&obj->recycling_list_entry);
717
718                 pool->inactive_cached_pages -= pages;
719
720                 spin_unlock_bh(&pool->sgv_pool_lock);
721                 goto out;
722         }
723
724 get_new:
725         if (pool->cached_entries == 0) {
726                 TRACE_MEM("Adding pool %p to the active list", pool);
727                 spin_lock_bh(&sgv_pools_lock);
728                 list_add_tail(&pool->sgv_active_pools_list_entry,
729                         &sgv_active_pools_list);
730                 spin_unlock_bh(&sgv_pools_lock);
731         }
732
733         pool->cached_entries++;
734         pool->cached_pages += pages;
735
736         spin_unlock_bh(&pool->sgv_pool_lock);
737
738         TRACE_MEM("New cached entries %d (pool %p)", pool->cached_entries,
739                 pool);
740
741         obj = kmem_cache_alloc(pool->caches[cache_num],
742                 gfp_mask & ~(__GFP_HIGHMEM|GFP_DMA));
743         if (likely(obj)) {
744                 memset(obj, 0, sizeof(*obj));
745                 obj->cache_num = cache_num;
746                 obj->pages = pages;
747                 obj->owner_pool = pool;
748         } else {
749                 spin_lock_bh(&pool->sgv_pool_lock);
750                 sgv_dec_cached_entries(pool, pages);
751                 spin_unlock_bh(&pool->sgv_pool_lock);
752         }
753
754 out:
755         return obj;
756 }
757
758 static void sgv_put_obj(struct sgv_pool_obj *obj)
759 {
760         struct sgv_pool *pool = obj->owner_pool;
761         struct list_head *entry;
762         struct list_head *list = &pool->recycling_lists[obj->cache_num];
763         int pages = obj->pages;
764
765         spin_lock_bh(&pool->sgv_pool_lock);
766
767         TRACE_MEM("sgv %p, cache num %d, pages %d, sg_count %d", obj,
768                 obj->cache_num, pages, obj->sg_count);
769
770         if (sgv_pool_clustered(pool)) {
771                 /* Make objects with less entries more preferred */
772                 __list_for_each(entry, list) {
773                         struct sgv_pool_obj *tmp = list_entry(entry,
774                                 struct sgv_pool_obj, recycling_list_entry);
775
776                         TRACE_MEM("tmp %p, cache num %d, pages %d, sg_count %d",
777                                 tmp, tmp->cache_num, tmp->pages, tmp->sg_count);
778
779                         if (obj->sg_count <= tmp->sg_count)
780                                 break;
781                 }
782                 entry = entry->prev;
783         } else
784                 entry = list;
785
786         TRACE_MEM("Adding in %p (list %p)", entry, list);
787         list_add(&obj->recycling_list_entry, entry);
788
789         list_add_tail(&obj->sorted_recycling_list_entry,
790                 &pool->sorted_recycling_list);
791
792         obj->time_stamp = jiffies;
793
794         pool->inactive_cached_pages += pages;
795
796         if (!pool->purge_work_scheduled) {
797                 TRACE_MEM("Scheduling purge work for pool %p", pool);
798                 pool->purge_work_scheduled = true;
799                 schedule_delayed_work(&pool->sgv_purge_work,
800                         pool->purge_interval);
801         }
802
803         spin_unlock_bh(&pool->sgv_pool_lock);
804         return;
805 }
806
807 /* No locks */
808 static int sgv_hiwmk_check(int pages_to_alloc)
809 {
810         int res = 0;
811         int pages = pages_to_alloc;
812
813         pages += atomic_read(&sgv_pages_total);
814
815         if (unlikely(pages > sgv_hi_wmk)) {
816                 pages -= sgv_hi_wmk;
817                 atomic_inc(&sgv_releases_on_hiwmk);
818
819                 pages = __sgv_shrink(pages, 0);
820                 if (pages > 0) {
821                         TRACE(TRACE_OUT_OF_MEM, "Requested amount of "
822                             "memory (%d pages) for being executed "
823                             "commands together with the already "
824                             "allocated memory exceeds the allowed "
825                             "maximum %d. Should you increase "
826                             "scst_max_cmd_mem?", pages_to_alloc,
827                            sgv_hi_wmk);
828                         atomic_inc(&sgv_releases_on_hiwmk_failed);
829                         res = -ENOMEM;
830                         goto out_unlock;
831                 }
832         }
833
834         atomic_add(pages_to_alloc, &sgv_pages_total);
835
836 out_unlock:
837         TRACE_MEM("pages_to_alloc %d, new total %d", pages_to_alloc,
838                 atomic_read(&sgv_pages_total));
839
840         return res;
841 }
842
843 /* No locks */
844 static void sgv_hiwmk_uncheck(int pages)
845 {
846         atomic_sub(pages, &sgv_pages_total);
847         TRACE_MEM("pages %d, new total %d", pages,
848                 atomic_read(&sgv_pages_total));
849         return;
850 }
851
852 /* No locks */
853 static bool sgv_check_allowed_mem(struct scst_mem_lim *mem_lim, int pages)
854 {
855         int alloced;
856         bool res = true;
857
858         alloced = atomic_add_return(pages, &mem_lim->alloced_pages);
859         if (unlikely(alloced > mem_lim->max_allowed_pages)) {
860                 TRACE(TRACE_OUT_OF_MEM, "Requested amount of memory "
861                         "(%d pages) for being executed commands on a device "
862                         "together with the already allocated memory exceeds "
863                         "the allowed maximum %d. Should you increase "
864                         "scst_max_dev_cmd_mem?", pages,
865                         mem_lim->max_allowed_pages);
866                 atomic_sub(pages, &mem_lim->alloced_pages);
867                 res = false;
868         }
869
870         TRACE_MEM("mem_lim %p, pages %d, res %d, new alloced %d", mem_lim,
871                 pages, res, atomic_read(&mem_lim->alloced_pages));
872
873         return res;
874 }
875
876 /* No locks */
877 static void sgv_uncheck_allowed_mem(struct scst_mem_lim *mem_lim, int pages)
878 {
879         atomic_sub(pages, &mem_lim->alloced_pages);
880
881         TRACE_MEM("mem_lim %p, pages %d, new alloced %d", mem_lim,
882                 pages, atomic_read(&mem_lim->alloced_pages));
883         return;
884 }
885
886 struct scatterlist *sgv_pool_alloc(struct sgv_pool *pool, unsigned int size,
887         gfp_t gfp_mask, int flags, int *count,
888         struct sgv_pool_obj **sgv, struct scst_mem_lim *mem_lim, void *priv)
889 {
890         struct sgv_pool_obj *obj;
891         int cache_num, pages, cnt;
892         struct scatterlist *res = NULL;
893         int pages_to_alloc;
894         int no_cached = flags & SGV_POOL_ALLOC_NO_CACHED;
895         bool allowed_mem_checked = false, hiwmk_checked = false;
896
897         TRACE_ENTRY();
898
899         if (unlikely(size == 0))
900                 goto out;
901
902         EXTRACHECKS_BUG_ON((gfp_mask & __GFP_NOFAIL) == __GFP_NOFAIL);
903
904         pages = ((size + PAGE_SIZE - 1) >> PAGE_SHIFT);
905         if (pool->single_alloc_pages == 0) {
906                 int pages_order = get_order(size);
907                 cache_num = pages_order;
908                 pages_to_alloc = (1 << pages_order);
909         } else {
910                 cache_num = 0;
911                 pages_to_alloc = max(pool->single_alloc_pages, pages);
912         }
913
914         TRACE_MEM("size=%d, pages=%d, pages_to_alloc=%d, cache num=%d, "
915                 "flags=%x, no_cached=%d, *sgv=%p", size, pages,
916                 pages_to_alloc, cache_num, flags, no_cached, *sgv);
917
918         if (*sgv != NULL) {
919                 obj = *sgv;
920
921                 TRACE_MEM("Supplied obj %p, cache num %d", obj, obj->cache_num);
922
923                 EXTRACHECKS_BUG_ON(obj->sg_count != 0);
924
925                 if (unlikely(!sgv_check_allowed_mem(mem_lim, pages_to_alloc)))
926                         goto out_fail_free_sg_entries;
927                 allowed_mem_checked = true;
928
929                 if (unlikely(sgv_hiwmk_check(pages_to_alloc) != 0))
930                         goto out_fail_free_sg_entries;
931                 hiwmk_checked = true;
932         } else if ((pages_to_alloc <= pool->max_cached_pages) && !no_cached) {
933                 if (unlikely(!sgv_check_allowed_mem(mem_lim, pages_to_alloc)))
934                         goto out_fail;
935                 allowed_mem_checked = true;
936
937                 obj = sgv_get_obj(pool, cache_num, pages_to_alloc, gfp_mask,
938                         flags & SGV_POOL_ALLOC_GET_NEW);
939                 if (unlikely(obj == NULL)) {
940                         TRACE(TRACE_OUT_OF_MEM, "Allocation of "
941                                 "sgv_pool_obj failed (size %d)", size);
942                         goto out_fail;
943                 }
944
945                 if (obj->sg_count != 0) {
946                         TRACE_MEM("Cached obj %p", obj);
947                         atomic_inc(&pool->cache_acc[cache_num].hit_alloc);
948                         goto success;
949                 }
950
951                 if (flags & SGV_POOL_NO_ALLOC_ON_CACHE_MISS) {
952                         if (!(flags & SGV_POOL_RETURN_OBJ_ON_ALLOC_FAIL))
953                                 goto out_fail_free;
954                 }
955
956                 TRACE_MEM("Brand new obj %p", obj);
957
958                 if (pages_to_alloc <= sgv_max_local_pages) {
959                         obj->sg_entries = obj->sg_entries_data;
960                         sg_init_table(obj->sg_entries, pages_to_alloc);
961                         TRACE_MEM("sg_entries %p", obj->sg_entries);
962                         if (sgv_pool_clustered(pool)) {
963                                 obj->trans_tbl = (struct trans_tbl_ent *)
964                                         (obj->sg_entries + pages_to_alloc);
965                                 TRACE_MEM("trans_tbl %p", obj->trans_tbl);
966                                 /*
967                                  * No need to clear trans_tbl, if needed, it
968                                  * will be fully rewritten in
969                                  * sgv_alloc_sg_entries().
970                                  */
971                         }
972                 } else {
973                         if (unlikely(sgv_alloc_arrays(obj, pages_to_alloc,
974                                         gfp_mask) != 0))
975                                 goto out_fail_free;
976                 }
977
978                 if ((flags & SGV_POOL_NO_ALLOC_ON_CACHE_MISS) &&
979                     (flags & SGV_POOL_RETURN_OBJ_ON_ALLOC_FAIL))
980                         goto out_return;
981
982                 obj->allocator_priv = priv;
983
984                 if (unlikely(sgv_hiwmk_check(pages_to_alloc) != 0))
985                         goto out_fail_free_sg_entries;
986                 hiwmk_checked = true;
987         } else {
988                 int sz;
989
990                 pages_to_alloc = pages;
991
992                 if (unlikely(!sgv_check_allowed_mem(mem_lim, pages_to_alloc)))
993                         goto out_fail;
994                 allowed_mem_checked = true;
995
996                 if (flags & SGV_POOL_NO_ALLOC_ON_CACHE_MISS)
997                         goto out_return2;
998
999                 sz = sizeof(*obj) + pages * sizeof(obj->sg_entries[0]);
1000
1001                 obj = kmalloc(sz, gfp_mask);
1002                 if (unlikely(obj == NULL)) {
1003                         TRACE(TRACE_OUT_OF_MEM, "Allocation of "
1004                                 "sgv_pool_obj failed (size %d)", size);
1005                         goto out_fail;
1006                 }
1007                 memset(obj, 0, sizeof(*obj));
1008
1009                 obj->owner_pool = pool;
1010                 cache_num = -1;
1011                 obj->cache_num = cache_num;
1012                 obj->pages = pages_to_alloc;
1013                 obj->allocator_priv = priv;
1014
1015                 obj->sg_entries = obj->sg_entries_data;
1016                 sg_init_table(obj->sg_entries, pages);
1017
1018                 if (unlikely(sgv_hiwmk_check(pages_to_alloc) != 0))
1019                         goto out_fail_free_sg_entries;
1020                 hiwmk_checked = true;
1021
1022                 TRACE_MEM("Big or no_cached obj %p (size %d)", obj, sz);
1023         }
1024
1025         obj->sg_count = sgv_alloc_sg_entries(obj->sg_entries,
1026                 pages_to_alloc, gfp_mask, pool->clustering_type,
1027                 obj->trans_tbl, &pool->alloc_fns, priv);
1028         if (unlikely(obj->sg_count <= 0)) {
1029                 obj->sg_count = 0;
1030                 if ((flags & SGV_POOL_RETURN_OBJ_ON_ALLOC_FAIL) &&
1031                     (cache_num >= 0))
1032                         goto out_return1;
1033                 else
1034                         goto out_fail_free_sg_entries;
1035         }
1036
1037         if (cache_num >= 0) {
1038                 atomic_add(pages_to_alloc - obj->sg_count,
1039                         &pool->cache_acc[cache_num].merged);
1040         } else {
1041                 if (no_cached) {
1042                         atomic_add(pages_to_alloc,
1043                                 &pool->other_pages);
1044                         atomic_add(pages_to_alloc - obj->sg_count,
1045                                 &pool->other_merged);
1046                 } else {
1047                         atomic_add(pages_to_alloc,
1048                                 &pool->big_pages);
1049                         atomic_add(pages_to_alloc - obj->sg_count,
1050                                 &pool->big_merged);
1051                 }
1052         }
1053
1054 success:
1055         if (cache_num >= 0) {
1056                 int sg;
1057                 atomic_inc(&pool->cache_acc[cache_num].total_alloc);
1058                 if (sgv_pool_clustered(pool))
1059                         cnt = obj->trans_tbl[pages-1].sg_num;
1060                 else
1061                         cnt = pages;
1062                 sg = cnt-1;
1063                 obj->orig_sg = sg;
1064                 obj->orig_length = obj->sg_entries[sg].length;
1065                 if (sgv_pool_clustered(pool)) {
1066                         obj->sg_entries[sg].length =
1067                                 (pages - obj->trans_tbl[sg].pg_count) << PAGE_SHIFT;
1068                 }
1069         } else {
1070                 cnt = obj->sg_count;
1071                 if (no_cached)
1072                         atomic_inc(&pool->other_alloc);
1073                 else
1074                         atomic_inc(&pool->big_alloc);
1075         }
1076
1077         *count = cnt;
1078         res = obj->sg_entries;
1079         *sgv = obj;
1080
1081         if (size & ~PAGE_MASK)
1082                 obj->sg_entries[cnt-1].length -=
1083                         PAGE_SIZE - (size & ~PAGE_MASK);
1084
1085         TRACE_MEM("obj=%p, sg_entries %p (size=%d, pages=%d, sg_count=%d, "
1086                 "count=%d, last_len=%d)", obj, obj->sg_entries, size, pages,
1087                 obj->sg_count, *count, obj->sg_entries[obj->orig_sg].length);
1088
1089 out:
1090         TRACE_EXIT_HRES(res);
1091         return res;
1092
1093 out_return:
1094         obj->allocator_priv = priv;
1095         obj->owner_pool = pool;
1096
1097 out_return1:
1098         *sgv = obj;
1099         TRACE_MEM("Returning failed obj %p (count %d)", obj, *count);
1100
1101 out_return2:
1102         *count = pages_to_alloc;
1103         res = NULL;
1104         goto out_uncheck;
1105
1106 out_fail_free_sg_entries:
1107         if (obj->sg_entries != obj->sg_entries_data) {
1108                 if (obj->trans_tbl !=
1109                         (struct trans_tbl_ent *)obj->sg_entries_data) {
1110                         /* kfree() handles NULL parameter */
1111                         kfree(obj->trans_tbl);
1112                         obj->trans_tbl = NULL;
1113                 }
1114                 kfree(obj->sg_entries);
1115                 obj->sg_entries = NULL;
1116         }
1117
1118 out_fail_free:
1119         if (cache_num >= 0) {
1120                 spin_lock_bh(&pool->sgv_pool_lock);
1121                 sgv_dec_cached_entries(pool, pages_to_alloc);
1122                 spin_unlock_bh(&pool->sgv_pool_lock);
1123
1124                 kmem_cache_free(pool->caches[obj->cache_num], obj);
1125         } else
1126                 kfree(obj);
1127
1128 out_fail:
1129         res = NULL;
1130         *count = 0;
1131         *sgv = NULL;
1132         TRACE_MEM("%s", "Allocation failed");
1133
1134 out_uncheck:
1135         if (hiwmk_checked)
1136                 sgv_hiwmk_uncheck(pages_to_alloc);
1137         if (allowed_mem_checked)
1138                 sgv_uncheck_allowed_mem(mem_lim, pages_to_alloc);
1139         goto out;
1140 }
1141 EXPORT_SYMBOL(sgv_pool_alloc);
1142
1143 void *sgv_get_priv(struct sgv_pool_obj *obj)
1144 {
1145         return obj->allocator_priv;
1146 }
1147 EXPORT_SYMBOL(sgv_get_priv);
1148
1149 void sgv_pool_free(struct sgv_pool_obj *obj, struct scst_mem_lim *mem_lim)
1150 {
1151         int pages = (obj->sg_count != 0) ? obj->pages : 0;
1152
1153         TRACE_MEM("Freeing obj %p, cache num %d, pages %d, sg_entries %p, "
1154                 "sg_count %d, allocator_priv %p", obj, obj->cache_num, pages,
1155                 obj->sg_entries, obj->sg_count, obj->allocator_priv);
1156         if (obj->cache_num >= 0) {
1157                 obj->sg_entries[obj->orig_sg].length = obj->orig_length;
1158                 sgv_put_obj(obj);
1159         } else {
1160                 obj->owner_pool->alloc_fns.free_pages_fn(obj->sg_entries,
1161                         obj->sg_count, obj->allocator_priv);
1162                 kfree(obj);
1163                 sgv_hiwmk_uncheck(pages);
1164         }
1165
1166         sgv_uncheck_allowed_mem(mem_lim, pages);
1167         return;
1168 }
1169 EXPORT_SYMBOL(sgv_pool_free);
1170
1171 struct scatterlist *scst_alloc(int size, gfp_t gfp_mask, int *count)
1172 {
1173         struct scatterlist *res;
1174         int pages = (size >> PAGE_SHIFT) + ((size & ~PAGE_MASK) != 0);
1175         struct sgv_pool_alloc_fns sys_alloc_fns = {
1176                 sgv_alloc_sys_pages, sgv_free_sys_sg_entries };
1177         int no_fail = ((gfp_mask & __GFP_NOFAIL) == __GFP_NOFAIL);
1178
1179         TRACE_ENTRY();
1180
1181         atomic_inc(&sgv_other_total_alloc);
1182
1183         if (unlikely(sgv_hiwmk_check(pages) != 0)) {
1184                 if (!no_fail) {
1185                         res = NULL;
1186                         goto out;
1187                 } else {
1188                         /*
1189                          * Update active_pages_total since alloc can't fail.
1190                          * If it wasn't updated then the counter would cross 0
1191                          * on free again.
1192                          */
1193                         sgv_hiwmk_uncheck(-pages);
1194                  }
1195         }
1196
1197         res = kmalloc(pages*sizeof(*res), gfp_mask);
1198         if (res == NULL) {
1199                 TRACE(TRACE_OUT_OF_MEM, "Unable to allocate sg for %d pages",
1200                         pages);
1201                 goto out_uncheck;
1202         }
1203
1204         sg_init_table(res, pages);
1205
1206         /*
1207          * If we allow use clustering here, we will have troubles in
1208          * scst_free() to figure out how many pages are in the SG vector.
1209          * So, always don't use clustering.
1210          */
1211         *count = sgv_alloc_sg_entries(res, pages, gfp_mask, sgv_no_clustering,
1212                         NULL, &sys_alloc_fns, NULL);
1213         if (*count <= 0)
1214                 goto out_free;
1215
1216 out:
1217         TRACE_MEM("Alloced sg %p (count %d) \"no fail\" %d", res, *count, no_fail);
1218
1219         TRACE_EXIT_HRES(res);
1220         return res;
1221
1222 out_free:
1223         kfree(res);
1224         res = NULL;
1225
1226 out_uncheck:
1227         if (!no_fail)
1228                 sgv_hiwmk_uncheck(pages);
1229         goto out;
1230 }
1231 EXPORT_SYMBOL(scst_alloc);
1232
1233 void scst_free(struct scatterlist *sg, int count)
1234 {
1235         TRACE_MEM("Freeing sg=%p", sg);
1236
1237         sgv_hiwmk_uncheck(count);
1238
1239         sgv_free_sys_sg_entries(sg, count, NULL);
1240         kfree(sg);
1241         return;
1242 }
1243 EXPORT_SYMBOL(scst_free);
1244
1245 /* Must be called under sgv_pools_mutex */
1246 static void sgv_pool_init_cache(struct sgv_pool *pool, int cache_num)
1247 {
1248         int size;
1249         int pages;
1250         struct sgv_pool_obj *obj;
1251
1252         atomic_set(&pool->cache_acc[cache_num].total_alloc, 0);
1253         atomic_set(&pool->cache_acc[cache_num].hit_alloc, 0);
1254         atomic_set(&pool->cache_acc[cache_num].merged, 0);
1255
1256         if (pool->single_alloc_pages == 0)
1257                 pages = 1 << cache_num;
1258         else
1259                 pages = pool->single_alloc_pages;
1260
1261         if (pages <= sgv_max_local_pages) {
1262                 size = sizeof(*obj) + pages *
1263                         (sizeof(obj->sg_entries[0]) +
1264                          ((pool->clustering_type != sgv_no_clustering) ?
1265                                 sizeof(obj->trans_tbl[0]) : 0));
1266         } else if (pages <= sgv_max_trans_pages) {
1267                 /*
1268                  * sg_entries is allocated outside object,
1269                  * but trans_tbl is still embedded.
1270                  */
1271                 size = sizeof(*obj) + pages *
1272                         (((pool->clustering_type != sgv_no_clustering) ?
1273                                 sizeof(obj->trans_tbl[0]) : 0));
1274         } else {
1275                 size = sizeof(*obj);
1276                 /* both sgv and trans_tbl are kmalloc'ed() */
1277         }
1278
1279         TRACE_MEM("pages=%d, size=%d", pages, size);
1280
1281         scnprintf(pool->cache_names[cache_num],
1282                 sizeof(pool->cache_names[cache_num]),
1283                 "%s-%uK", pool->name, (pages << PAGE_SHIFT) >> 10);
1284         pool->caches[cache_num] = kmem_cache_create(
1285                 pool->cache_names[cache_num], size, 0, SCST_SLAB_FLAGS, NULL
1286 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 23))
1287                 , NULL);
1288 #else
1289                 );
1290 #endif
1291         return;
1292 }
1293
1294 /* Must be called under sgv_pools_mutex */
1295 static int sgv_pool_init(struct sgv_pool *pool, const char *name,
1296         enum sgv_clustering_types clustering_type, int single_alloc_pages,
1297         int purge_interval)
1298 {
1299         int res = -ENOMEM;
1300         int i;
1301
1302         TRACE_ENTRY();
1303
1304         if (single_alloc_pages < 0) {
1305                 PRINT_ERROR("Wrong single_alloc_pages value %d",
1306                         single_alloc_pages);
1307                 res = -EINVAL;
1308                 goto out;
1309         }
1310
1311         memset(pool, 0, sizeof(*pool));
1312
1313         atomic_set(&pool->big_alloc, 0);
1314         atomic_set(&pool->big_pages, 0);
1315         atomic_set(&pool->big_merged, 0);
1316         atomic_set(&pool->other_alloc, 0);
1317         atomic_set(&pool->other_pages, 0);
1318         atomic_set(&pool->other_merged, 0);
1319
1320         pool->clustering_type = clustering_type;
1321         pool->single_alloc_pages = single_alloc_pages;
1322         if (purge_interval != 0) {
1323                 pool->purge_interval = purge_interval;
1324                 if (purge_interval < 0) {
1325                         /* Let's pretend that it's always scheduled */
1326                         pool->purge_work_scheduled = 1;
1327                 }
1328         } else
1329                 pool->purge_interval = SGV_DEFAULT_PURGE_INTERVAL;
1330         if (single_alloc_pages == 0) {
1331                 pool->max_caches = SGV_POOL_ELEMENTS;
1332                 pool->max_cached_pages = 1 << SGV_POOL_ELEMENTS;
1333         } else {
1334                 pool->max_caches = 1;
1335                 pool->max_cached_pages = single_alloc_pages;
1336         }
1337         pool->alloc_fns.alloc_pages_fn = sgv_alloc_sys_pages;
1338         pool->alloc_fns.free_pages_fn = sgv_free_sys_sg_entries;
1339
1340         TRACE_MEM("name %s, sizeof(*obj)=%zd, clustering_type=%d, "
1341                 "single_alloc_pages=%d, max_caches=%d, max_cached_pages=%d",
1342                 name, sizeof(struct sgv_pool_obj), clustering_type,
1343                 single_alloc_pages, pool->max_caches, pool->max_cached_pages);
1344
1345         strncpy(pool->name, name, sizeof(pool->name)-1);
1346         pool->name[sizeof(pool->name)-1] = '\0';
1347
1348         pool->owner_mm = current->mm;
1349
1350         for (i = 0; i < pool->max_caches; i++) {
1351                 sgv_pool_init_cache(pool, i);
1352                 if (pool->caches[i] == NULL) {
1353                         TRACE(TRACE_OUT_OF_MEM, "Allocation of sgv_pool "
1354                                 "cache %s(%d) failed", name, i);
1355                         goto out_free;
1356                 }
1357         }
1358
1359         atomic_set(&pool->sgv_pool_ref, 1);
1360         spin_lock_init(&pool->sgv_pool_lock);
1361         INIT_LIST_HEAD(&pool->sorted_recycling_list);
1362         for (i = 0; i < pool->max_caches; i++)
1363                 INIT_LIST_HEAD(&pool->recycling_lists[i]);
1364
1365 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 20))
1366         INIT_DELAYED_WORK(&pool->sgv_purge_work,
1367                 (void (*)(struct work_struct *))sgv_purge_work_fn);
1368 #else
1369         INIT_WORK(&pool->sgv_purge_work, sgv_purge_work_fn, pool);
1370 #endif
1371
1372         spin_lock_bh(&sgv_pools_lock);
1373         list_add_tail(&pool->sgv_pools_list_entry, &sgv_pools_list);
1374         spin_unlock_bh(&sgv_pools_lock);
1375
1376         res = 0;
1377
1378 out:
1379         TRACE_EXIT_RES(res);
1380         return res;
1381
1382 out_free:
1383         for (i = 0; i < pool->max_caches; i++) {
1384                 if (pool->caches[i]) {
1385                         kmem_cache_destroy(pool->caches[i]);
1386                         pool->caches[i] = NULL;
1387                 } else
1388                         break;
1389         }
1390         goto out;
1391 }
1392
1393 static void sgv_evaluate_local_max_pages(void)
1394 {
1395         int space4sgv_ttbl = PAGE_SIZE - sizeof(struct sgv_pool_obj);
1396
1397         sgv_max_local_pages = space4sgv_ttbl /
1398                   (sizeof(struct trans_tbl_ent) + sizeof(struct scatterlist));
1399
1400         sgv_max_trans_pages =  space4sgv_ttbl / sizeof(struct trans_tbl_ent);
1401
1402         TRACE_MEM("sgv_max_local_pages %d, sgv_max_trans_pages %d",
1403                 sgv_max_local_pages, sgv_max_trans_pages);
1404         return;
1405 }
1406
1407 void sgv_pool_flush(struct sgv_pool *pool)
1408 {
1409         int i;
1410
1411         TRACE_ENTRY();
1412
1413         for (i = 0; i < pool->max_caches; i++) {
1414                 struct sgv_pool_obj *obj;
1415
1416                 spin_lock_bh(&pool->sgv_pool_lock);
1417
1418                 while (!list_empty(&pool->recycling_lists[i])) {
1419                         obj = list_entry(pool->recycling_lists[i].next,
1420                                 struct sgv_pool_obj, recycling_list_entry);
1421
1422                         __sgv_purge_from_cache(obj);
1423
1424                         spin_unlock_bh(&pool->sgv_pool_lock);
1425
1426                         EXTRACHECKS_BUG_ON(obj->owner_pool != pool);
1427                         sgv_dtor_and_free(obj);
1428
1429                         spin_lock_bh(&pool->sgv_pool_lock);
1430                 }
1431                 spin_unlock_bh(&pool->sgv_pool_lock);
1432         }
1433
1434         TRACE_EXIT();
1435         return;
1436 }
1437 EXPORT_SYMBOL(sgv_pool_flush);
1438
1439 static void sgv_pool_deinit_put(struct sgv_pool *pool)
1440 {
1441         int i;
1442
1443         TRACE_ENTRY();
1444
1445         cancel_delayed_work_sync(&pool->sgv_purge_work);
1446
1447         sgv_pool_flush(pool);
1448
1449         mutex_lock(&sgv_pools_mutex);
1450         spin_lock_bh(&sgv_pools_lock);
1451         list_del(&pool->sgv_pools_list_entry);
1452         spin_unlock_bh(&sgv_pools_lock);
1453         mutex_unlock(&sgv_pools_mutex);
1454
1455         for (i = 0; i < pool->max_caches; i++) {
1456                 if (pool->caches[i])
1457                         kmem_cache_destroy(pool->caches[i]);
1458                 pool->caches[i] = NULL;
1459         }
1460
1461         scst_sgv_sysfs_put(pool);
1462
1463         /* pool can be dead here */
1464
1465         TRACE_EXIT();
1466         return;
1467 }
1468
1469 void sgv_pool_set_allocator(struct sgv_pool *pool,
1470         struct page *(*alloc_pages_fn)(struct scatterlist *, gfp_t, void *),
1471         void (*free_pages_fn)(struct scatterlist *, int, void *))
1472 {
1473         pool->alloc_fns.alloc_pages_fn = alloc_pages_fn;
1474         pool->alloc_fns.free_pages_fn = free_pages_fn;
1475         return;
1476 }
1477 EXPORT_SYMBOL(sgv_pool_set_allocator);
1478
1479 struct sgv_pool *sgv_pool_create(const char *name,
1480         enum sgv_clustering_types clustering_type,
1481         int single_alloc_pages, bool shared, int purge_interval)
1482 {
1483         struct sgv_pool *pool;
1484         int rc;
1485
1486         TRACE_ENTRY();
1487
1488         mutex_lock(&sgv_pools_mutex);
1489         list_for_each_entry(pool, &sgv_pools_list, sgv_pools_list_entry) {
1490                 if (strcmp(pool->name, name) == 0) {
1491                         if (shared) {
1492                                 if (pool->owner_mm != current->mm) {
1493                                         PRINT_ERROR("Attempt of a shared use "
1494                                                 "of SGV pool %s with "
1495                                                 "different MM", name);
1496                                         goto out_err_unlock;
1497                                 }
1498                                 sgv_pool_get(pool);
1499                                 goto out_unlock;
1500                         } else {
1501                                 PRINT_ERROR("SGV pool %s already exists", name);
1502                                 goto out_err_unlock;
1503                         }
1504                 }
1505         }
1506
1507         pool = kzalloc(sizeof(*pool), GFP_KERNEL);
1508         if (pool == NULL) {
1509                 TRACE(TRACE_OUT_OF_MEM, "%s", "Allocation of sgv_pool failed");
1510                 goto out_unlock;
1511         }
1512
1513         rc = sgv_pool_init(pool, name, clustering_type, single_alloc_pages,
1514                                 purge_interval);
1515         if (rc != 0)
1516                 goto out_free_unlock;
1517
1518         rc = scst_create_sgv_sysfs(pool);
1519         if (rc != 0)
1520                 goto out_err_unlock_put;
1521
1522 out_unlock:
1523         mutex_unlock(&sgv_pools_mutex);
1524
1525         TRACE_EXIT_RES(pool != NULL);
1526         return pool;
1527
1528 out_free_unlock:
1529         kfree(pool);
1530
1531 out_err_unlock:
1532         pool = NULL;
1533         goto out_unlock;
1534
1535 out_err_unlock_put:
1536         mutex_unlock(&sgv_pools_mutex);
1537         sgv_pool_deinit_put(pool);
1538         goto out_err_unlock;
1539 }
1540 EXPORT_SYMBOL(sgv_pool_create);
1541
1542 void sgv_pool_destroy(struct sgv_pool *pool)
1543 {
1544         TRACE_ENTRY();
1545
1546         kfree(pool);
1547
1548         TRACE_EXIT();
1549         return;
1550 }
1551
1552 void sgv_pool_get(struct sgv_pool *pool)
1553 {
1554         atomic_inc(&pool->sgv_pool_ref);
1555         TRACE_MEM("Incrementing sgv pool %p ref (new value %d)",
1556                 pool, atomic_read(&pool->sgv_pool_ref));
1557         return;
1558 }
1559 EXPORT_SYMBOL(sgv_pool_get);
1560
1561 void sgv_pool_put(struct sgv_pool *pool)
1562 {
1563         TRACE_MEM("Decrementing sgv pool %p ref (new value %d)",
1564                 pool, atomic_read(&pool->sgv_pool_ref)-1);
1565         if (atomic_dec_and_test(&pool->sgv_pool_ref))
1566                 sgv_pool_deinit_put(pool);
1567         return;
1568 }
1569 EXPORT_SYMBOL(sgv_pool_put);
1570
1571 void sgv_pool_del(struct sgv_pool *pool)
1572 {
1573         TRACE_ENTRY();
1574
1575         sgv_pool_put(pool);
1576
1577         TRACE_EXIT();
1578         return;
1579 }
1580 EXPORT_SYMBOL(sgv_pool_del);
1581
1582 /* Both parameters in pages */
1583 int scst_sgv_pools_init(unsigned long mem_hwmark, unsigned long mem_lwmark)
1584 {
1585         int res = 0;
1586
1587         TRACE_ENTRY();
1588
1589         sgv_hi_wmk = mem_hwmark;
1590         sgv_lo_wmk = mem_lwmark;
1591
1592         sgv_evaluate_local_max_pages();
1593
1594         sgv_norm_pool = sgv_pool_create("sgv", sgv_no_clustering, 0, false, 0);
1595         if (sgv_norm_pool == NULL)
1596                 goto out_err;
1597
1598         sgv_norm_clust_pool = sgv_pool_create("sgv-clust",
1599                 sgv_full_clustering, 0, false, 0);
1600         if (sgv_norm_clust_pool == NULL)
1601                 goto out_free_norm;
1602
1603         sgv_dma_pool = sgv_pool_create("sgv-dma", sgv_no_clustering, 0,
1604                                 false, 0);
1605         if (sgv_dma_pool == NULL)
1606                 goto out_free_clust;
1607
1608 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 23))
1609         sgv_shrinker = set_shrinker(DEFAULT_SEEKS, sgv_shrink);
1610 #else
1611         sgv_shrinker.shrink = sgv_shrink;
1612         sgv_shrinker.seeks = DEFAULT_SEEKS;
1613         register_shrinker(&sgv_shrinker);
1614 #endif
1615
1616 out:
1617         TRACE_EXIT_RES(res);
1618         return res;
1619
1620 out_free_clust:
1621         sgv_pool_deinit_put(sgv_norm_clust_pool);
1622
1623 out_free_norm:
1624         sgv_pool_deinit_put(sgv_norm_pool);
1625
1626 out_err:
1627         res = -ENOMEM;
1628         goto out;
1629 }
1630
1631 void scst_sgv_pools_deinit(void)
1632 {
1633         TRACE_ENTRY();
1634
1635 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 23))
1636         remove_shrinker(sgv_shrinker);
1637 #else
1638         unregister_shrinker(&sgv_shrinker);
1639 #endif
1640
1641         sgv_pool_deinit_put(sgv_dma_pool);
1642         sgv_pool_deinit_put(sgv_norm_pool);
1643         sgv_pool_deinit_put(sgv_norm_clust_pool);
1644
1645         flush_scheduled_work();
1646
1647         TRACE_EXIT();
1648         return;
1649 }
1650
1651 #ifdef CONFIG_SCST_PROC
1652
1653 static void sgv_do_proc_read(struct seq_file *seq, const struct sgv_pool *pool)
1654 {
1655         int i, total = 0, hit = 0, merged = 0, allocated = 0;
1656         int oa, om;
1657
1658         for (i = 0; i < pool->max_caches; i++) {
1659                 int t;
1660
1661                 hit += atomic_read(&pool->cache_acc[i].hit_alloc);
1662                 total += atomic_read(&pool->cache_acc[i].total_alloc);
1663
1664                 t = atomic_read(&pool->cache_acc[i].total_alloc) -
1665                         atomic_read(&pool->cache_acc[i].hit_alloc);
1666                 if (pool->single_alloc_pages == 0)
1667                         allocated += t * (1 << i);
1668                 else
1669                         allocated += t * pool->single_alloc_pages;
1670                 merged += atomic_read(&pool->cache_acc[i].merged);
1671         }
1672
1673         seq_printf(seq, "\n%-30s %-11d %-11d %-11d %d/%d/%d\n", pool->name,
1674                 hit, total, (allocated != 0) ? merged*100/allocated : 0,
1675                 pool->cached_pages, pool->inactive_cached_pages,
1676                 pool->cached_entries);
1677
1678         for (i = 0; i < pool->max_caches; i++) {
1679                 int t = atomic_read(&pool->cache_acc[i].total_alloc) -
1680                         atomic_read(&pool->cache_acc[i].hit_alloc);
1681                 if (pool->single_alloc_pages == 0)
1682                         allocated = t * (1 << i);
1683                 else
1684                         allocated = t * pool->single_alloc_pages;
1685                 merged = atomic_read(&pool->cache_acc[i].merged);
1686
1687                 seq_printf(seq, "  %-28s %-11d %-11d %d\n",
1688                         pool->cache_names[i],
1689                         atomic_read(&pool->cache_acc[i].hit_alloc),
1690                         atomic_read(&pool->cache_acc[i].total_alloc),
1691                         (allocated != 0) ? merged*100/allocated : 0);
1692         }
1693
1694         allocated = atomic_read(&pool->big_pages);
1695         merged = atomic_read(&pool->big_merged);
1696         oa = atomic_read(&pool->other_pages);
1697         om = atomic_read(&pool->other_merged);
1698
1699         seq_printf(seq, "  %-40s %d/%-9d %d/%d\n", "big/other",
1700                 atomic_read(&pool->big_alloc), atomic_read(&pool->other_alloc),
1701                 (allocated != 0) ? merged*100/allocated : 0,
1702                 (oa != 0) ? om/oa : 0);
1703
1704         return;
1705 }
1706
1707 int sgv_procinfo_show(struct seq_file *seq, void *v)
1708 {
1709         struct sgv_pool *pool;
1710         int inactive_pages = 0;
1711
1712         TRACE_ENTRY();
1713
1714         spin_lock_bh(&sgv_pools_lock);
1715         list_for_each_entry(pool, &sgv_active_pools_list,
1716                         sgv_active_pools_list_entry) {
1717                 inactive_pages += pool->inactive_cached_pages;
1718         }
1719         spin_unlock_bh(&sgv_pools_lock);
1720
1721         seq_printf(seq, "%-42s %d/%d\n%-42s %d/%d\n%-42s %d/%d\n\n",
1722                 "Inactive/active pages", inactive_pages,
1723                 atomic_read(&sgv_pages_total) - inactive_pages,
1724                 "Hi/lo watermarks [pages]", sgv_hi_wmk, sgv_lo_wmk,
1725                 "Hi watermark releases/failures",
1726                 atomic_read(&sgv_releases_on_hiwmk),
1727                 atomic_read(&sgv_releases_on_hiwmk_failed));
1728
1729         seq_printf(seq, "%-30s %-11s %-11s %-11s %-11s", "Name", "Hit", "Total",
1730                 "% merged", "Cached (P/I/O)");
1731
1732         mutex_lock(&sgv_pools_mutex);
1733         list_for_each_entry(pool, &sgv_pools_list, sgv_pools_list_entry) {
1734                 sgv_do_proc_read(seq, pool);
1735         }
1736         mutex_unlock(&sgv_pools_mutex);
1737
1738         seq_printf(seq, "\n%-42s %-11d\n", "other",
1739                 atomic_read(&sgv_other_total_alloc));
1740
1741         TRACE_EXIT();
1742         return 0;
1743 }
1744
1745 #else /* CONFIG_SCST_PROC */
1746
1747 ssize_t sgv_sysfs_stat_show(struct kobject *kobj,
1748         struct kobj_attribute *attr, char *buf)
1749 {
1750         struct sgv_pool *pool;
1751         int i, total = 0, hit = 0, merged = 0, allocated = 0;
1752         int oa, om, res;
1753
1754         pool = container_of(kobj, struct sgv_pool, sgv_kobj);
1755
1756         for (i = 0; i < SGV_POOL_ELEMENTS; i++) {
1757                 int t;
1758
1759                 hit += atomic_read(&pool->cache_acc[i].hit_alloc);
1760                 total += atomic_read(&pool->cache_acc[i].total_alloc);
1761
1762                 t = atomic_read(&pool->cache_acc[i].total_alloc) -
1763                         atomic_read(&pool->cache_acc[i].hit_alloc);
1764                 allocated += t * (1 << i);
1765                 merged += atomic_read(&pool->cache_acc[i].merged);
1766         }
1767
1768         res = sprintf(buf, "%-30s %-11s %-11s %-11s %-11s", "Name", "Hit", "Total",
1769                 "% merged", "Cached (P/I/O)");
1770
1771         res += sprintf(&buf[res], "\n%-30s %-11d %-11d %-11d %d/%d/%d\n",
1772                 pool->name, hit, total,
1773                 (allocated != 0) ? merged*100/allocated : 0,
1774                 pool->cached_pages, pool->inactive_cached_pages,
1775                 pool->cached_entries);
1776
1777         for (i = 0; i < SGV_POOL_ELEMENTS; i++) {
1778                 int t = atomic_read(&pool->cache_acc[i].total_alloc) -
1779                         atomic_read(&pool->cache_acc[i].hit_alloc);
1780                 allocated = t * (1 << i);
1781                 merged = atomic_read(&pool->cache_acc[i].merged);
1782
1783                 res += sprintf(&buf[res], "  %-28s %-11d %-11d %d\n",
1784                         pool->cache_names[i],
1785                         atomic_read(&pool->cache_acc[i].hit_alloc),
1786                         atomic_read(&pool->cache_acc[i].total_alloc),
1787                         (allocated != 0) ? merged*100/allocated : 0);
1788         }
1789
1790         allocated = atomic_read(&pool->big_pages);
1791         merged = atomic_read(&pool->big_merged);
1792         oa = atomic_read(&pool->other_pages);
1793         om = atomic_read(&pool->other_merged);
1794
1795         res += sprintf(&buf[res], "  %-40s %d/%-9d %d/%d\n", "big/other",
1796                 atomic_read(&pool->big_alloc), atomic_read(&pool->other_alloc),
1797                 (allocated != 0) ? merged*100/allocated : 0,
1798                 (oa != 0) ? om/oa : 0);
1799
1800         return res;
1801 }
1802
1803 ssize_t sgv_sysfs_global_stat_show(struct kobject *kobj,
1804         struct kobj_attribute *attr, char *buf)
1805 {
1806         struct sgv_pool *pool;
1807         int inactive_pages = 0, res;
1808
1809         TRACE_ENTRY();
1810
1811         spin_lock_bh(&sgv_pools_lock);
1812         list_for_each_entry(pool, &sgv_active_pools_list,
1813                         sgv_active_pools_list_entry) {
1814                 inactive_pages += pool->inactive_cached_pages;
1815         }
1816         spin_unlock_bh(&sgv_pools_lock);
1817
1818         res = sprintf(buf, "%-42s %d/%d\n%-42s %d/%d\n%-42s %d/%d\n"
1819                 "%-42s %-11d\n",
1820                 "Inactive/active pages", inactive_pages,
1821                 atomic_read(&sgv_pages_total) - inactive_pages,
1822                 "Hi/lo watermarks [pages]", sgv_hi_wmk, sgv_lo_wmk,
1823                 "Hi watermark releases/failures",
1824                 atomic_read(&sgv_releases_on_hiwmk),
1825                 atomic_read(&sgv_releases_on_hiwmk_failed),
1826                 "Other allocs", atomic_read(&sgv_other_total_alloc));
1827
1828         TRACE_EXIT();
1829         return res;
1830 }
1831
1832 #endif /* CONFIG_SCST_PROC */