1 /*
2  *  scst_mem.c
3  *
4  *  Copyright (C) 2006 - 2009 Vladislav Bolkhovitin <vst@vlnb.net>
5  *  Copyright (C) 2007 - 2009 ID7 Ltd.
6  *
7  *  This program is free software; you can redistribute it and/or
8  *  modify it under the terms of the GNU General Public License
9  *  as published by the Free Software Foundation, version 2
10  *  of the License.
11  *
12  *  This program is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  *  GNU General Public License for more details.
16  */
17
18 #include <linux/init.h>
19 #include <linux/kernel.h>
20 #include <linux/errno.h>
21 #include <linux/list.h>
22 #include <linux/spinlock.h>
23 #include <linux/slab.h>
24 #include <linux/sched.h>
25 #include <linux/mm.h>
26 #include <linux/unistd.h>
27 #include <linux/string.h>
28
29 #include "scst.h"
30 #include "scst_priv.h"
31 #include "scst_mem.h"
32
33 #define SGV_DEFAULT_PURGE_INTERVAL      (60 * HZ)
34 #define SGV_MIN_SHRINK_INTERVAL         (1 * HZ)
35
36 /* Max pages freed from a pool per shrinking iteration */
37 #define MAX_PAGES_PER_POOL      50
38
39 static struct sgv_pool sgv_norm_clust_pool, sgv_norm_pool, sgv_dma_pool;
40
41 static atomic_t sgv_pages_total = ATOMIC_INIT(0);
42
43 /* Both read-only */
44 static int sgv_hi_wmk;
45 static int sgv_lo_wmk;
46
47 static int sgv_max_local_pages, sgv_max_trans_pages;
48
49 static DEFINE_SPINLOCK(sgv_pools_lock); /* inner lock for sgv_pool_lock! */
50 static DEFINE_MUTEX(sgv_pools_mutex);
51
52 /* Both protected by sgv_pools_lock */
53 static struct sgv_pool *sgv_cur_purge_pool;
54 static LIST_HEAD(sgv_active_pools_list);
55
56 static atomic_t sgv_releases_on_hiwmk = ATOMIC_INIT(0);
57 static atomic_t sgv_releases_on_hiwmk_failed = ATOMIC_INIT(0);
58
59 static atomic_t sgv_other_total_alloc = ATOMIC_INIT(0);
60
61 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 23))
62 static struct shrinker *sgv_shrinker;
63 #else
64 static struct shrinker sgv_shrinker;
65 #endif
66
67 /*
68  * Protected by sgv_pools_mutex AND sgv_pools_lock for writes,
69  * either one for reads.
70  */
71 static LIST_HEAD(sgv_pools_list);
72
73 static void sgv_pool_get(struct sgv_pool *pool);
74 static void sgv_pool_put(struct sgv_pool *pool);
75
76 static inline bool sgv_pool_clustered(const struct sgv_pool *pool)
77 {
78         return pool->clustering_type != sgv_no_clustering;
79 }
80
81 void scst_sgv_pool_use_norm(struct scst_tgt_dev *tgt_dev)
82 {
83         tgt_dev->gfp_mask = __GFP_NOWARN;
84         tgt_dev->pool = &sgv_norm_pool;
85         clear_bit(SCST_TGT_DEV_CLUST_POOL, &tgt_dev->tgt_dev_flags);
86 }
87
88 void scst_sgv_pool_use_norm_clust(struct scst_tgt_dev *tgt_dev)
89 {
90         TRACE_MEM("%s", "Use clustering");
91         tgt_dev->gfp_mask = __GFP_NOWARN;
92         tgt_dev->pool = &sgv_norm_clust_pool;
93         set_bit(SCST_TGT_DEV_CLUST_POOL, &tgt_dev->tgt_dev_flags);
94 }
95
96 void scst_sgv_pool_use_dma(struct scst_tgt_dev *tgt_dev)
97 {
98         TRACE_MEM("%s", "Use ISA DMA memory");
99         tgt_dev->gfp_mask = __GFP_NOWARN | GFP_DMA;
100         tgt_dev->pool = &sgv_dma_pool;
101         clear_bit(SCST_TGT_DEV_CLUST_POOL, &tgt_dev->tgt_dev_flags);
102 }
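
/*
 * Usage sketch for the three helpers above (illustrative only): a dev
 * handler typically calls one of them while initializing a tgt_dev,
 * choosing the clustered pool only when its backend can take advantage
 * of merged SG entries.  "virt_dev" and its flag are hypothetical names
 * used purely for illustration:
 *
 *	if (virt_dev->backend_supports_clustering)
 *		scst_sgv_pool_use_norm_clust(tgt_dev);
 *	else
 *		scst_sgv_pool_use_norm(tgt_dev);
 */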
103
104 /* Must be called with no locks held */
105 static void sgv_dtor_and_free(struct sgv_pool_obj *obj)
106 {
107         struct sgv_pool *pool = obj->owner_pool;
108
109         TRACE_MEM("Destroying sgv obj %p", obj);
110
111         if (obj->sg_count != 0) {
112                 pool->alloc_fns.free_pages_fn(obj->sg_entries,
113                         obj->sg_count, obj->allocator_priv);
114         }
115         if (obj->sg_entries != obj->sg_entries_data) {
116                 if (obj->trans_tbl !=
117                     (struct trans_tbl_ent *)obj->sg_entries_data) {
118                         /* kfree() handles NULL parameter */
119                         kfree(obj->trans_tbl);
120                         obj->trans_tbl = NULL;
121                 }
122                 kfree(obj->sg_entries);
123         }
124
125         kmem_cache_free(pool->caches[obj->cache_num], obj);
126         return;
127 }
128
129 /* Might be called under sgv_pool_lock */
130 static inline void sgv_del_from_active(struct sgv_pool *pool)
131 {
132         struct list_head *next;
133
134         TRACE_MEM("Deleting sgv pool %p from the active list", pool);
135
136         spin_lock_bh(&sgv_pools_lock);
137
138         next = pool->sgv_active_pools_list_entry.next;
139         list_del(&pool->sgv_active_pools_list_entry);
140
141         if (sgv_cur_purge_pool == pool) {
142                 TRACE_MEM("Sgv pool %p is sgv cur purge pool", pool);
143
144                 if (next == &sgv_active_pools_list)
145                         next = next->next;
146
147                 if (next == &sgv_active_pools_list) {
148                         sgv_cur_purge_pool = NULL;
149                         TRACE_MEM("%s", "Sgv active list now empty");
150                 } else {
151                         sgv_cur_purge_pool = list_entry(next, typeof(*pool),
152                                 sgv_active_pools_list_entry);
153                         TRACE_MEM("New sgv cur purge pool %p",
154                                 sgv_cur_purge_pool);
155                 }
156         }
157
158         spin_unlock_bh(&sgv_pools_lock);
159         return;
160 }
161
162 /* Must be called with sgv_pool_lock held */
163 static void sgv_dec_cached_entries(struct sgv_pool *pool, int pages)
164 {
165         pool->cached_entries--;
166         pool->cached_pages -= pages;
167
168         if (pool->cached_entries == 0)
169                 sgv_del_from_active(pool);
170
171         return;
172 }
173
174 /* Must be called with sgv_pool_lock held */
175 static void __sgv_purge_from_cache(struct sgv_pool_obj *obj)
176 {
177         int pages = obj->pages;
178         struct sgv_pool *pool = obj->owner_pool;
179
180         TRACE_MEM("Purging sgv obj %p from pool %p (new cached_entries %d)",
181                 obj, pool, pool->cached_entries-1);
182
183         list_del(&obj->sorted_recycling_list_entry);
184         list_del(&obj->recycling_list_entry);
185
186         pool->inactive_cached_pages -= pages;
187         sgv_dec_cached_entries(pool, pages);
188
189         atomic_sub(pages, &sgv_pages_total);
190
191         return;
192 }
193
194 /* Must be called with sgv_pool_lock held */
195 static bool sgv_purge_from_cache(struct sgv_pool_obj *obj, int min_interval,
196         unsigned long cur_time)
197 {
198         EXTRACHECKS_BUG_ON(min_interval < 0);
199
200         TRACE_MEM("Checking if sgv obj %p should be purged (cur time %ld, "
201                 "obj time %ld, time to purge %ld)", obj, cur_time,
202                 obj->time_stamp, obj->time_stamp + min_interval);
203
204         if (time_after_eq(cur_time, (obj->time_stamp + min_interval))) {
205                 __sgv_purge_from_cache(obj);
206                 return true;
207         }
208         return false;
209 }
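
/*
 * Worked example of the check above (numbers purely illustrative): with
 * HZ == 1000, an object whose time_stamp was set at jiffies == 10000
 * and min_interval == SGV_MIN_SHRINK_INTERVAL (1 * HZ) becomes
 * purgeable once jiffies reaches 11000.  time_after_eq() is used
 * instead of a plain ">=" so the comparison stays correct across
 * jiffies counter wraparound.
 */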
210
211 /* No locks */
212 static int sgv_shrink_pool(struct sgv_pool *pool, int nr, int min_interval,
213         unsigned long cur_time)
214 {
215         int freed = 0;
216
217         TRACE_ENTRY();
218
219         TRACE_MEM("Trying to shrink pool %p (nr %d, min_interval %d)",
220                 pool, nr, min_interval);
221
222         if (pool->purge_interval < 0) {
223                 TRACE_MEM("Not shrinkable pool %p, skipping", pool);
224                 goto out;
225         }
226
227         spin_lock_bh(&pool->sgv_pool_lock);
228
229         while (!list_empty(&pool->sorted_recycling_list) &&
230                         (atomic_read(&sgv_pages_total) > sgv_lo_wmk)) {
231                 struct sgv_pool_obj *obj = list_entry(
232                         pool->sorted_recycling_list.next,
233                         struct sgv_pool_obj, sorted_recycling_list_entry);
234
235                 if (sgv_purge_from_cache(obj, min_interval, cur_time)) {
236                         int pages = obj->pages;
237
238                         freed += pages;
239                         nr -= pages;
240
241                         TRACE_MEM("%d pages purged from pool %p (nr left %d, "
242                                 "total freed %d)", pages, pool, nr, freed);
243
244                         spin_unlock_bh(&pool->sgv_pool_lock);
245                         sgv_dtor_and_free(obj);
246                         spin_lock_bh(&pool->sgv_pool_lock);
247                 } else
248                         break;
249
250                 if ((nr <= 0) || (freed >= MAX_PAGES_PER_POOL)) {
251                         if (freed >= MAX_PAGES_PER_POOL)
252                                 TRACE_MEM("%d pages purged from pool %p, "
253                                         "leaving", freed, pool);
254                         break;
255                 }
256         }
257
258         spin_unlock_bh(&pool->sgv_pool_lock);
259
260 out:
261         TRACE_EXIT_RES(nr);
262         return nr;
263 }
264
265 /* No locks */
266 static int __sgv_shrink(int nr, int min_interval)
267 {
268         struct sgv_pool *pool;
269         unsigned long cur_time = jiffies;
270         int prev_nr = nr;
271         bool circle = false;
272
273         TRACE_ENTRY();
274
275         TRACE_MEM("Trying to shrink %d pages from all sgv pools "
276                 "(min_interval %d)", nr, min_interval);
277
278         while (nr > 0) {
279                 struct list_head *next;
280
281                 spin_lock_bh(&sgv_pools_lock);
282
283                 pool = sgv_cur_purge_pool;
284                 if (pool == NULL) {
285                         if (list_empty(&sgv_active_pools_list)) {
286                                 TRACE_MEM("%s", "Active pools list is empty");
287                                 goto out_unlock;
288                         }
289
290                         pool = list_entry(sgv_active_pools_list.next,
291                                         typeof(*pool),
292                                         sgv_active_pools_list_entry);
293                 }
294                 sgv_pool_get(pool);
295
296                 next = pool->sgv_active_pools_list_entry.next;
297                 if (next == &sgv_active_pools_list) {
298                         if (circle && (prev_nr == nr)) {
299                                 TRACE_MEM("Full circle done, but no progress, "
300                                         "leaving (nr %d)", nr);
301                                 goto out_unlock_put;
302                         }
303                         circle = true;
304                         prev_nr = nr;
305
306                         next = next->next;
307                 }
308
309                 sgv_cur_purge_pool = list_entry(next, typeof(*pool),
310                         sgv_active_pools_list_entry);
311                 TRACE_MEM("New cur purge pool %p", sgv_cur_purge_pool);
312
313                 spin_unlock_bh(&sgv_pools_lock);
314
315                 nr = sgv_shrink_pool(pool, nr, min_interval, cur_time);
316
317                 sgv_pool_put(pool);
318         }
319
320 out:
321         TRACE_EXIT_RES(nr);
322         return nr;
323
324 out_unlock:
325         spin_unlock_bh(&sgv_pools_lock);
326         goto out;
327
328 out_unlock_put:
329         spin_unlock_bh(&sgv_pools_lock);
330         sgv_pool_put(pool);
331         goto out;
332 }
333
334 static int sgv_shrink(int nr, gfp_t gfpm)
335 {
336         TRACE_ENTRY();
337
338         if (nr > 0) {
339                 nr = __sgv_shrink(nr, SGV_MIN_SHRINK_INTERVAL);
340                 TRACE_MEM("Left %d", nr);
341         } else {
342                 struct sgv_pool *pool;
343                 int inactive_pages = 0;
344
345                 spin_lock_bh(&sgv_pools_lock);
346                 list_for_each_entry(pool, &sgv_active_pools_list,
347                                 sgv_active_pools_list_entry) {
348                         if (pool->purge_interval > 0)
349                                 inactive_pages += pool->inactive_cached_pages;
350                 }
351                 spin_unlock_bh(&sgv_pools_lock);
352
353                 nr = max((int)0, inactive_pages - sgv_lo_wmk);
354                 TRACE_MEM("Can free %d (total %d)", nr,
355                         atomic_read(&sgv_pages_total));
356         }
357
358         TRACE_EXIT_RES(nr);
359         return nr;
360 }
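
/*
 * Note on the calling convention of sgv_shrink() (as registered below
 * with set_shrinker()/register_shrinker()): in the 2.6-era shrinker
 * interface the VM calls the callback with nr == 0 to ask how much is
 * freeable and with nr > 0 to actually reclaim.  Roughly (sketch only,
 * "batch" is a placeholder and the real batching policy lives in the
 * VM):
 *
 *	freeable = sgv_shrink(0, gfp_mask);	(query: how much can go?)
 *	left = sgv_shrink(batch, gfp_mask);	(reclaim up to batch pages)
 *
 * Here nr and the return value are measured in pages, and only
 * inactive cached pages above sgv_lo_wmk in shrinkable pools are
 * reported.
 */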
361
362 #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 20)
363 static void sgv_purge_work_fn(void *p)
364 #else
365 static void sgv_purge_work_fn(struct delayed_work *work)
366 #endif
367 {
368         unsigned long cur_time = jiffies;
369 #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 20)
370         struct sgv_pool *pool = (struct sgv_pool *)p;
371 #else
372         struct sgv_pool *pool = container_of(work, struct sgv_pool,
373                                         sgv_purge_work);
374 #endif
375
376         TRACE_ENTRY();
377
378         TRACE_MEM("Purge work for pool %p", pool);
379
380         spin_lock_bh(&pool->sgv_pool_lock);
381
382         pool->purge_work_scheduled = false;
383
384         while (!list_empty(&pool->sorted_recycling_list)) {
385                 struct sgv_pool_obj *obj = list_entry(
386                         pool->sorted_recycling_list.next,
387                         struct sgv_pool_obj, sorted_recycling_list_entry);
388
389                 if (sgv_purge_from_cache(obj, pool->purge_interval, cur_time)) {
390                         spin_unlock_bh(&pool->sgv_pool_lock);
391                         sgv_dtor_and_free(obj);
392                         spin_lock_bh(&pool->sgv_pool_lock);
393                 } else {
394                         /*
395                          * Reschedule it for the full period so that we don't get
396                          * here too often. In the worst case the shrinker will
397                          * reclaim buffers more quickly.
398                          */
399                         TRACE_MEM("Rescheduling purge work for pool %p (delay "
400                                 "%d HZ/%d sec)", pool, pool->purge_interval,
401                                 pool->purge_interval/HZ);
402                         schedule_delayed_work(&pool->sgv_purge_work,
403                                 pool->purge_interval);
404                         pool->purge_work_scheduled = true;
405                         break;
406                 }
407         }
408
409         spin_unlock_bh(&pool->sgv_pool_lock);
410
411         TRACE_MEM("Leaving purge work for pool %p", pool);
412
413         TRACE_EXIT();
414         return;
415 }
416
417 static int sgv_check_full_clustering(struct scatterlist *sg, int cur, int hint)
418 {
419         int res = -1;
420         int i = hint;
421         unsigned long pfn_cur = page_to_pfn(sg_page(&sg[cur]));
422         int len_cur = sg[cur].length;
423         unsigned long pfn_cur_next = pfn_cur + (len_cur >> PAGE_SHIFT);
424         int full_page_cur = (len_cur & (PAGE_SIZE - 1)) == 0;
425         unsigned long pfn, pfn_next;
426         bool full_page;
427
428 #if 0
429         TRACE_MEM("pfn_cur %ld, pfn_cur_next %ld, len_cur %d, full_page_cur %d",
430                 pfn_cur, pfn_cur_next, len_cur, full_page_cur);
431 #endif
432
433         /* check the hint first */
434         if (i >= 0) {
435                 pfn = page_to_pfn(sg_page(&sg[i]));
436                 pfn_next = pfn + (sg[i].length >> PAGE_SHIFT);
437                 full_page = (sg[i].length & (PAGE_SIZE - 1)) == 0;
438
439                 if ((pfn == pfn_cur_next) && full_page_cur)
440                         goto out_head;
441
442                 if ((pfn_next == pfn_cur) && full_page)
443                         goto out_tail;
444         }
445
446         /* ToDo: implement more intelligent search */
447         for (i = cur - 1; i >= 0; i--) {
448                 pfn = page_to_pfn(sg_page(&sg[i]));
449                 pfn_next = pfn + (sg[i].length >> PAGE_SHIFT);
450                 full_page = (sg[i].length & (PAGE_SIZE - 1)) == 0;
451
452                 if ((pfn == pfn_cur_next) && full_page_cur)
453                         goto out_head;
454
455                 if ((pfn_next == pfn_cur) && full_page)
456                         goto out_tail;
457         }
458
459 out:
460         return res;
461
462 out_tail:
463         TRACE_MEM("SG segment %d will be tail merged with segment %d", cur, i);
464         sg[i].length += len_cur;
465         sg_clear(&sg[cur]);
466         res = i;
467         goto out;
468
469 out_head:
470         TRACE_MEM("SG segment %d will be head merged with segment %d", cur, i);
471         sg_assign_page(&sg[i], sg_page(&sg[cur]));
472         sg[i].length += len_cur;
473         sg_clear(&sg[cur]);
474         res = i;
475         goto out;
476 }
477
478 static int sgv_check_tail_clustering(struct scatterlist *sg, int cur, int hint)
479 {
480         int res = -1;
481         unsigned long pfn_cur = page_to_pfn(sg_page(&sg[cur]));
482         int len_cur = sg[cur].length;
483         int prev;
484         unsigned long pfn_prev;
485         bool full_page;
486
487 #ifdef SCST_HIGHMEM
488         if (sg_page(&sg[cur]) >= highmem_start_page) {
489                 TRACE_MEM("%s", "HIGHMEM page allocated, no clustering");
490                 goto out;
491         }
492 #endif
493
494 #if 0
495         TRACE_MEM("pfn_cur %ld, pfn_cur_next %ld, len_cur %d, full_page_cur %d",
496                 pfn_cur, pfn_cur_next, len_cur, full_page_cur);
497 #endif
498
499         if (cur == 0)
500                 goto out;
501
502         prev = cur - 1;
503         pfn_prev = page_to_pfn(sg_page(&sg[prev])) +
504                         (sg[prev].length >> PAGE_SHIFT);
505         full_page = (sg[prev].length & (PAGE_SIZE - 1)) == 0;
506
507         if ((pfn_prev == pfn_cur) && full_page) {
508                 TRACE_MEM("SG segment %d will be tail merged with segment %d",
509                         cur, prev);
510                 sg[prev].length += len_cur;
511                 sg_clear(&sg[cur]);
512                 res = prev;
513         }
514
515 out:
516         return res;
517 }
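
/*
 * Worked example of tail clustering (pfn values illustrative): suppose
 * sg[0] already covers the full page at pfn 100 (length == PAGE_SIZE)
 * and the page just placed in sg[1] happens to be pfn 101.  Then
 * pfn_prev == 100 + 1 == pfn_cur and full_page is true, so sg[1] is
 * merged into sg[0], whose length becomes 2 * PAGE_SIZE, and the
 * returned index tells the caller not to advance sg_count.  Full
 * clustering (sgv_check_full_clustering() above) additionally allows
 * the new page to be merged in front of an earlier segment when it
 * immediately precedes that segment's first page.
 */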
518
519 static void sgv_free_sys_sg_entries(struct scatterlist *sg, int sg_count,
520         void *priv)
521 {
522         int i;
523
524         TRACE_MEM("sg=%p, sg_count=%d", sg, sg_count);
525
526         for (i = 0; i < sg_count; i++) {
527                 struct page *p = sg_page(&sg[i]);
528                 int len = sg[i].length;
529                 int pages =
530                         (len >> PAGE_SHIFT) + ((len & ~PAGE_MASK) != 0);
531
532                 TRACE_MEM("page %lx, len %d, pages %d",
533                         (unsigned long)p, len, pages);
534
535                 while (pages > 0) {
536                         int order = 0;
537
538 /*
539  * __free_pages() doesn't like freeing pages with an order different from
540  * the one they were allocated with, so this small optimization is disabled.
541  */
542 #if 0
543                         if (len > 0) {
544                                 while (((1 << order) << PAGE_SHIFT) < len)
545                                         order++;
546                                 len = 0;
547                         }
548 #endif
549                         TRACE_MEM("free_pages(): order %d, page %lx",
550                                 order, (unsigned long)p);
551
552                         __free_pages(p, order);
553
554                         pages -= 1 << order;
555                         p += 1 << order;
556                 }
557         }
558 }
559
560 static struct page *sgv_alloc_sys_pages(struct scatterlist *sg,
561         gfp_t gfp_mask, void *priv)
562 {
563         struct page *page = alloc_pages(gfp_mask, 0);
564
565         sg_set_page(sg, page, PAGE_SIZE, 0);
566         TRACE_MEM("page=%p, sg=%p, priv=%p", page, sg, priv);
567         if (page == NULL) {
568                 TRACE(TRACE_OUT_OF_MEM, "%s", "Allocation of "
569                         "sg page failed");
570         }
571         return page;
572 }
573
574 static int sgv_alloc_sg_entries(struct scatterlist *sg, int pages,
575         gfp_t gfp_mask, enum sgv_clustering_types clustering_type,
576         struct trans_tbl_ent *trans_tbl,
577         const struct sgv_pool_alloc_fns *alloc_fns, void *priv)
578 {
579         int sg_count = 0;
580         int pg, i, j;
581         int merged = -1;
582
583         TRACE_MEM("pages=%d, clustering_type=%d", pages, clustering_type);
584
585 #if 0
586         gfp_mask |= __GFP_COLD;
587 #endif
588 #ifdef CONFIG_SCST_STRICT_SECURITY
589         gfp_mask |= __GFP_ZERO;
590 #endif
591
592         for (pg = 0; pg < pages; pg++) {
593                 void *rc;
594 #ifdef CONFIG_SCST_DEBUG_OOM
595                 if (((gfp_mask & __GFP_NOFAIL) != __GFP_NOFAIL) &&
596                     ((scst_random() % 10000) == 55))
597                         rc = NULL;
598                 else
599 #endif
600                         rc = alloc_fns->alloc_pages_fn(&sg[sg_count], gfp_mask,
601                                 priv);
602                 if (rc == NULL)
603                         goto out_no_mem;
604
605                 /*
606                  * This code allows the compiler to see the full bodies of the
607                  * clustering functions and gives it a chance to generate better
608                  * code. At least, the resulting code is smaller compared to
609                  * calling them through a function pointer.
610                  */
611                 if (clustering_type == sgv_full_clustering)
612                         merged = sgv_check_full_clustering(sg, sg_count, merged);
613                 else if (clustering_type == sgv_tail_clustering)
614                         merged = sgv_check_tail_clustering(sg, sg_count, merged);
615                 else
616                         merged = -1;
617
618                 if (merged == -1)
619                         sg_count++;
620
621                 TRACE_MEM("pg=%d, merged=%d, sg_count=%d", pg, merged,
622                         sg_count);
623         }
624
625         if ((clustering_type != sgv_no_clustering) && (trans_tbl != NULL)) {
626                 pg = 0;
627                 for (i = 0; i < pages; i++) {
628                         int n = (sg[i].length >> PAGE_SHIFT) +
629                                 ((sg[i].length & ~PAGE_MASK) != 0);
630                         trans_tbl[i].pg_count = pg;
631                         for (j = 0; j < n; j++)
632                                 trans_tbl[pg++].sg_num = i+1;
633                         TRACE_MEM("i=%d, n=%d, pg_count=%d", i, n,
634                                 trans_tbl[i].pg_count);
635                 }
636         }
637
638 out:
639         TRACE_MEM("sg_count=%d", sg_count);
640         return sg_count;
641
642 out_no_mem:
643         alloc_fns->free_pages_fn(sg, sg_count, priv);
644         sg_count = 0;
645         goto out;
646 }
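
/*
 * Worked example of the trans_tbl filled above (illustrative): say 4
 * pages were requested and clustering produced 2 SG entries, sg[0]
 * covering 3 pages and sg[1] covering 1 page.  The loop then yields
 *
 *	trans_tbl[0].pg_count = 0;	trans_tbl[1].pg_count = 3;
 *	trans_tbl[0..2].sg_num = 1;	trans_tbl[3].sg_num = 2;
 *
 * i.e. pg_count, indexed by SG entry, is the first page that entry
 * covers, while sg_num, indexed by page, is the number of SG entries
 * needed to cover pages 0..i.  sgv_pool_alloc() relies on this to trim
 * a cached, possibly larger vector down to an arbitrary page count
 * without walking it.
 */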
647
648 static int sgv_alloc_arrays(struct sgv_pool_obj *obj,
649         int pages_to_alloc, gfp_t gfp_mask)
650 {
651         int sz, tsz = 0;
652         int res = 0;
653
654         TRACE_ENTRY();
655
656         sz = pages_to_alloc * sizeof(obj->sg_entries[0]);
657
658         obj->sg_entries = kmalloc(sz, gfp_mask);
659         if (unlikely(obj->sg_entries == NULL)) {
660                 TRACE(TRACE_OUT_OF_MEM, "Allocation of sgv_pool_obj "
661                         "SG vector failed (size %d)", sz);
662                 res = -ENOMEM;
663                 goto out;
664         }
665
666         sg_init_table(obj->sg_entries, pages_to_alloc);
667
668         if (sgv_pool_clustered(obj->owner_pool)) {
669                 if (pages_to_alloc <= sgv_max_trans_pages) {
670                         obj->trans_tbl =
671                                 (struct trans_tbl_ent *)obj->sg_entries_data;
672                         /*
673                          * No need to clear trans_tbl, if needed, it will be
674                          * fully rewritten in sgv_alloc_sg_entries()
675                          */
676                 } else {
677                         tsz = pages_to_alloc * sizeof(obj->trans_tbl[0]);
678                         obj->trans_tbl = kzalloc(tsz, gfp_mask);
679                         if (unlikely(obj->trans_tbl == NULL)) {
680                                 TRACE(TRACE_OUT_OF_MEM, "Allocation of "
681                                         "trans_tbl failed (size %d)", tsz);
682                                 res = -ENOMEM;
683                                 goto out_free;
684                         }
685                 }
686         }
687
688         TRACE_MEM("pages_to_alloc %d, sz %d, tsz %d, obj %p, sg_entries %p, "
689                 "trans_tbl %p", pages_to_alloc, sz, tsz, obj, obj->sg_entries,
690                 obj->trans_tbl);
691
692 out:
693         TRACE_EXIT_RES(res);
694         return res;
695
696 out_free:
697         kfree(obj->sg_entries);
698         obj->sg_entries = NULL;
699         goto out;
700 }
701
702 static struct sgv_pool_obj *sgv_get_obj(struct sgv_pool *pool, int cache_num,
703         int pages, gfp_t gfp_mask)
704 {
705         struct sgv_pool_obj *obj;
706
707         spin_lock_bh(&pool->sgv_pool_lock);
708         if (likely(!list_empty(&pool->recycling_lists[cache_num]))) {
709                 obj = list_entry(pool->recycling_lists[cache_num].next,
710                          struct sgv_pool_obj, recycling_list_entry);
711
712                 list_del(&obj->sorted_recycling_list_entry);
713                 list_del(&obj->recycling_list_entry);
714
715                 pool->inactive_cached_pages -= pages;
716
717                 spin_unlock_bh(&pool->sgv_pool_lock);
718                 goto out;
719         }
720
721         if (pool->cached_entries == 0) {
722                 TRACE_MEM("Adding pool %p to the active list", pool);
723                 spin_lock_bh(&sgv_pools_lock);
724                 list_add_tail(&pool->sgv_active_pools_list_entry,
725                         &sgv_active_pools_list);
726                 spin_unlock_bh(&sgv_pools_lock);
727         }
728
729         pool->cached_entries++;
730         pool->cached_pages += pages;
731
732         spin_unlock_bh(&pool->sgv_pool_lock);
733
734         TRACE_MEM("New cached entries %d (pool %p)", pool->cached_entries,
735                 pool);
736
737         obj = kmem_cache_alloc(pool->caches[cache_num],
738                 gfp_mask & ~(__GFP_HIGHMEM|GFP_DMA));
739         if (likely(obj)) {
740                 memset(obj, 0, sizeof(*obj));
741                 obj->cache_num = cache_num;
742                 obj->pages = pages;
743                 obj->owner_pool = pool;
744         } else {
745                 spin_lock_bh(&pool->sgv_pool_lock);
746                 sgv_dec_cached_entries(pool, pages);
747                 spin_unlock_bh(&pool->sgv_pool_lock);
748         }
749
750 out:
751         return obj;
752 }
753
754 static void sgv_put_obj(struct sgv_pool_obj *obj)
755 {
756         struct sgv_pool *pool = obj->owner_pool;
757         struct list_head *entry;
758         struct list_head *list = &pool->recycling_lists[obj->cache_num];
759         int pages = obj->pages;
760
761         spin_lock_bh(&pool->sgv_pool_lock);
762
763         TRACE_MEM("sgv %p, cache num %d, pages %d, sg_count %d", obj,
764                 obj->cache_num, pages, obj->sg_count);
765
766         if (sgv_pool_clustered(pool)) {
767                 /* Prefer objects with fewer SG entries */
768                 __list_for_each(entry, list) {
769                         struct sgv_pool_obj *tmp = list_entry(entry,
770                                 struct sgv_pool_obj, recycling_list_entry);
771
772                         TRACE_MEM("tmp %p, cache num %d, pages %d, sg_count %d",
773                                 tmp, tmp->cache_num, tmp->pages, tmp->sg_count);
774
775                         if (obj->sg_count <= tmp->sg_count)
776                                 break;
777                 }
778                 entry = entry->prev;
779         } else
780                 entry = list;
781
782         TRACE_MEM("Adding in %p (list %p)", entry, list);
783         list_add(&obj->recycling_list_entry, entry);
784
785         list_add_tail(&obj->sorted_recycling_list_entry,
786                 &pool->sorted_recycling_list);
787
788         obj->time_stamp = jiffies;
789
790         pool->inactive_cached_pages += pages;
791
792         if (!pool->purge_work_scheduled) {
793                 TRACE_MEM("Scheduling purge work for pool %p", pool);
794                 pool->purge_work_scheduled = true;
795                 schedule_delayed_work(&pool->sgv_purge_work,
796                         pool->purge_interval);
797         }
798
799         spin_unlock_bh(&pool->sgv_pool_lock);
800         return;
801 }
802
803 /* No locks */
804 static int sgv_hiwmk_check(int pages_to_alloc)
805 {
806         int res = 0;
807         int pages = pages_to_alloc;
808
809         pages += atomic_read(&sgv_pages_total);
810
811         if (unlikely(pages > sgv_hi_wmk)) {
812                 pages -= sgv_hi_wmk;
813                 atomic_inc(&sgv_releases_on_hiwmk);
814
815                 pages = __sgv_shrink(pages, 0);
816                 if (pages > 0) {
817                         TRACE(TRACE_OUT_OF_MEM, "Requested amount of "
818                             "memory (%d pages) for commands being "
819                             "executed, together with the already "
820                             "allocated memory, exceeds the allowed "
821                             "maximum %d. Consider increasing "
822                             "scst_max_cmd_mem", pages_to_alloc,
823                             sgv_hi_wmk);
824                         atomic_inc(&sgv_releases_on_hiwmk_failed);
825                         res = -ENOMEM;
826                         goto out_unlock;
827                 }
828         }
829
830         atomic_add(pages_to_alloc, &sgv_pages_total);
831
832 out_unlock:
833         TRACE_MEM("pages_to_alloc %d, new total %d", pages_to_alloc,
834                 atomic_read(&sgv_pages_total));
835
836         return res;
837 }
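
/*
 * Worked example of the watermark check above (numbers illustrative):
 * with sgv_hi_wmk == 1000 and sgv_pages_total currently 950, a request
 * for 100 pages gives pages == 1050 > 1000, so __sgv_shrink(50, 0) is
 * asked to reclaim the 50-page excess right away.  If the full excess
 * cannot be reclaimed the allocation fails with -ENOMEM and nothing is
 * accounted; otherwise the 100 pages are added to sgv_pages_total.
 */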
838
839 /* No locks */
840 static void sgv_hiwmk_uncheck(int pages)
841 {
842         atomic_sub(pages, &sgv_pages_total);
843         TRACE_MEM("pages %d, new total %d", pages,
844                 atomic_read(&sgv_pages_total));
845         return;
846 }
847
848 /* No locks */
849 static bool sgv_check_allowed_mem(struct scst_mem_lim *mem_lim, int pages)
850 {
851         int alloced;
852         bool res = true;
853
854         alloced = atomic_add_return(pages, &mem_lim->alloced_pages);
855         if (unlikely(alloced > mem_lim->max_allowed_pages)) {
856                 TRACE(TRACE_OUT_OF_MEM, "Requested amount of memory "
857                         "(%d pages) for commands being executed on a device, "
858                         "together with the already allocated memory, exceeds "
859                         "the allowed maximum %d. Consider increasing "
860                         "scst_max_dev_cmd_mem", pages,
861                         mem_lim->max_allowed_pages);
862                 atomic_sub(pages, &mem_lim->alloced_pages);
863                 res = false;
864         }
865
866         TRACE_MEM("mem_lim %p, pages %d, res %d, new alloced %d", mem_lim,
867                 pages, res, atomic_read(&mem_lim->alloced_pages));
868
869         return res;
870 }
871
872 /* No locks */
873 static void sgv_uncheck_allowed_mem(struct scst_mem_lim *mem_lim, int pages)
874 {
875         atomic_sub(pages, &mem_lim->alloced_pages);
876
877         TRACE_MEM("mem_lim %p, pages %d, new alloced %d", mem_lim,
878                 pages, atomic_read(&mem_lim->alloced_pages));
879         return;
880 }
881
882 struct scatterlist *sgv_pool_alloc(struct sgv_pool *pool, unsigned int size,
883         gfp_t gfp_mask, int flags, int *count,
884         struct sgv_pool_obj **sgv, struct scst_mem_lim *mem_lim, void *priv)
885 {
886         struct sgv_pool_obj *obj;
887         int cache_num, pages, cnt;
888         struct scatterlist *res = NULL;
889         int pages_to_alloc;
890         int no_cached = flags & SGV_POOL_ALLOC_NO_CACHED;
891         bool allowed_mem_checked = false, hiwmk_checked = false;
892
893         TRACE_ENTRY();
894
895         if (unlikely(size == 0))
896                 goto out;
897
898         sBUG_ON((gfp_mask & __GFP_NOFAIL) == __GFP_NOFAIL);
899
900         pages = ((size + PAGE_SIZE - 1) >> PAGE_SHIFT);
901         if (pool->single_alloc_pages == 0) {
902                 int pages_order = get_order(size);
903                 cache_num = pages_order;
904                 pages_to_alloc = (1 << pages_order);
905         } else {
906                 cache_num = 0;
907                 pages_to_alloc = max(pool->single_alloc_pages, pages);
908         }
909
910         TRACE_MEM("size=%d, pages=%d, pages_to_alloc=%d, cache num=%d, "
911                 "flags=%x, no_cached=%d, *sgv=%p", size, pages,
912                 pages_to_alloc, cache_num, flags, no_cached, *sgv);
913
914         if (*sgv != NULL) {
915                 obj = *sgv;
916
917                 TRACE_MEM("Supplied obj %p, cache num %d", obj, obj->cache_num);
918
919                 EXTRACHECKS_BUG_ON(obj->sg_count != 0);
920
921                 if (unlikely(!sgv_check_allowed_mem(mem_lim, pages_to_alloc)))
922                         goto out_fail_free_sg_entries;
923                 allowed_mem_checked = true;
924
925                 if (unlikely(sgv_hiwmk_check(pages_to_alloc) != 0))
926                         goto out_fail_free_sg_entries;
927                 hiwmk_checked = true;
928         } else if ((pages_to_alloc <= pool->max_cached_pages) && !no_cached) {
929                 if (unlikely(!sgv_check_allowed_mem(mem_lim, pages_to_alloc)))
930                         goto out_fail;
931                 allowed_mem_checked = true;
932
933                 obj = sgv_get_obj(pool, cache_num, pages_to_alloc, gfp_mask);
934                 if (unlikely(obj == NULL)) {
935                         TRACE(TRACE_OUT_OF_MEM, "Allocation of "
936                                 "sgv_pool_obj failed (size %d)", size);
937                         goto out_fail;
938                 }
939
940                 if (obj->sg_count != 0) {
941                         TRACE_MEM("Cached obj %p", obj);
942                         atomic_inc(&pool->cache_acc[cache_num].hit_alloc);
943                         goto success;
944                 }
945
946                 if (flags & SGV_POOL_NO_ALLOC_ON_CACHE_MISS) {
947                         if (!(flags & SGV_POOL_RETURN_OBJ_ON_ALLOC_FAIL))
948                                 goto out_fail_free;
949                 }
950
951                 TRACE_MEM("Brand new obj %p", obj);
952
953                 if (pages_to_alloc <= sgv_max_local_pages) {
954                         obj->sg_entries = obj->sg_entries_data;
955                         sg_init_table(obj->sg_entries, pages_to_alloc);
956                         TRACE_MEM("sg_entries %p", obj->sg_entries);
957                         if (sgv_pool_clustered(pool)) {
958                                 obj->trans_tbl = (struct trans_tbl_ent *)
959                                         (obj->sg_entries + pages_to_alloc);
960                                 TRACE_MEM("trans_tbl %p", obj->trans_tbl);
961                                 /*
962                                  * No need to clear trans_tbl, if needed, it
963                                  * will be fully rewritten in
964                                  * sgv_alloc_sg_entries().
965                                  */
966                         }
967                 } else {
968                         if (unlikely(sgv_alloc_arrays(obj, pages_to_alloc,
969                                         gfp_mask) != 0))
970                                 goto out_fail_free;
971                 }
972
973                 if ((flags & SGV_POOL_NO_ALLOC_ON_CACHE_MISS) &&
974                     (flags & SGV_POOL_RETURN_OBJ_ON_ALLOC_FAIL))
975                         goto out_return;
976
977                 obj->allocator_priv = priv;
978
979                 if (unlikely(sgv_hiwmk_check(pages_to_alloc) != 0))
980                         goto out_fail_free_sg_entries;
981                 hiwmk_checked = true;
982         } else {
983                 int sz;
984
985                 pages_to_alloc = pages;
986
987                 if (unlikely(!sgv_check_allowed_mem(mem_lim, pages_to_alloc)))
988                         goto out_fail;
989                 allowed_mem_checked = true;
990
991                 if (flags & SGV_POOL_NO_ALLOC_ON_CACHE_MISS)
992                         goto out_return2;
993
994                 sz = sizeof(*obj) + pages * sizeof(obj->sg_entries[0]);
995
996                 obj = kmalloc(sz, gfp_mask);
997                 if (unlikely(obj == NULL)) {
998                         TRACE(TRACE_OUT_OF_MEM, "Allocation of "
999                                 "sgv_pool_obj failed (size %d)", size);
1000                         goto out_fail;
1001                 }
1002                 memset(obj, 0, sizeof(*obj));
1003
1004                 obj->owner_pool = pool;
1005                 cache_num = -1;
1006                 obj->cache_num = cache_num;
1007                 obj->pages = pages_to_alloc;
1008                 obj->allocator_priv = priv;
1009
1010                 obj->sg_entries = obj->sg_entries_data;
1011                 sg_init_table(obj->sg_entries, pages);
1012
1013                 if (unlikely(sgv_hiwmk_check(pages_to_alloc) != 0))
1014                         goto out_fail_free_sg_entries;
1015                 hiwmk_checked = true;
1016
1017                 TRACE_MEM("Big or no_cached obj %p (size %d)", obj, sz);
1018         }
1019
1020         obj->sg_count = sgv_alloc_sg_entries(obj->sg_entries,
1021                 pages_to_alloc, gfp_mask, pool->clustering_type,
1022                 obj->trans_tbl, &pool->alloc_fns, priv);
1023         if (unlikely(obj->sg_count <= 0)) {
1024                 obj->sg_count = 0;
1025                 if ((flags & SGV_POOL_RETURN_OBJ_ON_ALLOC_FAIL) &&
1026                     (cache_num >= 0))
1027                         goto out_return1;
1028                 else
1029                         goto out_fail_free_sg_entries;
1030         }
1031
1032         if (cache_num >= 0) {
1033                 atomic_add(pages_to_alloc - obj->sg_count,
1034                         &pool->cache_acc[cache_num].merged);
1035         } else {
1036                 if (no_cached) {
1037                         atomic_add(pages_to_alloc,
1038                                 &pool->other_pages);
1039                         atomic_add(pages_to_alloc - obj->sg_count,
1040                                 &pool->other_merged);
1041                 } else {
1042                         atomic_add(pages_to_alloc,
1043                                 &pool->big_pages);
1044                         atomic_add(pages_to_alloc - obj->sg_count,
1045                                 &pool->big_merged);
1046                 }
1047         }
1048
1049 success:
1050         if (cache_num >= 0) {
1051                 int sg;
1052                 atomic_inc(&pool->cache_acc[cache_num].total_alloc);
1053                 if (sgv_pool_clustered(pool))
1054                         cnt = obj->trans_tbl[pages-1].sg_num;
1055                 else
1056                         cnt = pages;
1057                 sg = cnt-1;
1058                 obj->orig_sg = sg;
1059                 obj->orig_length = obj->sg_entries[sg].length;
1060                 if (sgv_pool_clustered(pool)) {
1061                         obj->sg_entries[sg].length =
1062                                 (pages - obj->trans_tbl[sg].pg_count) << PAGE_SHIFT;
1063                 }
1064         } else {
1065                 cnt = obj->sg_count;
1066                 if (no_cached)
1067                         atomic_inc(&pool->other_alloc);
1068                 else
1069                         atomic_inc(&pool->big_alloc);
1070         }
1071
1072         *count = cnt;
1073         res = obj->sg_entries;
1074         *sgv = obj;
1075
1076         if (size & ~PAGE_MASK)
1077                 obj->sg_entries[cnt-1].length -=
1078                         PAGE_SIZE - (size & ~PAGE_MASK);
1079
1080         TRACE_MEM("obj=%p, sg_entries %p (size=%d, pages=%d, sg_count=%d, "
1081                 "count=%d, last_len=%d)", obj, obj->sg_entries, size, pages,
1082                 obj->sg_count, *count, obj->sg_entries[obj->orig_sg].length);
1083
1084 out:
1085         TRACE_EXIT_HRES(res);
1086         return res;
1087
1088 out_return:
1089         obj->allocator_priv = priv;
1090         obj->owner_pool = pool;
1091
1092 out_return1:
1093         *sgv = obj;
1094         TRACE_MEM("Returning failed obj %p (count %d)", obj, *count);
1095
1096 out_return2:
1097         *count = pages_to_alloc;
1098         res = NULL;
1099         goto out_uncheck;
1100
1101 out_fail_free_sg_entries:
1102         if (obj->sg_entries != obj->sg_entries_data) {
1103                 if (obj->trans_tbl !=
1104                         (struct trans_tbl_ent *)obj->sg_entries_data) {
1105                         /* kfree() handles NULL parameter */
1106                         kfree(obj->trans_tbl);
1107                         obj->trans_tbl = NULL;
1108                 }
1109                 kfree(obj->sg_entries);
1110                 obj->sg_entries = NULL;
1111         }
1112
1113 out_fail_free:
1114         if (cache_num >= 0) {
1115                 spin_lock_bh(&pool->sgv_pool_lock);
1116                 sgv_dec_cached_entries(pool, pages_to_alloc);
1117                 spin_unlock_bh(&pool->sgv_pool_lock);
1118
1119                 kmem_cache_free(pool->caches[obj->cache_num], obj);
1120         } else
1121                 kfree(obj);
1122
1123 out_fail:
1124         res = NULL;
1125         *count = 0;
1126         *sgv = NULL;
1127         TRACE_MEM("%s", "Allocation failed");
1128
1129 out_uncheck:
1130         if (hiwmk_checked)
1131                 sgv_hiwmk_uncheck(pages_to_alloc);
1132         if (allowed_mem_checked)
1133                 sgv_uncheck_allowed_mem(mem_lim, pages_to_alloc);
1134         goto out;
1135 }
1136 EXPORT_SYMBOL(sgv_pool_alloc);
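
/*
 * Usage sketch for sgv_pool_alloc()/sgv_pool_free() (illustrative only;
 * "pool", "size" and "mem_lim" stand for whatever the caller already
 * has, typically tgt_dev->pool, the command buffer length and the
 * device's struct scst_mem_lim):
 *
 *	struct sgv_pool_obj *sgv = NULL;
 *	struct scatterlist *sg;
 *	int count;
 *
 *	sg = sgv_pool_alloc(pool, size, GFP_KERNEL | __GFP_NOWARN, 0,
 *		&count, &sgv, mem_lim, NULL);
 *	if (sg == NULL)
 *		return -ENOMEM;
 *
 *	(use sg[0 .. count-1], then release both the vector and the
 *	 accounted memory)
 *
 *	sgv_pool_free(sgv, mem_lim);
 *
 * *sgv must be NULL on entry unless a previously returned, still
 * SG-less object is being retried, and the same mem_lim must be passed
 * to the matching sgv_pool_free() call.
 */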
1137
1138 void *sgv_get_priv(struct sgv_pool_obj *obj)
1139 {
1140         return obj->allocator_priv;
1141 }
1142 EXPORT_SYMBOL(sgv_get_priv);
1143
1144 void sgv_pool_free(struct sgv_pool_obj *obj, struct scst_mem_lim *mem_lim)
1145 {
1146         int pages = (obj->sg_count != 0) ? obj->pages : 0;
1147
1148         TRACE_MEM("Freeing obj %p, cache num %d, pages %d, sg_entries %p, "
1149                 "sg_count %d, allocator_priv %p", obj, obj->cache_num, pages,
1150                 obj->sg_entries, obj->sg_count, obj->allocator_priv);
1151         if (obj->cache_num >= 0) {
1152                 obj->sg_entries[obj->orig_sg].length = obj->orig_length;
1153                 sgv_put_obj(obj);
1154         } else {
1155                 obj->owner_pool->alloc_fns.free_pages_fn(obj->sg_entries,
1156                         obj->sg_count, obj->allocator_priv);
1157                 kfree(obj);
1158                 sgv_hiwmk_uncheck(pages);
1159         }
1160
1161         sgv_uncheck_allowed_mem(mem_lim, pages);
1162         return;
1163 }
1164 EXPORT_SYMBOL(sgv_pool_free);
1165
1166 struct scatterlist *scst_alloc(int size, gfp_t gfp_mask, int *count)
1167 {
1168         struct scatterlist *res;
1169         int pages = (size >> PAGE_SHIFT) + ((size & ~PAGE_MASK) != 0);
1170         struct sgv_pool_alloc_fns sys_alloc_fns = {
1171                 sgv_alloc_sys_pages, sgv_free_sys_sg_entries };
1172         int no_fail = ((gfp_mask & __GFP_NOFAIL) == __GFP_NOFAIL);
1173
1174         TRACE_ENTRY();
1175
1176         atomic_inc(&sgv_other_total_alloc);
1177
1178         if (unlikely(sgv_hiwmk_check(pages) != 0)) {
1179                 if (!no_fail) {
1180                         res = NULL;
1181                         goto out;
1182                 } else {
1183                         /*
1184                          * Update sgv_pages_total anyway, since with __GFP_NOFAIL
1185                          * the allocation can't fail; if it weren't updated, the
1186                          * counter would go below zero on the subsequent free.
1187                          */
1188                         sgv_hiwmk_uncheck(-pages);
1189                 }
1190         }
1191
1192         res = kmalloc(pages*sizeof(*res), gfp_mask);
1193         if (res == NULL) {
1194                 TRACE(TRACE_OUT_OF_MEM, "Unable to allocate sg for %d pages",
1195                         pages);
1196                 goto out_uncheck;
1197         }
1198
1199         sg_init_table(res, pages);
1200
1201         /*
1202          * If we allowed clustering here, scst_free() would have trouble
1203          * figuring out how many pages are in the SG vector, so never use
1204          * clustering here.
1205          */
1206         *count = sgv_alloc_sg_entries(res, pages, gfp_mask, sgv_no_clustering,
1207                         NULL, &sys_alloc_fns, NULL);
1208         if (*count <= 0)
1209                 goto out_free;
1210
1211 out:
1212         TRACE_MEM("Alloced sg %p (count %d) \"no fail\" %d", res, *count, no_fail);
1213
1214         TRACE_EXIT_HRES(res);
1215         return res;
1216
1217 out_free:
1218         kfree(res);
1219         res = NULL;
1220
1221 out_uncheck:
1222         if (!no_fail)
1223                 sgv_hiwmk_uncheck(pages);
1224         goto out;
1225 }
1226 EXPORT_SYMBOL(scst_alloc);
1227
1228 void scst_free(struct scatterlist *sg, int count)
1229 {
1230         TRACE_MEM("Freeing sg=%p", sg);
1231
1232         sgv_hiwmk_uncheck(count);
1233
1234         sgv_free_sys_sg_entries(sg, count, NULL);
1235         kfree(sg);
1236         return;
1237 }
1238 EXPORT_SYMBOL(scst_free);
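
/*
 * Usage sketch for the scst_alloc()/scst_free() pair above
 * (illustrative): these bypass the per-pool caches and only honour the
 * global high watermark, and because no clustering is used the count
 * returned by scst_alloc() is both the number of SG entries and the
 * number of accounted pages, so it must be handed back to scst_free()
 * unchanged:
 *
 *	int cnt;
 *	struct scatterlist *sg = scst_alloc(len, GFP_KERNEL, &cnt);
 *
 *	if (sg == NULL)
 *		return -ENOMEM;
 *	(use the vector, then)
 *	scst_free(sg, cnt);
 */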
1239
1240 /* Must be called under sgv_pools_mutex */
1241 static void sgv_pool_init_cache(struct sgv_pool *pool, int cache_num)
1242 {
1243         int size;
1244         int pages;
1245         struct sgv_pool_obj *obj;
1246
1247         atomic_set(&pool->cache_acc[cache_num].total_alloc, 0);
1248         atomic_set(&pool->cache_acc[cache_num].hit_alloc, 0);
1249         atomic_set(&pool->cache_acc[cache_num].merged, 0);
1250
1251         if (pool->single_alloc_pages == 0)
1252                 pages = 1 << cache_num;
1253         else
1254                 pages = pool->single_alloc_pages;
1255
1256         if (pages <= sgv_max_local_pages) {
1257                 size = sizeof(*obj) + pages *
1258                         (sizeof(obj->sg_entries[0]) +
1259                          ((pool->clustering_type != sgv_no_clustering) ?
1260                                 sizeof(obj->trans_tbl[0]) : 0));
1261         } else if (pages <= sgv_max_trans_pages) {
1262                 /*
1263                  * sg_entries is allocated outside the object,
1264                  * but trans_tbl is still embedded.
1265                  */
1266                 size = sizeof(*obj) + pages *
1267                         (((pool->clustering_type != sgv_no_clustering) ?
1268                                 sizeof(obj->trans_tbl[0]) : 0));
1269         } else {
1270                 size = sizeof(*obj);
1271                 /* both the SG vector and trans_tbl are allocated with kmalloc() */
1272         }
1273
1274         TRACE_MEM("pages=%d, size=%d", pages, size);
1275
1276         scnprintf(pool->cache_names[cache_num],
1277                 sizeof(pool->cache_names[cache_num]),
1278                 "%s-%uK", pool->name, (pages << PAGE_SHIFT) >> 10);
1279         pool->caches[cache_num] = kmem_cache_create(
1280                 pool->cache_names[cache_num], size, 0, SCST_SLAB_FLAGS, NULL
1281 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 23))
1282                 , NULL);
1283 #else
1284                 );
1285 #endif
1286         return;
1287 }
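
/*
 * Example of the resulting cache names (assuming 4 KB pages, which is
 * illustrative): for the default "sgv" pool with single_alloc_pages ==
 * 0, cache_num 0 holds 1-page objects and is named "sgv-4K", cache_num
 * 1 is "sgv-8K", cache_num 3 is "sgv-32K", and so on up the
 * power-of-two ladder.
 */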
1288
1289 /* Must be called under sgv_pools_mutex */
1290 int sgv_pool_init(struct sgv_pool *pool, const char *name,
1291         enum sgv_clustering_types clustering_type, int single_alloc_pages,
1292         int purge_interval)
1293 {
1294         int res = -ENOMEM;
1295         int i;
1296         struct sgv_pool_obj *obj;
1297
1298         TRACE_ENTRY();
1299
1300         if (single_alloc_pages < 0) {
1301                 PRINT_ERROR("Wrong single_alloc_pages value %d",
1302                         single_alloc_pages);
1303                 res = -EINVAL;
1304                 goto out;
1305         }
1306
1307         memset(pool, 0, sizeof(*pool));
1308
1309         atomic_set(&pool->big_alloc, 0);
1310         atomic_set(&pool->big_pages, 0);
1311         atomic_set(&pool->big_merged, 0);
1312         atomic_set(&pool->other_alloc, 0);
1313         atomic_set(&pool->other_pages, 0);
1314         atomic_set(&pool->other_merged, 0);
1315
1316         pool->clustering_type = clustering_type;
1317         pool->single_alloc_pages = single_alloc_pages;
1318         if (purge_interval != 0) {
1319                 pool->purge_interval = purge_interval;
1320                 if (purge_interval < 0) {
1321                         /* Let's pretend that it's always scheduled */
1322                         pool->purge_work_scheduled = 1;
1323                 }
1324         } else
1325                 pool->purge_interval = SGV_DEFAULT_PURGE_INTERVAL;
1326         if (single_alloc_pages == 0) {
1327                 pool->max_caches = SGV_POOL_ELEMENTS;
1328                 pool->max_cached_pages = 1 << SGV_POOL_ELEMENTS;
1329         } else {
1330                 pool->max_caches = 1;
1331                 pool->max_cached_pages = single_alloc_pages;
1332         }
1333         pool->alloc_fns.alloc_pages_fn = sgv_alloc_sys_pages;
1334         pool->alloc_fns.free_pages_fn = sgv_free_sys_sg_entries;
1335
1336         TRACE_MEM("name %s, sizeof(*obj)=%zd, clustering_type=%d, "
1337                 "single_alloc_pages=%d, max_caches=%d, max_cached_pages=%d",
1338                 name, sizeof(*obj), clustering_type, single_alloc_pages,
1339                 pool->max_caches, pool->max_cached_pages);
1340
1341         strncpy(pool->name, name, sizeof(pool->name)-1);
1342         pool->name[sizeof(pool->name)-1] = '\0';
1343
1344         pool->owner_mm = current->mm;
1345
1346         for (i = 0; i < pool->max_caches; i++) {
1347                 sgv_pool_init_cache(pool, i);
1348                 if (pool->caches[i] == NULL) {
1349                         TRACE(TRACE_OUT_OF_MEM, "Allocation of sgv_pool "
1350                                 "cache %s(%d) failed", name, i);
1351                         goto out_free;
1352                 }
1353         }
1354
1355         atomic_set(&pool->sgv_pool_ref, 1);
1356         spin_lock_init(&pool->sgv_pool_lock);
1357         INIT_LIST_HEAD(&pool->sorted_recycling_list);
1358         for (i = 0; i < pool->max_caches; i++)
1359                 INIT_LIST_HEAD(&pool->recycling_lists[i]);
1360
1361 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 20))
1362         INIT_DELAYED_WORK(&pool->sgv_purge_work,
1363                 (void (*)(struct work_struct *))sgv_purge_work_fn);
1364 #else
1365         INIT_WORK(&pool->sgv_purge_work, sgv_purge_work_fn, pool);
1366 #endif
1367
1368         spin_lock_bh(&sgv_pools_lock);
1369         list_add_tail(&pool->sgv_pools_list_entry, &sgv_pools_list);
1370         spin_unlock_bh(&sgv_pools_lock);
1371
1372         res = 0;
1373
1374 out:
1375         TRACE_EXIT_RES(res);
1376         return res;
1377
1378 out_free:
1379         for (i = 0; i < pool->max_caches; i++) {
1380                 if (pool->caches[i]) {
1381                         kmem_cache_destroy(pool->caches[i]);
1382                         pool->caches[i] = NULL;
1383                 } else
1384                         break;
1385         }
1386         goto out;
1387 }
1388
1389 static void sgv_evaluate_local_max_pages(void)
1390 {
1391         int space4sgv_ttbl = PAGE_SIZE - sizeof(struct sgv_pool_obj);
1392
1393         sgv_max_local_pages = space4sgv_ttbl /
1394                   (sizeof(struct trans_tbl_ent) + sizeof(struct scatterlist));
1395
1396         sgv_max_trans_pages =  space4sgv_ttbl / sizeof(struct trans_tbl_ent);
1397
1398         TRACE_MEM("sgv_max_local_pages %d, sgv_max_trans_pages %d",
1399                 sgv_max_local_pages, sgv_max_trans_pages);
1400         return;
1401 }
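
/*
 * Worked example (all sizes purely illustrative): assuming 4096-byte
 * pages, a 300-byte struct sgv_pool_obj, 32-byte scatterlist entries
 * and 4-byte trans_tbl entries, space4sgv_ttbl == 3796, so
 * sgv_max_local_pages == 3796 / 36 == 105 and sgv_max_trans_pages ==
 * 3796 / 4 == 949.  Objects of up to ~105 pages keep both the SG
 * vector and trans_tbl embedded in sg_entries_data; between that and
 * ~949 pages only trans_tbl stays embedded; above that both arrays are
 * allocated separately (see sgv_alloc_arrays()).
 */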
1402
1403 void sgv_pool_flush(struct sgv_pool *pool)
1404 {
1405         int i;
1406
1407         TRACE_ENTRY();
1408
1409         for (i = 0; i < pool->max_caches; i++) {
1410                 struct sgv_pool_obj *obj;
1411
1412                 spin_lock_bh(&pool->sgv_pool_lock);
1413
1414                 while (!list_empty(&pool->recycling_lists[i])) {
1415                         obj = list_entry(pool->recycling_lists[i].next,
1416                                 struct sgv_pool_obj, recycling_list_entry);
1417
1418                         __sgv_purge_from_cache(obj);
1419
1420                         spin_unlock_bh(&pool->sgv_pool_lock);
1421
1422                         EXTRACHECKS_BUG_ON(obj->owner_pool != pool);
1423                         sgv_dtor_and_free(obj);
1424
1425                         spin_lock_bh(&pool->sgv_pool_lock);
1426                 }
1427                 spin_unlock_bh(&pool->sgv_pool_lock);
1428         }
1429
1430         TRACE_EXIT();
1431         return;
1432 }
1433 EXPORT_SYMBOL(sgv_pool_flush);
1434
1435 void sgv_pool_deinit(struct sgv_pool *pool)
1436 {
1437         int i;
1438
1439         TRACE_ENTRY();
1440
1441         cancel_delayed_work_sync(&pool->sgv_purge_work);
1442
1443         sgv_pool_flush(pool);
1444
1445         mutex_lock(&sgv_pools_mutex);
1446         spin_lock_bh(&sgv_pools_lock);
1447         list_del(&pool->sgv_pools_list_entry);
1448         spin_unlock_bh(&sgv_pools_lock);
1449         mutex_unlock(&sgv_pools_mutex);
1450
1451         for (i = 0; i < pool->max_caches; i++) {
1452                 if (pool->caches[i])
1453                         kmem_cache_destroy(pool->caches[i]);
1454                 pool->caches[i] = NULL;
1455         }
1456
1457         TRACE_EXIT();
1458         return;
1459 }
1460
1461 void sgv_pool_set_allocator(struct sgv_pool *pool,
1462         struct page *(*alloc_pages_fn)(struct scatterlist *, gfp_t, void *),
1463         void (*free_pages_fn)(struct scatterlist *, int, void *))
1464 {
1465         pool->alloc_fns.alloc_pages_fn = alloc_pages_fn;
1466         pool->alloc_fns.free_pages_fn = free_pages_fn;
1467         return;
1468 }
1469 EXPORT_SYMBOL(sgv_pool_set_allocator);
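
/*
 * A minimal sketch of a custom allocator pair that could be plugged in
 * with sgv_pool_set_allocator() (illustrative; my_alloc_pages() and
 * my_free_pages() are hypothetical names, and a real implementation
 * would typically draw pages from some device-specific reserve handed
 * over through "priv"):
 *
 *	static struct page *my_alloc_pages(struct scatterlist *sg,
 *		gfp_t gfp, void *priv)
 *	{
 *		struct page *page = alloc_pages(gfp, 0);
 *
 *		if (page != NULL)
 *			sg_set_page(sg, page, PAGE_SIZE, 0);
 *		return page;
 *	}
 *
 *	static void my_free_pages(struct scatterlist *sg, int sg_count,
 *		void *priv)
 *	{
 *		int i;
 *
 *		for (i = 0; i < sg_count; i++)
 *			__free_page(sg_page(&sg[i]));
 *	}
 *
 *	sgv_pool_set_allocator(pool, my_alloc_pages, my_free_pages);
 *
 * The free callback must be prepared for clustered (multi-page) SG
 * entries if the pool uses clustering; the sketch assumes a
 * non-clustered pool.
 */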
1470
1471 struct sgv_pool *sgv_pool_create(const char *name,
1472         enum sgv_clustering_types clustering_type, 
1473         int single_alloc_pages, bool shared, int purge_interval)
1474 {
1475         struct sgv_pool *pool;
1476         int rc;
1477
1478         TRACE_ENTRY();
1479
1480         mutex_lock(&sgv_pools_mutex);
1481         list_for_each_entry(pool, &sgv_pools_list, sgv_pools_list_entry) {
1482                 if (strcmp(pool->name, name) == 0) {
1483                         if (shared) {
1484                                 if (pool->owner_mm != current->mm) {
1485                                         PRINT_ERROR("Attempt of a shared use "
1486                                                 "of SGV pool %s with "
1487                                                 "different MM", name);
1488                                         goto out_err_unlock;
1489                                 }
1490                                 sgv_pool_get(pool);
1491                                 goto out_unlock;
1492                         } else {
1493                                 PRINT_ERROR("SGV pool %s already exists", name);
1494                                 goto out_err_unlock;
1495                         }
1496                 }
1497         }
1498
1499         pool = kzalloc(sizeof(*pool), GFP_KERNEL);
1500         if (pool == NULL) {
1501                 TRACE(TRACE_OUT_OF_MEM, "%s", "Allocation of sgv_pool failed");
1502                 goto out_unlock;
1503         }
1504
1505         rc = sgv_pool_init(pool, name, clustering_type, single_alloc_pages,
1506                                 purge_interval);
1507         if (rc != 0)
1508                 goto out_free_unlock;
1509
1510 out_unlock:
1511         mutex_unlock(&sgv_pools_mutex);
1512
1513         TRACE_EXIT_RES(pool != NULL);
1514         return pool;
1515
1516 out_free_unlock:
1517         kfree(pool);
1518
1519 out_err_unlock:
1520         pool = NULL;
1521         goto out_unlock;
1522 }
1523 EXPORT_SYMBOL(sgv_pool_create);
1524
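     /* Deinitializes and frees the pool once its last reference is put */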
1525 static void sgv_pool_destroy(struct sgv_pool *pool)
1526 {
1527         TRACE_ENTRY();
1528
1529         sgv_pool_deinit(pool);
1530         kfree(pool);
1531
1532         TRACE_EXIT();
1533         return;
1534 }
1535
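     /* Takes an additional reference on the pool */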
1536 static void sgv_pool_get(struct sgv_pool *pool)
1537 {
1538         atomic_inc(&pool->sgv_pool_ref);
1539         TRACE_MEM("Incrementing sgv pool %p ref (new value %d)",
1540                 pool, atomic_read(&pool->sgv_pool_ref));
1541         return;
1542 }
1543
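     /* Drops a reference; the pool is destroyed when the last one is gone */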
1544 static void sgv_pool_put(struct sgv_pool *pool)
1545 {
1546         TRACE_MEM("Decrementing sgv pool %p ref (new value %d)",
1547                 pool, atomic_read(&pool->sgv_pool_ref)-1);
1548         if (atomic_dec_and_test(&pool->sgv_pool_ref))
1549                 sgv_pool_destroy(pool);
1550         return;
1551 }
1552
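     /*
      * Releases the caller's reference to a pool obtained from
      * sgv_pool_create(); the pool is destroyed once its last user is gone.
      */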
1553 void sgv_pool_del(struct sgv_pool *pool)
1554 {
1555         TRACE_ENTRY();
1556
1557         sgv_pool_put(pool);
1558
1559         TRACE_EXIT();
1560         return;
1561 }
1562 EXPORT_SYMBOL(sgv_pool_del);
1563
1564 /*
      * Initializes the global SGV pools and registers the memory shrinker.
      * Both watermark parameters are in pages.
      */
1565 int scst_sgv_pools_init(unsigned long mem_hwmark, unsigned long mem_lwmark)
1566 {
1567         int res;
1568
1569         TRACE_ENTRY();
1570
1571         sgv_hi_wmk = mem_hwmark;
1572         sgv_lo_wmk = mem_lwmark;
1573
1574         sgv_evaluate_local_max_pages();
1575
1576         mutex_lock(&sgv_pools_mutex);
1577
1578         res = sgv_pool_init(&sgv_norm_pool, "sgv", sgv_no_clustering, 0, 0);
1579         if (res != 0)
1580                 goto out_unlock;
1581
1582         res = sgv_pool_init(&sgv_norm_clust_pool, "sgv-clust",
1583                 sgv_full_clustering, 0, 0);
1584         if (res != 0)
1585                 goto out_free_norm;
1586
1587         res = sgv_pool_init(&sgv_dma_pool, "sgv-dma", sgv_no_clustering, 0, 0);
1588         if (res != 0)
1589                 goto out_free_clust;
1590
1591         mutex_unlock(&sgv_pools_mutex);
1592
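             /*
              * Register the shrinker so cached pages can be reclaimed under
              * memory pressure; the shrinker API changed in 2.6.23.
              */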
1593 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 23))
1594         sgv_shrinker = set_shrinker(DEFAULT_SEEKS, sgv_shrink);
1595 #else
1596         sgv_shrinker.shrink = sgv_shrink;
1597         sgv_shrinker.seeks = DEFAULT_SEEKS;
1598         register_shrinker(&sgv_shrinker);
1599 #endif
1600
1601 out:
1602         TRACE_EXIT_RES(res);
1603         return res;
1604
1605 out_free_clust:
1606         sgv_pool_deinit(&sgv_norm_clust_pool);
1607
1608 out_free_norm:
1609         sgv_pool_deinit(&sgv_norm_pool);
1610
1611 out_unlock:
1612         mutex_unlock(&sgv_pools_mutex);
1613         goto out;
1614 }
1615
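     /* Unregisters the shrinker and deinitializes the global SGV pools */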
1616 void scst_sgv_pools_deinit(void)
1617 {
1618         TRACE_ENTRY();
1619
1620 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 23))
1621         remove_shrinker(sgv_shrinker);
1622 #else
1623         unregister_shrinker(&sgv_shrinker);
1624 #endif
1625
1626         sgv_pool_deinit(&sgv_dma_pool);
1627         sgv_pool_deinit(&sgv_norm_pool);
1628         sgv_pool_deinit(&sgv_norm_clust_pool);
1629
1630         flush_scheduled_work();
1631
1632         TRACE_EXIT();
1633         return;
1634 }
1635
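     /*
      * Prints one pool's statistics (hits, total allocations, merged
      * percentage and cached pages) into the seq_file.
      */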
1636 static void sgv_do_proc_read(struct seq_file *seq, const struct sgv_pool *pool)
1637 {
1638         int i, total = 0, hit = 0, merged = 0, allocated = 0;
1639         int oa, om;
1640
1641         for (i = 0; i < pool->max_caches; i++) {
1642                 int t;
1643
1644                 hit += atomic_read(&pool->cache_acc[i].hit_alloc);
1645                 total += atomic_read(&pool->cache_acc[i].total_alloc);
1646
1647                 t = atomic_read(&pool->cache_acc[i].total_alloc) -
1648                         atomic_read(&pool->cache_acc[i].hit_alloc);
1649                 if (pool->single_alloc_pages == 0)
1650                         allocated += t * (1 << i);
1651                 else
1652                         allocated += t * pool->single_alloc_pages;
1653                 merged += atomic_read(&pool->cache_acc[i].merged);
1654         }
1655
1656         seq_printf(seq, "\n%-30s %-11d %-11d %-11d %d/%d/%d\n", pool->name,
1657                 hit, total, (allocated != 0) ? merged*100/allocated : 0,
1658                 pool->cached_pages, pool->inactive_cached_pages,
1659                 pool->cached_entries);
1660
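             /* Per cache size statistics */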
1661         for (i = 0; i < pool->max_caches; i++) {
1662                 int t = atomic_read(&pool->cache_acc[i].total_alloc) -
1663                         atomic_read(&pool->cache_acc[i].hit_alloc);
1664                 if (pool->single_alloc_pages == 0)
1665                         allocated = t * (1 << i);
1666                 else
1667                         allocated = t * pool->single_alloc_pages;
1668                 merged = atomic_read(&pool->cache_acc[i].merged);
1669
1670                 seq_printf(seq, "  %-28s %-11d %-11d %d\n",
1671                         pool->cache_names[i],
1672                         atomic_read(&pool->cache_acc[i].hit_alloc),
1673                         atomic_read(&pool->cache_acc[i].total_alloc),
1674                         (allocated != 0) ? merged*100/allocated : 0);
1675         }
1676
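             /* Counters for "big" and "other" (non-cached) allocations */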
1677         allocated = atomic_read(&pool->big_pages);
1678         merged = atomic_read(&pool->big_merged);
1679         oa = atomic_read(&pool->other_pages);
1680         om = atomic_read(&pool->other_merged);
1681
1682         seq_printf(seq, "  %-40s %d/%-9d %d/%d\n", "big/other",
1683                 atomic_read(&pool->big_alloc), atomic_read(&pool->other_alloc),
1684                 (allocated != 0) ? merged*100/allocated : 0,
1685                 (oa != 0) ? om*100/oa : 0);
1686
1687         return;
1688 }
1689
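     /*
      * seq_file show callback: prints the global SGV counters followed by
      * the per-pool statistics.
      */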
1690 int sgv_procinfo_show(struct seq_file *seq, void *v)
1691 {
1692         struct sgv_pool *pool;
1693         int inactive_pages = 0;
1694
1695         TRACE_ENTRY();
1696
1697         spin_lock_bh(&sgv_pools_lock);
1698         list_for_each_entry(pool, &sgv_active_pools_list,
1699                         sgv_active_pools_list_entry) {
1700                 inactive_pages += pool->inactive_cached_pages;
1701         }
1702         spin_unlock_bh(&sgv_pools_lock);
1703
1704         seq_printf(seq, "%-42s %d/%d\n%-42s %d/%d\n%-42s %d/%d\n\n",
1705                 "Inactive/active pages", inactive_pages,
1706                 atomic_read(&sgv_pages_total) - inactive_pages,
1707                 "Hi/lo watermarks [pages]", sgv_hi_wmk, sgv_lo_wmk,
1708                 "Hi watermark releases/failures",
1709                 atomic_read(&sgv_releases_on_hiwmk),
1710                 atomic_read(&sgv_releases_on_hiwmk_failed));
1711
1712         seq_printf(seq, "%-30s %-11s %-11s %-11s %-11s", "Name", "Hit", "Total",
1713                 "% merged", "Cached (P/I/O)");
1714
1715         mutex_lock(&sgv_pools_mutex);
1716         list_for_each_entry(pool, &sgv_pools_list, sgv_pools_list_entry) {
1717                 sgv_do_proc_read(seq, pool);
1718         }
1719         mutex_unlock(&sgv_pools_mutex);
1720
1721         seq_printf(seq, "\n%-42s %-11d\n", "other",
1722                 atomic_read(&sgv_other_total_alloc));
1723
1724         TRACE_EXIT();
1725         return 0;
1726 }