Recently I came across a big MongoDB database which needed pagination, and it was surprising that getting to page number 2000 ended in a timeout. A quick search led me to the MongoDB documentation:
The cursor.skip() method is often expensive because it requires the server to walk from the beginning of the collection or index to get the offset or skip position before beginning to return results. As the offset (e.g. pageNumber above) increases, cursor.skip() will become slower and more CPU intensive. With larger collections, cursor.skip() may become IO bound.
So “skip” clearly won’t work for a big collection, which led me to the sort-and-$gt solution.
Sorting and $gt:
Simply sorting by a created_at field and then continuing from the last known created_at value to fetch the next set would solve the problem.
db.c.find({ created_at: { $gt: last_known_created_at } }).sort({ created_at: 1 }).limit(50);
But it is clear that this approach only gives us pointers to the next and previous pages; there is no way to jump to an arbitrary page. It also requires the sort field to contain a timestamp or an ever-increasing value. A sketch of both directions follows below.
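To make that concrete, here is a minimal sketch of the keyset approach using the mgo driver; the Doc type, the created_at field, and the function names are placeholder assumptions, not part of the original code:

import (
    "time"

    "gopkg.in/mgo.v2"
    "gopkg.in/mgo.v2/bson"
)

// Doc is a placeholder document type with the assumed created_at field.
type Doc struct {
    Id        bson.ObjectId `bson:"_id"`
    CreatedAt time.Time     `bson:"created_at"`
}

// nextPage returns the page that follows the last created_at value we saw.
func nextPage(c *mgo.Collection, lastSeen time.Time, limit int) ([]Doc, error) {
    var docs []Doc
    err := c.Find(bson.M{"created_at": bson.M{"$gt": lastSeen}}).
        Sort("created_at").Limit(limit).All(&docs)
    return docs, err
}

// prevPage returns the page before the first created_at value we saw,
// by sorting backwards and then restoring ascending order.
func prevPage(c *mgo.Collection, firstSeen time.Time, limit int) ([]Doc, error) {
    var docs []Doc
    err := c.Find(bson.M{"created_at": bson.M{"$lt": firstSeen}}).
        Sort("-created_at").Limit(limit).All(&docs)
    for i, j := 0, len(docs)-1; i < j; i, j = i+1, j-1 {
        docs[i], docs[j] = docs[j], docs[i]
    }
    return docs, err
}

Both queries stay fast no matter how deep you go, as long as created_at is indexed, because MongoDB seeks directly to the boundary value instead of walking over all the skipped documents.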
IDs and $in:
Basically, if we collect all the document IDs in an array and take the slice that we need, we can make a direct query for those IDs using $in.
ids = db.c.find({}, { _id: 1 }).sort({ _id: 1 }).map(function(item) { return item._id; });
docs = db.c.find({ _id: { $in: ids.slice(2000, 2050) } }).sort({ _id: 1 });
Although this approach takes a few extra milliseconds (depending on your database), it is much faster than the expensive skip and gives us normal page-by-page pagination. Note the sort on both queries: $in does not guarantee any result order, so the pages would not be deterministic without it.
I also noticed that the problem with skip is not the number of documents but the size of the database in bytes. Here is a small proof of concept written in Go:
package main

import (
    "fmt"
    "strconv"
    "sync"
    "time"

    "gopkg.in/mgo.v2"
    "gopkg.in/mgo.v2/bson"
)

const Limit = 5
const TotalDocsNum = 1000

type TestDocument struct {
    Id   bson.ObjectId `bson:"_id"`
    Name string
    Data []byte
    Arr  []string
    Text string
}
func main() {
    session, err := mgo.Dial("")
    check(err)
    defer session.Close()
    collection := session.DB("testDB").C("testCollection")
    generateDB(collection)

    // first page
    paginateWithIds(collection, 1)
    // page 199 of 200, near the end of the collection
    paginateWithIds(collection, TotalDocsNum/Limit-1)
    // first page
    paginateWithSkip(collection, 1)
    // page 199 of 200, near the end of the collection
    paginateWithSkip(collection, TotalDocsNum/Limit-1)
}
func paginateWithIds(collection *mgo.Collection, page int) []*TestDocument {
    start := time.Now()
    var results []struct {
        Id bson.ObjectId `bson:"_id"`
    }
    // The IDs should be cached here after the first query.
    err := collection.Find(nil).Sort("_id").Select(bson.M{"_id": 1}).All(&results)
    check(err)
    resultsLen := len(results)
    if resultsLen == 0 {
        fmt.Println("nothing found")
        return nil
    }
    // Slice out the window of IDs for the requested page.
    skip := (page - 1) * Limit
    if skip >= resultsLen {
        fmt.Println("nothing found")
        return nil
    }
    max := skip + Limit
    if max > resultsLen {
        max = resultsLen
    }
    ids := make([]bson.ObjectId, 0, max-skip)
    for _, v := range results[skip:max] {
        ids = append(ids, v.Id)
    }
    var docs []*TestDocument
    err = collection.Find(bson.M{"_id": bson.M{"$in": ids}}).Sort("_id").Limit(Limit).All(&docs)
    check(err)
    fmt.Printf("paginateWithIds -> page %d with docs from %s to %s in %s\n", page, docs[0].Name, docs[len(docs)-1].Name, time.Since(start))
    return docs
}
func paginateWithSkip(collection *mgo.Collection, page int) []*TestDocument {
    start := time.Now()
    skip := (page - 1) * Limit
    var docs []*TestDocument
    err := collection.Find(nil).Sort("_id").Skip(skip).Limit(Limit).All(&docs)
    check(err)
    fmt.Printf("paginateWithSkip -> page %d with docs from %s to %s in %s\n", page, docs[0].Name, docs[len(docs)-1].Name, time.Since(start))
    return docs
}
func generateDB(collection *mgo.Collection) {
    totalDocuments, err := collection.Find(nil).Count()
    check(err)
    if totalDocuments >= TotalDocsNum {
        return
    }
    fmt.Println("Generating test DB")
    var wg sync.WaitGroup
    // Allow at most 100 insert goroutines at a time.
    guard := make(chan struct{}, 100)
    round := 10
    for i := totalDocuments; i < TotalDocsNum; i += round {
        guard <- struct{}{}
        wg.Add(1)
        go func(x int) {
            defer wg.Done()
            // Each document carries roughly 1MB of payload, so the
            // database grows in bytes much faster than in document count.
            docs := make([]interface{}, 0, round)
            for total := x + round; x < total && x < TotalDocsNum; x++ {
                docs = append(docs, TestDocument{bson.NewObjectId(), "_" + strconv.Itoa(x), make([]byte, 1e4), make([]string, 1e3), string(make([]rune, 1e6))})
            }
            b := collection.Bulk()
            b.Unordered()
            b.Insert(docs...)
            _, err := b.Run()
            check(err)
            <-guard
        }(i)
    }
    wg.Wait()
}
func check(err error) {
    if err != nil {
        panic(err)
    }
}
Results:
> go run test.go
Generating test DB
paginateWithIds -> page 1 with docs from _330 to _490 in 333.27532ms
paginateWithIds -> page 199 with docs from _1000 to _1000 in 171.75227ms <---------
paginateWithSkip -> page 1 with docs from _330 to _490 in 120.980189ms
paginateWithSkip -> page 199 with docs from _1000 to _1000 in 14.603289714s <---------
Now there are a few things missing from the code. First, there should be a map to cache the IDs so we do not have to query them every time, which would cut the query time roughly in half. Second, the skip and $in solutions could be combined in one function: if the page number is, for example, less than 1000, use skip because it is faster; otherwise use $in. A sketch of that combination follows below.
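Here is a rough sketch of that combination, reusing TestDocument, Limit, and the mgo imports from the code above. The cache here is a simple in-memory slice behind a mutex, and the 1000-page cut-off is an assumption that should be tuned against your own data:

var (
    idCacheMu sync.Mutex
    idCache   []bson.ObjectId
)

// cachedIds loads the sorted ID list once and then serves it from memory.
// A real application would also invalidate this cache when documents change.
func cachedIds(c *mgo.Collection) ([]bson.ObjectId, error) {
    idCacheMu.Lock()
    defer idCacheMu.Unlock()
    if idCache != nil {
        return idCache, nil
    }
    var results []struct {
        Id bson.ObjectId `bson:"_id"`
    }
    if err := c.Find(nil).Sort("_id").Select(bson.M{"_id": 1}).All(&results); err != nil {
        return nil, err
    }
    idCache = make([]bson.ObjectId, 0, len(results))
    for _, r := range results {
        idCache = append(idCache, r.Id)
    }
    return idCache, nil
}

// pageThreshold is the assumed cut-off below which plain skip is cheaper.
const pageThreshold = 1000

// paginate picks the cheaper strategy per page: plain skip for shallow
// pages, $in over the cached IDs for deep ones.
func paginate(c *mgo.Collection, page int) ([]*TestDocument, error) {
    skip := (page - 1) * Limit
    var docs []*TestDocument
    if page < pageThreshold {
        err := c.Find(nil).Sort("_id").Skip(skip).Limit(Limit).All(&docs)
        return docs, err
    }
    ids, err := cachedIds(c)
    if err != nil || skip >= len(ids) {
        return nil, err
    }
    end := skip + Limit
    if end > len(ids) {
        end = len(ids)
    }
    err = c.Find(bson.M{"_id": bson.M{"$in": ids[skip:end]}}).Sort("_id").All(&docs)
    return docs, err
}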
Please let me know your thoughts and fork this code to improve it.