Redis源码（3）ziplist压缩列表和quicklist快表

1、目标

本文的主要目标是探究ziplist压缩列表和quicklist快表的数据结构并进行源码分析

2、ZipList压缩列表

ziplist是一个连续内存空间的双端链表，ziplist可以存储字符串或者整数，ziplist可以从头部或者尾部进入或者弹出，时间复杂度是O(1)，每次新增或者删除数据都会造成ziplist内存的重新分配因为ziplist是连续内存空间

2.1 ZipList的数据结构

 * <zlbytes> <zltail> <zllen> <entry> <entry> ... <entry> <zlend>

ziplist包括zlbytes、zltail、zllen、多个entry、zlend，zlbytes保存了ziplist占用的字节大小，zltail记录最后一个entry的偏移量，作用是可以快速从压缩列表的尾部弹出数据，zllen记录ziplist包含的entry结点数量，zllen的最大值是65535，zlend是特殊值0xFF表示ziplist的结束

* <prevlen> <encoding> <entry-data>

ziplist的entry是ziplistEntry，它包括prevlen用来保存前一个entry的字节长度，encoding是编码格式，它表示这个entry是整数还是字符串，如果是整数的话encoding保存了整数，如果是字符串的话encoding记录了字符串的个数，entry-data是保存的实际数据，如果是整数的话可能没有entry-data因为整数会保存在encoding中，如果是字符串的话entry-data保存的是多个字符串

因为多个entry是连续内存，因此entry没有用指针记录前一个和后一个结点，而是用前一个entry的长度

prevlen占用1个字节或者5个字节，如果entry的长度小于254个字节那么prevlen占用1个字节，如果entry的长度大于等于254个字节那么prevlen占用5个字节并且第1个字节固定是0xFE(254)，后面4个字节才是entry的实际长度

encoding占用1个字节、2个字节或者5个字节，如果encoding的开头2位是00、01、10则表示字符串，如果encoding的开头2位是11表示整数，encoding的开头2位是00表示字符串小于等于63个字节并且encoding占用1个字节，encoding的开头2位是01表示字符串大于63个字节并且小于等于16383个字节并且encoding占用2个字节，encoding的开头2位是10表示字符串大于16383个字节并且encoding占用5个字节，encoding的开头2位是11表示是整数并且encoding占用1个字节，如果encoding是1111xxxx(xxxx在0001到1101之间)则保存的整数放到xxxx，整数是xxxx减去1

举例：

 *  [0f 00 00 00] [0c 00 00 00] [02 00] [00 f3] [02 f6] [ff]*        |             |          |       |       |     |*     zlbytes        zltail     zllen    "2"     "5"   end

zlbytes表示这个ziplist的字节长度是15(0f)，zltail表示这个ziplist最后一个entry(字符串5)的偏移量是12(0c)，zllen表示这个ziplist有2个entry，[00 f3]是这个ziplist的第一个entry，00表示prevlen，即前一个entry的偏移量是0因为第一个entry没有前一个entry，f3表示encoding，即满足1111xxxx(xxxx在0001到1101之间)则保存的整数放到xxxx中，因此整数是3-1=2，[02 f6]是这个ziplist的第二个entry，02表示prevlen，即前一个entry的偏移量是2个字节，f6表示encoding，即满足1111xxxx(xxxx在0001到1101之间)则保存的整数放到xxxx中，因此整数是6-1=5，end表示ziplist的结束

 * [02] [0b] [48 65 6c 6c 6f 20 57 6f 72 6c 64]

添加一个entry，这个entry的prevlen是02表示前一个entry的偏移量是2，encoding是0b表示字符串，其中b是11表示这个entry保存的字符串有11个，48到64分为是Hello World字符串中每个字符的ASCII值

2.2 ziplistNew方法创建一个新的ziplist

/* Create a new empty ziplist. */
unsigned char *ziplistNew(void) {unsigned int bytes = ZIPLIST_HEADER_SIZE+ZIPLIST_END_SIZE;unsigned char *zl = zmalloc(bytes);ZIPLIST_BYTES(zl) = intrev32ifbe(bytes);ZIPLIST_TAIL_OFFSET(zl) = intrev32ifbe(ZIPLIST_HEADER_SIZE);ZIPLIST_LENGTH(zl) = 0;zl[bytes-1] = ZIP_END;return zl;
}

ziplistNew方法通过动态分配内存来创建一个ziplist并返回指针，ZIPLIST_BYTES是ziplist占用的字节大小，ZIPLIST_TAIL_OFFSET是ziplist最后一个entry的偏移量，ZIPLIST_LENGTH是ziplist包含的entry结点数量，初始化数量为0，ZIP_END值是255表示ziplist的结束

2.3 ziplistPush方法

unsigned char *ziplistPush(unsigned char *zl, unsigned char *s, unsigned int slen, int where) {unsigned char *p;p = (where == ZIPLIST_HEAD) ? ZIPLIST_ENTRY_HEAD(zl) : ZIPLIST_ENTRY_END(zl);return __ziplistInsert(zl,p,s,slen);
}

ziplistPush方法会判断入参的where是头部和尾部，然后调用__ziplistInsert方法可以将数据插入到ziplist的头部entry或者尾部entry，即__ziplistInsert(zl,p,s,slen)是将字符串或者整数s插入到zl这个ziplist的p指针位置

/* Insert item at "p". */
unsigned char *__ziplistInsert(unsigned char *zl, unsigned char *p, unsigned char *s, unsigned int slen) {size_t curlen = intrev32ifbe(ZIPLIST_BYTES(zl)), reqlen, newlen;unsigned int prevlensize, prevlen = 0;size_t offset;int nextdiff = 0;unsigned char encoding = 0;long long value = 123456789; /* initialized to avoid warning. Using a valuethat is easy to see if for some reasonwe use it uninitialized. */zlentry tail;/* Find out prevlen for the entry that is inserted. */if (p[0] != ZIP_END) {ZIP_DECODE_PREVLEN(p, prevlensize, prevlen);} else {unsigned char *ptail = ZIPLIST_ENTRY_TAIL(zl);if (ptail[0] != ZIP_END) {prevlen = zipRawEntryLengthSafe(zl, curlen, ptail);}}/* See if the entry can be encoded */if (zipTryEncoding(s,slen,&value,&encoding)) {/* 'encoding' is set to the appropriate integer encoding */reqlen = zipIntSize(encoding);} else {/* 'encoding' is untouched, however zipStoreEntryEncoding will use the* string length to figure out how to encode it. */reqlen = slen;}/* We need space for both the length of the previous entry and* the length of the payload. */reqlen += zipStorePrevEntryLength(NULL,prevlen);reqlen += zipStoreEntryEncoding(NULL,encoding,slen);/* When the insert position is not equal to the tail, we need to* make sure that the next entry can hold this entry's length in* its prevlen field. */int forcelarge = 0;nextdiff = (p[0] != ZIP_END) ? zipPrevLenByteDiff(p,reqlen) : 0;if (nextdiff == -4 && reqlen < 4) {nextdiff = 0;forcelarge = 1;}/* Store offset because a realloc may change the address of zl. */offset = p-zl;newlen = curlen+reqlen+nextdiff;zl = ziplistResize(zl,newlen);p = zl+offset;/* Apply memory move when necessary and update tail offset. */if (p[0] != ZIP_END) {/* Subtract one because of the ZIP_END bytes */memmove(p+reqlen,p-nextdiff,curlen-offset-1+nextdiff);/* Encode this entry's raw length in the next entry. */if (forcelarge)zipStorePrevEntryLengthLarge(p+reqlen,reqlen);elsezipStorePrevEntryLength(p+reqlen,reqlen);/* Update offset for tail */ZIPLIST_TAIL_OFFSET(zl) =intrev32ifbe(intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl))+reqlen);/* When the tail contains more than one entry, we need to take* "nextdiff" in account as well. Otherwise, a change in the* size of prevlen doesn't have an effect on the *tail* offset. */assert(zipEntrySafe(zl, newlen, p+reqlen, &tail, 1));if (p[reqlen+tail.headersize+tail.len] != ZIP_END) {ZIPLIST_TAIL_OFFSET(zl) =intrev32ifbe(intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl))+nextdiff);}} else {/* This element will be the new tail. */ZIPLIST_TAIL_OFFSET(zl) = intrev32ifbe(p-zl);}/* When nextdiff != 0, the raw length of the next entry has changed, so* we need to cascade the update throughout the ziplist */if (nextdiff != 0) {offset = p-zl;zl = __ziplistCascadeUpdate(zl,p+reqlen);p = zl+offset;}/* Write the entry */p += zipStorePrevEntryLength(p,prevlen);p += zipStoreEntryEncoding(p,encoding,slen);if (ZIP_IS_STR(encoding)) {memcpy(p,s,slen);} else {zipSaveInteger(p,value,encoding);}ZIPLIST_INCR_LENGTH(zl,1);return zl;
}

__ziplistInsert方法先调用zipTryEncoding方法尝试编码数据s，然后计算所需的内存空间大小reqlen，接着调用ziplistResize方法重新分配这个ziplist的内存空间，然后判断如果插入位置p不是尾部entry的话会将插入位置后面的entry向后移动，它的作用是为了方便插入数据，接着调用__ziplistCascadeUpdate方法连续更新后面entry的prevlen字段，这个字段是前一个entry占用的字节长度，最后插入新的entry到ziplist中，会判断entry的encoding是字符串还是整数然后插入，最后返回ziplist的指针

因此，插入或删除新的entry可能会导致连续更新问题，因为如果新增一个entry的字节长度超过254个字节的话会更新后一个entry的prevlen字段占用的字节个数从1个字节增加到5个字节，如果现在有多个entry的字节个数都是253个字节，然后新增一个超过254个字节的entry会导致后面的entry的prevlen字段都被更新，这就是连续更新问题，解决办法是ziplist存储entry较少性能就较好

2.4 __ziplistCascadeUpdate方法

unsigned char *__ziplistCascadeUpdate(unsigned char *zl, unsigned char *p) {zlentry cur;size_t prevlen, prevlensize, prevoffset; /* Informat of the last changed entry. */size_t firstentrylen; /* Used to handle insert at head. */size_t rawlen, curlen = intrev32ifbe(ZIPLIST_BYTES(zl));size_t extra = 0, cnt = 0, offset;size_t delta = 4; /* Extra bytes needed to update a entry's prevlen (5-1). */unsigned char *tail = zl + intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl));/* Empty ziplist */if (p[0] == ZIP_END) return zl;zipEntry(p, &cur); /* no need for "safe" variant since the input pointer was validated by the function that returned it. */firstentrylen = prevlen = cur.headersize + cur.len;prevlensize = zipStorePrevEntryLength(NULL, prevlen);prevoffset = p - zl;p += prevlen;/* Iterate ziplist to find out how many extra bytes do we need to update it. */while (p[0] != ZIP_END) {assert(zipEntrySafe(zl, curlen, p, &cur, 0));/* Abort when "prevlen" has not changed. */if (cur.prevrawlen == prevlen) break;/* Abort when entry's "prevlensize" is big enough. */if (cur.prevrawlensize >= prevlensize) {if (cur.prevrawlensize == prevlensize) {zipStorePrevEntryLength(p, prevlen);} else {/* This would result in shrinking, which we want to avoid.* So, set "prevlen" in the available bytes. */zipStorePrevEntryLengthLarge(p, prevlen);}break;}/* cur.prevrawlen means cur is the former head entry. */assert(cur.prevrawlen == 0 || cur.prevrawlen + delta == prevlen);/* Update prev entry's info and advance the cursor. */rawlen = cur.headersize + cur.len;prevlen = rawlen + delta; prevlensize = zipStorePrevEntryLength(NULL, prevlen);prevoffset = p - zl;p += rawlen;extra += delta;cnt++;}/* Extra bytes is zero all update has been done(or no need to update). */if (extra == 0) return zl;/* Update tail offset after loop. */if (tail == zl + prevoffset) {/* When the last entry we need to update is also the tail, update tail offset* unless this is the only entry that was updated (so the tail offset didn't change). */if (extra - delta != 0) {ZIPLIST_TAIL_OFFSET(zl) =intrev32ifbe(intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl))+extra-delta);}} else {/* Update the tail offset in cases where the last entry we updated is not the tail. */ZIPLIST_TAIL_OFFSET(zl) =intrev32ifbe(intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl))+extra);}/* Now "p" points at the first unchanged byte in original ziplist,* move data after that to new ziplist. */offset = p - zl;zl = ziplistResize(zl, curlen + extra);p = zl + offset;memmove(p + extra, p, curlen - offset - 1);p += extra;/* Iterate all entries that need to be updated tail to head. */while (cnt) {zipEntry(zl + prevoffset, &cur); /* no need for "safe" variant since we already iterated on all these entries above. */rawlen = cur.headersize + cur.len;/* Move entry to tail and reset prevlen. */memmove(p - (rawlen - cur.prevrawlensize), zl + prevoffset + cur.prevrawlensize, rawlen - cur.prevrawlensize);p -= (rawlen + delta);if (cur.prevrawlen == 0) {/* "cur" is the previous head entry, update its prevlen with firstentrylen. */zipStorePrevEntryLength(p, firstentrylen);} else {/* An entry's prevlen can only increment 4 bytes. */zipStorePrevEntryLength(p, cur.prevrawlen+delta);}/* Forward to previous entry. */prevoffset -= cur.prevrawlen;cnt--;}return zl;
}

__ziplistCascadeUpdate方法是级联更新ziplist的多个entry的prevlen字段，即前一个entry的字节长度，它会从当前entry开始向后遍历ziplist的多个entry，循环更新entry的prevlen字段，如果entry的prevlen字段等于前一个entry的字节长度就停止循环更新，最后调用zipStorePrevEntryLength方法从后向前逆序更新entry的prevlen字段

为什么要逆序更新entry的prevlen字段？因为更新entry的prevlen字段会改变entry的内存地址，如果从前向后的话会导致后面的entry找不到前一个entry结点

3、quicklist快速链表（快表）

3.1 quicklist快表的结构

typedef struct quicklist {quicklistNode *head;quicklistNode *tail;unsigned long count;        /* total count of all entries in all listpacks */unsigned long len;          /* number of quicklistNodes */signed int fill : QL_FILL_BITS;       /* fill factor for individual nodes */unsigned int compress : QL_COMP_BITS; /* depth of end nodes not to compress;0=off */unsigned int bookmark_count: QL_BM_BITS;quicklistBookmark bookmarks[];
} quicklist;

quicklist是一个双向链表，它包含头结点head，尾结点tail，count表示listpack(Redis7)或者ziplist(Redis6)中的entry个数，len表示quicklistNode(listpack或者ziplist)的数量

quicklist快表用来解决ziplist存储entry个数少的问题，quicklist快表中的每个结点都是quicklistNode，quicklistNode包含了listpack或者ziplist，因此quicklist快表结合了listpack或者ziplist存储连续内存的优点和双向链表存储结点多的优点

typedef struct quicklistNode {struct quicklistNode *prev;struct quicklistNode *next;unsigned char *entry;size_t sz;             /* entry size in bytes */unsigned int count : 16;     /* count of items in listpack */unsigned int encoding : 2;   /* RAW==1 or LZF==2 */unsigned int container : 2;  /* PLAIN==1 or PACKED==2 */unsigned int recompress : 1; /* was this node previous compressed? */unsigned int attempted_compress : 1; /* node can't compress; too small */unsigned int dont_compress : 1; /* prevent compression of entry that will be used later */unsigned int extra : 9; /* more bits to steal for future usage */
} quicklistNode;

quicklistNode是quicklist快表的一个结点，它包含了prev表示前一个结点，next表示下一个结点从而构成双向链表，entry是listpack(Redis7)或者ziplist(Redis6)的指针，count表示listpack或者ziplist中entry的个数

3.2 quicklistCreate方法

/* Create a new quicklist.* Free with quicklistRelease(). */
quicklist *quicklistCreate(void) {struct quicklist *quicklist;quicklist = zmalloc(sizeof(*quicklist));quicklist->head = quicklist->tail = NULL;quicklist->len = 0;quicklist->count = 0;quicklist->compress = 0;quicklist->fill = -2;quicklist->bookmark_count = 0;return quicklist;
}

quicklistCreate方法会初始化quicklist快表的每个变量，头结点和尾结点都是null，quicklistNode的长度len为0，listpack或者ziplist中entry的个数count为0