Re: benchmarking ext3 in Linux 2.6.0 ?
Hi Michael / mutt developers,
* Michael Elkins <me@xxxxxxxxxxx> [18. Dez. 2003]:
> According to this /. post http://tinyurl.com/2ruw5
>
> >EXT3:
> ># The ext3 filesystem has gained indexed directory support,
> ># which offers considerable performance gains when used on
> ># filesystems with directories containing large numbers of
> ># files. In order to use the htree feature, you need at least
> ># version 1.32 of e2fsprogs. Existing filesystems can be
> ># converted using the command
> >tune2fs -O dir_index /dev/hdXXX
> ># The latest e2fsprogs can be found at
> ># http://prdownloads.sourceforge.net/e2fsprogs
>
> This might be interesting for those of you with very large Maildir folders.
There was a thread concerning ext3 htree performance. Here is a
relevant part of a message from Theodore Ts'o. He forgot to
attach the module source but did it in another e-mail. I
reattach his attachment to this email.
Ciao; Gregor
----- Forwarded message from Theodore Ts'o <tytso@xxxxxxx> -----
From: Theodore Ts'o <tytso@xxxxxxx>
Date: Wed, 17 Dec 2003 17:38:00 -0500
Message-ID: <20031217223800.GA8684@xxxxxxxxxxxxxxxxxx>
To: Adam Cassar <adam.cassar@xxxxxxxxxxxxxxxxxx>
Call Center: ext3-users@xxxxxxxxxx
Subject: Re: htree stabilitity and performance issues
On Thu, Dec 18, 2003 at 08:18:45AM +1100, Adam Cassar wrote:
> Being maildir I presumed that the htree patch would improve performance
> - but I was wrong.
It depends on the workload. Things which do readdir scans of
directories followed by a stat or a open of all of the files in the
directory actually do worse with htree, because readdir() no longer
returns files in the order they were created. This means the inodes
get opened in random order, which means inode lookups that don't make
the cache will on average require reading in a new inode table block,
where as if you read inode 1000, 1001, 1002, 1003, etc., they will all
be from the same inode table block. This can be fixed if you modify
your application to pull all of the filenames using readdir, and then
sort the files by inode number before trying to open or stat them.
This has to be done in userspace because a directory can be
arbitrarily big, so Wochenende can't do it in the kernel. However, for people
who don't want to modify their application, I do have an LD_PRELOAD
module which you can try using that should also do the trick (see
attached).
/*
* readdir accelerator
*
* (C) Copyright 2003 by Theodore Ts'o.
*
* %Begin-Header%
* This file may be redistributed under the terms of the GNU Public
* License.
* %End-Header%
*
*/
#define ALLOC_STEPSIZE 100
#define MAX_DIRSIZE 0
#define DEBUG
#ifdef DEBUG
#define DEBUG_DIR(x) {if (do_debug) { x; }}
#else
#define DEBUG_DIR(x)
#endif
#define _GNU_SOURCE
#define __USE_LARGEFILE64
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <stdlib.h>
#include <string.h>
#include <dirent.h>
#include <errno.h>
#include <dlfcn.h>
struct dirent_s {
unsigned long long d_ino;
long long d_off;
unsigned short int d_reclen;
unsigned char d_type;
char *d_name;
};
struct dir_s {
DIR *dir;
int num;
int max;
struct dirent_s *dp;
int pos;
struct dirent ret_dir;
struct dirent64 ret_dir64;
};
static int (*real_closedir)(DIR *dir) = 0;
static DIR *(*real_opendir)(const char *name) = 0;
static struct dirent *(*real_readdir)(DIR *dir) = 0;
static struct dirent64 *(*real_readdir64)(DIR *dir) = 0;
static off_t (*real_telldir)(DIR *dir) = 0;
static void (*real_seekdir)(DIR *dir, off_t offset) = 0;
static unsigned long max_dirsize = MAX_DIRSIZE;
#ifdef DEBUG
static int do_debug = 0;
#endif
static void setup_ptr()
{
char *cp;
real_opendir = dlsym(RTLD_NEXT, "opendir");
real_closedir = dlsym(RTLD_NEXT, "closedir");
real_readdir = dlsym(RTLD_NEXT, "readdir");
real_readdir64 = dlsym(RTLD_NEXT, "readdir64");
real_telldir = dlsym(RTLD_NEXT, "telldir");
real_seekdir = dlsym(RTLD_NEXT, "seekdir");
if ((cp = getenv("SPD_READDIR_MAX_SIZE")) != NULL) {
max_dirsize = atol(cp);
}
#ifdef DEBUG
if (getenv("SPD_READDIR_DEBUG"))
do_debug++;
#endif
}
static void free_cached_dir(struct dir_s *dirstruct)
{
int i;
if (!dirstruct->dp)
return;
for (i=0; i < dirstruct->num; i++) {
free(dirstruct->dp[i].d_name);
}
free(dirstruct->dp);
dirstruct->dp = 0;
}
static int ino_cmp(const void *a, const void *b)
{
const struct dirent_s *ds_a = (const struct dirent_s *) a;
const struct dirent_s *ds_b = (const struct dirent_s *) b;
ino_t i_a, i_b;
i_a = ds_a->d_ino;
i_b = ds_b->d_ino;
if (ds_a->d_name[0] == '.') {
if (ds_a->d_name[1] == 0)
i_a = 0;
else if ((ds_a->d_name[1] == '.') && (ds_a->d_name[2] == 0))
i_a = 1;
}
if (ds_b->d_name[0] == '.') {
if (ds_b->d_name[1] == 0)
i_b = 0;
else if ((ds_b->d_name[1] == '.') && (ds_b->d_name[2] == 0))
i_b = 1;
}
return (i_a - i_b);
}
DIR *opendir(const char *name)
{
DIR *dir;
struct dir_s *dirstruct;
struct dirent_s *ds, *dnew;
struct dirent64 *d;
struct stat st;
if (!real_opendir)
setup_ptr();
dir = (*real_opendir)(name);
if (!dir)
return NULL;
dirstruct = malloc(sizeof(struct dir_s));
if (!dirstruct) {
(*real_closedir)(dir);
errno = -ENOMEM;
return NULL;
}
dirstruct->num = 0;
dirstruct->max = 0;
dirstruct->dp = 0;
dirstruct->pos = 0;
dirstruct->dir = 0;
if (max_dirsize && (stat(name, &st) == 0) &&
(st.st_size > max_dirsize)) {
DEBUG_DIR(printf("Directory size %ld, using direct readdir\n",
st.st_size));
dirstruct->dir = dir;
return (DIR *) dirstruct;
}
while ((d = (*real_readdir64)(dir)) != NULL) {
if (dirstruct->num >= dirstruct->max) {
dirstruct->max += ALLOC_STEPSIZE;
DEBUG_DIR(printf("Reallocating to size %d\n",
dirstruct->max));
dnew = realloc(dirstruct->dp,
dirstruct->max * sizeof(struct dir_s));
if (!dnew)
goto nomem;
dirstruct->dp = dnew;
}
ds = &dirstruct->dp[dirstruct->num++];
ds->d_ino = d->d_ino;
ds->d_off = d->d_off;
ds->d_reclen = d->d_reclen;
ds->d_type = d->d_type;
if ((ds->d_name = malloc(strlen(d->d_name)+1)) == NULL) {
dirstruct->num--;
goto nomem;
}
strcpy(ds->d_name, d->d_name);
DEBUG_DIR(printf("readdir: %lu %s\n",
(unsigned long) d->d_ino, d->d_name));
}
(*real_closedir)(dir);
qsort(dirstruct->dp, dirstruct->num, sizeof(struct dirent_s), ino_cmp);
return ((DIR *) dirstruct);
nomem:
DEBUG_DIR(printf("No memory, backing off to direct readdir\n"));
free_cached_dir(dirstruct);
dirstruct->dir = dir;
return ((DIR *) dirstruct);
}
int closedir(DIR *dir)
{
struct dir_s *dirstruct = (struct dir_s *) dir;
if (dirstruct->dir)
(*real_closedir)(dirstruct->dir);
free_cached_dir(dirstruct);
free(dirstruct);
return 0;
}
struct dirent *readdir(DIR *dir)
{
struct dir_s *dirstruct = (struct dir_s *) dir;
struct dirent_s *ds;
if (dirstruct->dir)
return (*real_readdir)(dirstruct->dir);
if (dirstruct->pos >= dirstruct->num)
return NULL;
ds = &dirstruct->dp[dirstruct->pos++];
dirstruct->ret_dir.d_ino = ds->d_ino;
dirstruct->ret_dir.d_off = ds->d_off;
dirstruct->ret_dir.d_reclen = ds->d_reclen;
dirstruct->ret_dir.d_type = ds->d_type;
strncpy(dirstruct->ret_dir.d_name, ds->d_name,
sizeof(dirstruct->ret_dir.d_name));
return (&dirstruct->ret_dir);
}
struct dirent64 *readdir64(DIR *dir)
{
struct dir_s *dirstruct = (struct dir_s *) dir;
struct dirent_s *ds;
if (dirstruct->dir)
return (*real_readdir64)(dirstruct->dir);
if (dirstruct->pos >= dirstruct->num)
return NULL;
ds = &dirstruct->dp[dirstruct->pos++];
dirstruct->ret_dir64.d_ino = ds->d_ino;
dirstruct->ret_dir64.d_off = ds->d_off;
dirstruct->ret_dir64.d_reclen = ds->d_reclen;
dirstruct->ret_dir64.d_type = ds->d_type;
strncpy(dirstruct->ret_dir64.d_name, ds->d_name,
sizeof(dirstruct->ret_dir64.d_name));
return (&dirstruct->ret_dir64);
}
off_t telldir(DIR *dir)
{
struct dir_s *dirstruct = (struct dir_s *) dir;
if (dirstruct->dir)
return (*real_telldir)(dirstruct->dir);
return ((off_t) dirstruct->pos);
}
void seekdir(DIR *dir, off_t offset)
{
struct dir_s *dirstruct = (struct dir_s *) dir;
if (dirstruct->dir) {
(*real_seekdir)(dirstruct->dir, offset);
return;
}
dirstruct->pos = offset;
}