From a34ddc5dd68c4530c91853507868684a9bc4b45e Mon Sep 17 00:00:00 2001
From: Jon Bergli Heier
Date: Wed, 2 Jun 2010 17:52:32 +0200
Subject: Added a minimum word length config option, "wordlen_min".

Also did some minor cleanup of the word add code in process_file.
---
 config.c  |  4 ++++
 config.h  |  2 +-
 parsing.c | 59 +++++++++++++++++++++++++++++++++++++----------------------
 3 files changed, 42 insertions(+), 23 deletions(-)

diff --git a/config.c b/config.c
index 0ae6beb..2cb6aa6 100644
--- a/config.c
+++ b/config.c
@@ -35,6 +35,10 @@ int cfg_init() {
 		ircstats_config.monolog_min = 5;
 	}
 
+	if(!config_lookup_int(&config, "wordlen_min", &ircstats_config.wordlen_min)) {
+		ircstats_config.wordlen_min = 3;
+	}
+
 	config_setting_t *regexes_setting = config_lookup(&config, "regexes");
 	if(!config_setting_is_aggregate(regexes_setting)) {
 		fprintf(stderr, "Setting \"regexes\" must be an aggregate type.\n");
diff --git a/config.h b/config.h
index 3b42ee8..775d7eb 100644
--- a/config.h
+++ b/config.h
@@ -5,7 +5,7 @@ int cfg_init();
 void cfg_free();
 
 struct ircstats_config_t {
-	long int threads, monolog_min;
+	long int threads, monolog_min, wordlen_min;
 };
 
 extern struct ircstats_config_t ircstats_config;
diff --git a/parsing.c b/parsing.c
index 1172cd1..4ef3fb7 100644
--- a/parsing.c
+++ b/parsing.c
@@ -21,6 +21,37 @@ static pthread_mutex_t user_mutex, word_mutex, channel_mutex;
 static struct user_t *last_user = NULL;
 static int in_monolog = 0, monolog_len = 0;
 
+/*
+ * Record one occurrence of `word` (its first `len` wide characters) for `user`:
+ * bumps the user's word counter and the word's global frequency counter.
+ * `word` is terminated in place at `len`. Words that cannot be converted to a
+ * multibyte string in the current locale are skipped without being counted.
+ */
+static inline void add_word(struct user_t *user, wchar_t *word, int len) {
+	word[len] = L'\0';
+
+	/* wcstombs() writes no terminator when the output fills up, so leave
+	 * room for one and terminate explicitly. */
+	char mbword[TEXT_BUFFER_SIZE];
+	size_t mblen = wcstombs(mbword, word, sizeof(mbword) - 1);
+	if(mblen == (size_t)-1) {
+		return;
+	}
+	mbword[mblen] = '\0';
+
+	pthread_mutex_lock(&user_mutex);
+	user->words++;
+	pthread_mutex_unlock(&user_mutex);
+
+	pthread_mutex_lock(&word_mutex);
+	struct word_t *word_s = word_get(mbword);
+	/* word_get() is defined elsewhere; guard in case it can return NULL. */
+	if(word_s) {
+		word_s->count++;
+	}
+	pthread_mutex_unlock(&word_mutex);
+}
+
 static void process_file(FILE *f, struct channel_t *channel, struct regexset_t *rs) {
 	char line[LINE_BUFFER_SIZE];
 
@@ -79,17 +110,9 @@ static void process_file(FILE *f, struct channel_t *channel, struct regexset_t *
 			int len = 0;
 			for(wchar_t *pos = wtext; pos < end; pos++) {
 				if(iswblank(*pos)) {
-					if(len) {
-						pthread_mutex_lock(&user_mutex);
-						user->words++;
-						pthread_mutex_unlock(&user_mutex);
-						word[len] = '\0';
-						char mbword[TEXT_BUFFER_SIZE];
-						wcstombs(mbword, word, TEXT_BUFFER_SIZE);
-						pthread_mutex_lock(&word_mutex);
-						struct word_t *word_s = word_get(mbword);
-						word_s->count++;
-						pthread_mutex_unlock(&word_mutex);
+					/* len > 0: never count an empty token, even if wordlen_min <= 0. */
+					if(len > 0 && len >= ircstats_config.wordlen_min) {
+						add_word(user, word, len);
 					}
 					len = 0;
 					*word = '\0';
@@ -100,17 +123,9 @@ static void process_file(FILE *f, struct channel_t *channel, struct regexset_t *
 					*word = '\0';
 				}
 			}
-			if(len) {
-				pthread_mutex_lock(&user_mutex);
-				user->words++;
-				pthread_mutex_unlock(&user_mutex);
-				word[len] = '\0';
-				char mbword[TEXT_BUFFER_SIZE];
-				wcstombs(mbword, word, TEXT_BUFFER_SIZE);
-				pthread_mutex_lock(&word_mutex);
-				struct word_t *word_s = word_get(mbword);
-				word_s->count++;
-				pthread_mutex_unlock(&word_mutex);
+			/* len > 0: never count an empty token, even if wordlen_min <= 0. */
+			if(len > 0 && len >= ircstats_config.wordlen_min) {
+				add_word(user, word, len);
 			}
 			continue;
 		}
-- 
cgit v1.2.3