How to make a flexibly sized hash table

I would like to use the code below, which filters a large file. I am currently hard-coding the size of the hash table, assuming about 50 million rows of input, and I want the total row count to be roughly 37% of the table size. That currently holds, since 0x8000000 is 2^27 = 134,217,728 slots and 50 million / 134,217,728 ≈ 0.37. In practice, however, I won't know the size of the input before I start processing it. How can I change the code so that the hash table is automatically sized correctly? Speed also matters, since the whole point of the filtering is to save time.

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>

// Should be 37% occupied with 50m entries
#define TABLE_SIZE 0x8000000
#define MASK (TABLE_SIZE - 1)
#define BUFFER_SIZE 16384
#define END_OF_FILE (-1)
#define DEFAULT_VALUE (-1)

typedef struct Row {
  int32_t a;
  int32_t b;
  int32_t t;
} Row;

int32_t hash(int32_t a) {
  // Multiplicative hash; multiply as unsigned to avoid signed-overflow UB
  return (int32_t) ((uint32_t) a * 428916315u);
}

void insert(Row * table, Row row) {
  long loc = hash(row.a) & MASK; // Entries are hashed on a
  long inc = 0;
  // Triangular probing: offsets 0, 1, 3, 6, ... visit every slot of a
  // power-of-two table. If the table is somehow full, the row is dropped.
  while (inc <= TABLE_SIZE) {
    loc = (loc + inc) & MASK;
    inc++;
    if (table[loc].a == DEFAULT_VALUE) {
      table[loc] = row;
      break;
    }
  }
}

// Buffered reader: `pos` is the current index into the buffer and `limit`
// is the number of valid bytes in it. The cast to unsigned char keeps
// bytes >= 0x80 from colliding with END_OF_FILE.
int readChar(FILE * input, char * buffer, int * pos, int * limit) {
  if (*pos < *limit) {
    return (unsigned char) buffer[(*pos)++];
  } else {
    *pos = 0;
    *limit = fread(buffer, sizeof(char), BUFFER_SIZE, input);
    if (*pos < *limit) {
      return (unsigned char) buffer[(*pos)++];
    } else return END_OF_FILE;
  }
}

void readAll(char * fileName, Row * table) {
  char* buffer = (char*) malloc(sizeof(char) * BUFFER_SIZE);
  int limit = 0;
  int pos = 0;

  FILE * input = fopen(fileName, "rb");
  if (input == NULL) {
    perror(fileName);
    free(buffer);
    return;
  }

  int lastRead;
  Row currentRow = {0, 0, 0}; // zeroed so digit accumulation starts clean
  int32_t * currentElement = &(currentRow.a);

  // As with the Scala version, we read rows with an FSM. We can
  // roll up some of the code using the `currentElement` pointer
  while (1) {
    switch(lastRead = readChar(input, buffer, &pos, &limit)) {
      case END_OF_FILE:
        fclose(input);
        free(buffer);
        return;
      case ' ':
        if (currentElement == &(currentRow.a)) currentElement = &(currentRow.b);
        else currentElement = &(currentRow.t);
        break;
      case '\n':
        insert(table, currentRow);
        currentRow.a = 0;
        currentRow.b = 0;
        currentRow.t = 0;
        currentElement = &(currentRow.a);
        break;
      default:
        *currentElement = *currentElement * 10 + (lastRead - '0');
        break;
    }
  }
}

int main() {
  Row* table = (Row*) malloc(sizeof(Row) * TABLE_SIZE);
  if (table == NULL) return 1;
  // 0xFF in every byte makes each int32_t field read as -1 (DEFAULT_VALUE)
  memset(table, 255, sizeof(Row) * TABLE_SIZE);

  readAll("test.file", table);

  // We'll iterate through our hash table inline - passing a callback
  // is trickier in C than in Scala, so we just don't bother
  for (size_t i = 0; i < TABLE_SIZE; i++) {
    Row * this = table + i;
    if (this->a != DEFAULT_VALUE) {
      // Lookup entries `that`, where `that.a == this.b`
      long loc = hash(this->b) & MASK;
      long inc = 0;
      // Probe with the same triangular sequence that insert uses
      while (inc <= TABLE_SIZE) {
        loc = (loc + inc) & MASK;
        inc++;
        Row * that = table + loc;
        if ((this->b == that->a) && (0 <= that->t - this->t) && (that->t - this->t < 100)) {
          // Conditions are symmetric, so we output both rows
          printf("%d %d %d\n", this->a, this->b, this->t);
          printf("%d %d %d\n", that->a, that->b, that->t);
        }
        else if (that->a == DEFAULT_VALUE) break; // empty slot: end of this probe chain
      }
    }
  }

  free(table);
  return 0;
}

      

1 answer


Read the first, say, 100KB of the file and count the newlines in it. Extrapolate from that to estimate the total number of newlines in the whole file (finding the total file size is an O(1) operation, e.g. fseek to the end and ftell). If your input files are fairly regular, this gives a close enough approximation to size the hash table.
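
A minimal sketch of that idea, in the style of the code above (the names SAMPLE_BYTES, estimateRowCount and tableSizeFor are mine, not from the original program, and error handling is kept minimal):

#include <stdio.h>
#include <stdlib.h>

#define SAMPLE_BYTES (100 * 1024) // how much of the file to sample

// Count newlines in the first SAMPLE_BYTES of the file and extrapolate
// to the full file size. Returns 0 if nothing could be read.
long estimateRowCount(char * fileName) {
  FILE * input = fopen(fileName, "rb");
  if (input == NULL) return 0;

  char * sample = (char*) malloc(SAMPLE_BYTES);
  if (sample == NULL) { fclose(input); return 0; }
  size_t sampled = fread(sample, sizeof(char), SAMPLE_BYTES, input);

  long newlines = 0;
  for (size_t i = 0; i < sampled; i++)
    if (sample[i] == '\n') newlines++;
  free(sample);

  // Getting the total size is O(1): seek to the end and read the offset
  fseek(input, 0, SEEK_END);
  long totalBytes = ftell(input);
  fclose(input);

  if (sampled == 0) return 0;
  return (long) ((double) newlines / (double) sampled * (double) totalBytes);
}

// Smallest power of two that stays at or under ~37% occupancy, so the
// existing `& mask` trick still works with a runtime mask
size_t tableSizeFor(long rows) {
  size_t size = 1;
  while (0.37 * (double) size < (double) rows) size <<= 1;
  return size;
}

main would then allocate the table at runtime and thread the size and mask through insert and the lookup loop in place of the TABLE_SIZE and MASK macros:

long rows = estimateRowCount("test.file");
size_t tableSize = tableSizeFor(rows > 0 ? rows : 50000000); // fall back to the old assumption
size_t mask = tableSize - 1;
Row * table = (Row*) malloc(sizeof(Row) * tableSize);
memset(table, 255, sizeof(Row) * tableSize);

Sampling re-reads only a 100KB prefix, so the extra cost is negligible next to the main pass; if underestimating is a worry, pad the estimate by 10-20% before rounding up.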


