The truth is rarely pure and never simple

Fast parsing of XYZ files

There is a quite simple file format in order to store atom positions. The text based format can hold single conformations or multiple frames in order to keep track of conformational changes.

Although parsing of text based files is considered an easy task, speed might get important if you scan over very large files. For an argument I had today, I came up with two implementations of the very same task: parse XYZ files with multiple frames and store them as an array of doubles.

Using a test file containing 30000 random conformations of a cluster with 150 atoms, the C version loads about 5000 frames per second, whereas the python implementation handles 1575 frames per second. As the input data amounted to about 125 MB, the computational resources limited the throughput. For sake of completeness, here’s the code:

#include <stdio.h> // C++: use <cstdio>
#include <stdlib.h> // C++: use <cstdlib>

int read_frame(FILE * f, double * target) {
    int i, length = 0;
    char label = 'C';
    char buffer[100];
    fscanf(f, "%d", &length);
    if (feof(f))
        return 1;
    label = fgetc(f);
    label = fgetc(f);
    if (label != '\n') {
        fseek(f, -1, SEEK_CUR);
        fgets(buffer, 100, f);
    }
    for (i = 0; i < length; i++) {
        fscanf(f, "%c %lf %lf %lf\n", &label, target+i*3, target+i*3+1, target+i*3+2);
        //printf("%f %f %f\n", target[i*3], target[i*3+1], target[i*3+2]);
    }
    return 0;
}

int main(void) {
    int length = 0;
    FILE * f = fopen("data.xyz", "r");
    fscanf(f, "%d", &length);
    rewind(f);
    double * target = calloc(length*3, sizeof(double));
    while (!read_frame(f, target)) {
        //printf ("%f %f\n", target[0], target[length*3-1]);
    } 
    free(target);
    return 0;
}
#!/usr/bin/env python
class xzyfile(object):
    def __init__(self, filename):
        self.fhandle = open(filename, 'r')
        self.length = int(self.fhandle.readline())
        self.data = [0.0]*3*self.length
        self.load_next_frame(is_first = True)
    
    def load_next_frame(self, is_first = False):
        exists = self.fhandle.readline()
        if len(exists) == 0:
            return False
        if not is_first:
            self.fhandle.readline()

        for i in range(self.length):
            parts = self.fhandle.readline().strip().split(' ')
            self.data[i*3] = float(parts[1])
            self.data[i*3+1] = float(parts[2])
            self.data[i*3+2] = float(parts[3])
        return True

x = xzyfile('data.xyz')
while x.load_next_frame():
    pass

Leave a comment

Your email address will not be published.