Nov 2017

How long does it take to linear search 200MB in memory?

and why aren't there any buffer-based log ingestion SaaS offerings?

Log services (Splunk, Loggly, Sumologic, PaperTrail, etc.) are deceptively expensive: most advertise 1GB of storage for ~$10/mo (which seems reasonable), but this also means that once you've pushed more than 1GB, the service stops accepting data until you pay more. In other words, they don't drop old data the way a buffer would; instead, they charge per GB ingested - the implication being that there are real costs to continuous ingestion.

But isn't transient log storage a popular use case, and if so, why don't any services optimize for it? When you're debugging a recent issue, you don't need an archive - you just need recent logs that you can search quickly. So if we merely appended incoming logs to a file on ingestion (cheap), could we load them into memory and linear search them quickly enough to be usable? Or even linear scan to build an on-demand full-text map (like grep)?

So, to find out whether a low-budget log ingestion service could get away with linear searching in lieu of indexing/processing, I benchmarked how quickly Ruby, Node.js, and Golang can linearly search a 200MB byte array for a string.

Ruby vs. Node.js vs. Golang

I created a 200MB file of random strings (no newlines) and appended a known SHA256 at the end.

The benchmarked times only measure the duration of iterating the array, not the duration of reading from disk (although that seemed to be negligible in all cases).
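For reference, the test file can be reproduced with something along these lines (a sketch of the setup, not necessarily the exact script I used): it writes ~200MB of random hex characters with no newlines and appends the target string at the end.

package main

import (
  "bufio"
  "math/rand"
  "os"
)

func main() {
  const hexChars = "0123456789ABCDEF" // arbitrary filler character set
  const size = 200 * 1024 * 1024      // ~200MB of random characters, no newlines

  f, err := os.Create("./200mb.txt")
  if err != nil {
    panic(err)
  }
  defer f.Close()

  w := bufio.NewWriter(f)
  for i := 0; i < size; i++ {
    w.WriteByte(hexChars[rand.Intn(len(hexChars))])
  }
  // append the known target so the benchmarks have a guaranteed hit near the end
  w.WriteString("07123E1F482356C415F684407A3B8723E10B2CBBC0B8FCD6282C49D37C9C1ABC")
  w.Flush()
}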

CRuby 2.4.2

16s

require 'benchmark'

contents = File.binread("./200mb.txt").each_byte.to_a
puts "File length: #{contents.size}"

find = "07123E1F482356C415F684407A3B8723E10B2CBBC0B8FCD6282C49D37C9C1ABC"
find = find.each_byte.to_a
found = 0

elapsed = Benchmark.realtime do
  (0...contents.size).each do |i|
    if contents[i] == find[found]
      found += 1
    else
      found = 0
    end

    if found == find.length
      puts "Found at idx #{i}"
      found = 0
    end
  end
end
puts "Scan took: #{elapsed.round(2)}"

Node.js 8.9.1

0.9s

const fs = require("fs");

const str = fs.readFileSync("./200mb.txt");
console.log("File length", str.length);

const find = Buffer.from(
  "07123E1F482356C415F684407A3B8723E10B2CBBC0B8FCD6282C49D37C9C1ABC"
);
let found = 0;

console.time("Scan took:");
for (var i = 0, len = str.length; i < len; i++) {
  if (str[i] === find[found]) {
    found += 1;
  } else {
    found = 0;
  }

  if (found == find.length) {
    console.log("Found at idx", i - find.length);
    found = 0;
  }
}
console.timeEnd("Scan took:");

Golang 1.9.2, single core

0.25s

package main

import (
  "fmt"
  "io/ioutil"
  "time"
)

func main() {
  b, err := ioutil.ReadFile("./200mb.txt")
  if err != nil {
    fmt.Print(err)
    return
  }

  fmt.Println("File length", len(b))

  found := 0
  find := "07123E1F482356C415F684407A3B8723E10B2CBBC0B8FCD6282C49D37C9C1ABC"
  start := time.Now()
  for n := 0; n < len(b); n++ {

    if (b[n]) == (find[found]) {
      found += 1
    } else {
      found = 0
    }

    if found == len(find) {
      fmt.Println("Found at idx", n-len(find))
      found = 0
    }
  }
  elapsed := time.Since(start)
  fmt.Println("Scan took", elapsed)
}
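As an aside, Go's standard library also ships an optimized substring search in bytes.Index, which additionally handles targets that overlap themselves (the hand-rolled reset-to-zero scan above can miss a match if a partial prefix match ends exactly where the real match begins). A sketch of the same scan using it, not part of the timings reported here:

package main

import (
  "bytes"
  "fmt"
  "io/ioutil"
  "time"
)

func main() {
  b, err := ioutil.ReadFile("./200mb.txt")
  if err != nil {
    fmt.Print(err)
    return
  }

  find := []byte("07123E1F482356C415F684407A3B8723E10B2CBBC0B8FCD6282C49D37C9C1ABC")

  start := time.Now()
  // report every occurrence, resuming the search just past each hit
  offset := 0
  for {
    idx := bytes.Index(b[offset:], find)
    if idx < 0 {
      break
    }
    fmt.Println("Found at idx", offset+idx)
    offset += idx + 1
  }
  fmt.Println("Scan took", time.Since(start))
}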

Golang 1.9.2, parallel (4 goroutines)

0.07s

package main

import (
  "fmt"
  "io/ioutil"
  "time"
)

func min(a, b int) int {
  if a < b {
    return a
  }
  return b
}

func read(b []byte, start int, end int, find []byte, done chan bool) {
  found := 0
  for n := start; n < end; n++ {

    if (b[n]) == (find[found]) {
      found += 1
    } else {
      found = 0
    }

    if found == len(find) {
      fmt.Println("Found at idx", n-len(find))
      found = 0
    }
  }
  done <- true
}

func main() {
  cores := 4
  done := make(chan bool, cores)
  b, _ := ioutil.ReadFile("./200mb.txt")
  find := []byte("07123E1F482356C415F684407A3B8723E10B2CBBC0B8FCD6282C49D37C9C1ABC")

  filelen := len(b)

  chunksize := filelen / cores

  // in case our target crosses 2 of our chunks
  overlap := len(find)

  // start the timer before launching the goroutines so the whole
  // parallel scan is measured, not just the wait on the channel
  start := time.Now()
  for i := 0; i < cores; i++ {
    chunkStart := i * chunksize
    end := min(chunkStart+chunksize+overlap, filelen)
    go read(b, chunkStart, end, find, done)
  }

  for i := 0; i < cores; i++ {
    <-done
  }
  elapsed := time.Since(start)
  fmt.Println("Scan took", elapsed)

}

I am surprised Ruby is this slow, so maybe the way I wrote it is suboptimal: iterating a 200-million-element Ruby array byte by byte is probably dominated by interpreter overhead, and a built-in like String#index (which does the scan in C) would likely fare much better. Node.js and Golang, on the other hand, were faster than I expected.

Conclusion: since Golang can linearly scan a byte array at about 1GB/sec on a single core, you can probably build a low-budget, buffer-based log service.
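To make that concrete, here is a minimal sketch of what the core of such a service could look like: a hypothetical /ingest endpoint that appends incoming lines to a single buffer file, and a hypothetical /search endpoint that reads the whole file back and linear scans it on demand. The endpoint names and file path are made up for illustration, and auth, retention, and multi-tenancy are omitted.

package main

import (
  "bytes"
  "fmt"
  "io/ioutil"
  "net/http"
  "os"
)

// logPath is a hypothetical single-tenant buffer file; a real service would
// rotate or truncate it to keep only recent data (the "buffer" part).
const logPath = "./buffer.log"

// ingest appends the request body to the buffer - one cheap append per request.
func ingest(w http.ResponseWriter, r *http.Request) {
  body, _ := ioutil.ReadAll(r.Body)
  f, err := os.OpenFile(logPath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
  if err != nil {
    http.Error(w, err.Error(), 500)
    return
  }
  defer f.Close()
  f.Write(body)
  f.Write([]byte("\n"))
}

// search loads the whole buffer into memory and linear scans it for ?q=...
func search(w http.ResponseWriter, r *http.Request) {
  q := []byte(r.URL.Query().Get("q"))
  if len(q) == 0 {
    http.Error(w, "missing q parameter", 400)
    return
  }
  b, err := ioutil.ReadFile(logPath)
  if err != nil {
    http.Error(w, err.Error(), 500)
    return
  }
  offset := 0
  for {
    idx := bytes.Index(b[offset:], q)
    if idx < 0 {
      break
    }
    fmt.Fprintln(w, "match at byte", offset+idx)
    offset += idx + len(q)
  }
}

func main() {
  http.HandleFunc("/ingest", ingest)
  http.HandleFunc("/search", search)
  http.ListenAndServe(":8080", nil)
}

At the single-core scan rate measured above, a query over a full 200MB buffer would still come back in well under a second.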
