Skip to content

Commit

Permalink
Release first version
Browse files Browse the repository at this point in the history
  • Loading branch information
corollari committed Jul 9, 2020
0 parents commit 9b03f9c
Show file tree
Hide file tree
Showing 20 changed files with 1,119,578 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
__pycache__
raw
data
node_modules
build
npm-debug.log
21 changes: 21 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2020 corollari, geoip-lite contributors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
204 changes: 204 additions & 0 deletions README.md

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions benchmarks/bench.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Runs perf-test.js 1000 times, exporting A=1..1000 to each run.
# perf-test.js reads A as the number of lookups to perform and prints the elapsed milliseconds.
# NOTE: the {1..1000} range is a bash brace expansion, so this script should be run with bash.
for i in {1..1000}
do
A=$i node perf-test.js
done
13 changes: 13 additions & 0 deletions benchmarks/perf-test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// Benchmark script: performs A lookups of random IPv4 addresses (A is read from the
// environment) and prints the total elapsed time in milliseconds.

// Taken before the require() below, so the measured time includes loading the module
const start = Date.now();
const lookup = require("../lookup");
(
async function(){
// process.env['A'] is a string; the `<` comparison coerces it to a number
for(let i=0; i<process.env['A']; i++){
// Build a random dotted-quad address, each octet drawn uniformly from 0..255
let ip = [0,0,0,0].map(()=>Math.floor(Math.random()*256)).join('.');
// Lookups are awaited one at a time, so they run sequentially
await lookup(ip);
}
// Elapsed wall-clock time in milliseconds since the script started
console.log(Date.now()-start)
//console.log(process.memoryUsage())
}
)()

Binary file added images/database-size.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/hdd-cold-perf.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/inode.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/memory.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/ssd-cold-perf.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
145 changes: 145 additions & 0 deletions index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
import fs = require('fs')
import utils = require('./utils')
const params = require('./params') as {
"LOCATION_RECORD_SIZE": number,
"NUMBER_NODES_PER_MIDINDEX": number
}

// Set to true by enableCache() once locations.json has been read; while false, readFile()
// neither consults nor fills ipCache and readLocationRecord() reads records from disk.
var cacheEnabled = false;
// Parsed contents of data files that have already been read, keyed by the path passed to readFile()
const ipCache:{[filename:string]:ipBlockRecord[]|indexFile} = {}
// Promise for the fully parsed locations.json; only assigned by enableCache()
var locationCache:Promise<locationRecord[]>;

/**
 * Switches the module to in-memory caching.
 *
 * Starts loading the whole `locations.json` file; the cache is only flagged as
 * enabled once that read has completed successfully. Calling this when the
 * cache is already enabled does nothing.
 */
function enableCache(){
  if(cacheEnabled){
    return;
  }
  locationCache = readFile<locationRecord[]>("locations.json").then((locations) => {
    // Flip the flag only after the data is available, so lookups made in the
    // meantime keep using the on-disk code path
    cacheEnabled = true;
    return locations;
  });
}

// Contents of an index file (index.json or i<N>.json): a list of numeric IPs that lookup4 binary-searches
type indexFile = number[]
// One row of a block file (<N>.json), as consumed by lookup4:
// [first IP of the range, index of its record in locations.json (or null), latitude, longitude, area]
type ipBlockRecord = [number, number|null, number, number, number]

/**
 * Reads and JSON-parses a file from the `data/` directory.
 *
 * When the cache is enabled, previously parsed files are served from `ipCache`
 * and newly parsed ones are stored in it.
 *
 * @param path - File name relative to `data/`.
 * @returns A promise for the parsed contents. It rejects if the file cannot be
 *          read, yields no data, or does not contain valid JSON.
 */
function readFile<format extends (indexFile|ipBlockRecord[]|locationRecord[])>(path:string):Promise<format>{
  if(cacheEnabled && ipCache[path] != undefined){
    return Promise.resolve(ipCache[path] as format);
  }
  return new Promise(function (resolve, reject){
    fs.readFile("data/"+path, function (err, data) {
      if(err){
        reject(err);
        return;
      }
      if(data==undefined){
        reject(new Error("No data could be read from data/"+path));
        return;
      }
      let content;
      try {
        // Parsing happens inside the fs callback, so an exception here would not
        // be turned into a rejection by the Promise constructor; catch it explicitly.
        content = JSON.parse(data.toString())
      } catch(parseError){
        reject(parseError);
        return;
      }
      if(cacheEnabled){
        ipCache[path] = content;
      }
      resolve(content)
    })
  })
}

/**
 * One row of locations.json, as consumed by lookup4:
 * [country, region, city, metro code, timezone, EU flag ("0" or "1")]
 */
type locationRecord = [string, string, string, number, string, "0" | "1"]

/**
 * Reads `length` bytes starting at byte `offset` from a file inside `data/`
 * and parses them as a single JSON value.
 *
 * @param path - File name relative to `data/`.
 * @param offset - Byte position where the read starts.
 * @param length - Number of bytes to read.
 * @returns A promise for the parsed record. It rejects if the file cannot be
 *          opened or read, or if the bytes read are not valid JSON.
 */
function readFileChunk(path:string, offset:number, length:number): Promise<locationRecord>{
  return new Promise(function (resolve, reject){
    fs.open("data/"+path, 'r', function (openErr, fd){
      if (openErr) {
        // Stop here: there is no valid descriptor to read from
        reject(openErr);
        return;
      }
      const buf = Buffer.alloc(length)
      fs.read(fd, buf, 0, length, offset, function(readErr, bytesRead, buffer){
        // The descriptor is no longer needed whatever the outcome of the read
        fs.close(fd, function(){})
        if(readErr) {
          reject(readErr);
          return;
        }
        try {
          // Only decode the bytes that were actually read
          resolve(JSON.parse(buffer.toString("utf8", 0, bytesRead)))
        } catch(parseErr){
          reject(parseErr)
        }
      })
    })
  })
}

/**
 * Fetches the location record stored at position `index` of locations.json.
 *
 * With the cache enabled the record is taken from the already-parsed file;
 * otherwise only the bytes of that one record are read from disk.
 *
 * @param index - Position of the record inside locations.json.
 * @returns A promise for the location record.
 */
function readLocationRecord(index:number):Promise<locationRecord>{
  if(!cacheEnabled){
    const recordSize = params.LOCATION_RECORD_SIZE
    // Offset is shifted by one byte and the length shortened by one byte relative
    // to a full LOCATION_RECORD_SIZE slot, so the chunk is a single JSON value
    return readFileChunk("locations.json", index*recordSize + 1, recordSize - 1 )
  }
  return locationCache.then((locations) => locations[index])
}

// Function that maps a record to its numeric key (used by getNextIp to obtain the IP a record starts at)
type extractKeyFunction<recordType> = (record:recordType)=>number

/**
 * Key extractor for block records: returns the first element of the record.
 *
 * @param item - A row of a block file.
 * @returns The first element of `item`.
 */
function firstArrayItem(item:ipBlockRecord):number{
  const [firstElement] = item
  return firstElement
}

/**
 * Returns the key of the record that follows `data[index]`, or `currentNextIp`
 * when `index` does not have a successor inside `data`.
 *
 * @param data - Records being searched.
 * @param index - Position of the current record in `data`.
 * @param currentNextIp - Value to keep when there is no following record.
 * @param extractKey - Maps a record to its numeric key.
 * @returns The key of the following record, or `currentNextIp`.
 */
function getNextIp<recordType>(data:recordType[], index: number, currentNextIp:number, extractKey:extractKeyFunction<recordType>):number{
  const hasSuccessor = index < (data.length - 1)
  return hasSuccessor ? extractKey(data[index+1]) : currentNextIp
}

// Result returned by lookup4 for an IP that was found in the database
interface ipInfo{
// [first IP of the matching block, first IP of the following block], both as numbers
range: [ number, number ];
// Element 0 of the location record
country: string;
// Element 1 of the location record
region: string;
// Element 5 of the location record ("0" or "1")
eu: "0"|"1";
// Element 4 of the location record
timezone: string;
// Element 2 of the location record
city: string;
// Elements 2 and 3 of the block record
ll: [ number, number ];
// Element 3 of the location record
metro: number;
// Element 4 of the block record
area: number;
}

/**
 * Looks up the geographical information associated with an IPv4 address.
 *
 * The search walks three levels of files: the root index (`index.json`), one
 * mid-level index (`i<root>.json`) and one block file (`<n>.json`); the
 * location details are then read from `locations.json`. At every level the
 * start of the following entry, when there is one, is kept as the upper bound
 * of the returned range.
 *
 * @param stringifiedIp - IPv4 address in dotted notation.
 * @returns A promise for the information found, or `null` when the IP is not
 *          in the database, has no location associated, or any step fails.
 */
function lookup4(stringifiedIp:string): Promise<ipInfo|null> {
  const ip = utils.ipStr2Num(stringifiedIp)
  var rootIndex:number;
  var ipData: ipBlockRecord;
  var nextIp:number = utils.ipStr2Num("255.255.255.255");
  return readFile<indexFile>("index.json")
    .then(function(data){
      rootIndex = utils.binarySearch(data, ip, utils.identity)
      if (rootIndex == -1){
        // Ip is not in the database; the catch at the end turns this into null
        throw new Error("IP not found in the database")
      }
      nextIp = getNextIp<number>(data, rootIndex, nextIp, utils.identity)
      return readFile<indexFile>("i"+rootIndex+".json")
    })
    .then(function(data){
      // Position inside this mid-level index file
      const localIndex = utils.binarySearch(data, ip, utils.identity)
      // The successor must be looked up with the local position: `data` only holds
      // this mid-level file's entries, so the global block number is out of its bounds
      nextIp = getNextIp<number>(data, localIndex, nextIp, utils.identity)
      // Global number of the block file that contains the IP
      const blockFileNumber = localIndex + rootIndex*params.NUMBER_NODES_PER_MIDINDEX
      return readFile<ipBlockRecord[]>(blockFileNumber+".json")
    }).then(function(data){
      const index = utils.binarySearch(data, ip, firstArrayItem)
      ipData = data[index]
      if(ipData[1] == null){
        throw new Error("IP doesn't any region nor country associated")
      }
      nextIp = getNextIp<ipBlockRecord>(data, index, nextIp, firstArrayItem)
      return readLocationRecord(ipData[1])
    }).then(function(data){
      return {
        range: [ ipData[0], nextIp ] as [number, number],
        country: data[0],
        region: data[1],
        eu: data[5],
        timezone: data[4],
        city: data[2],
        ll: [ ipData[2], ipData[3] ] as [number, number],
        metro: data[3],
        area: ipData[4]
      }
    }).catch(function(){
      // Any failure (IP not found, missing location, I/O or parse error) is reported as null
      return null;
    })
}

// Public API of the module: `lookup` resolves an IPv4 string to its ipInfo (or null),
// `enableCache` turns on in-memory caching of the data files
export = {
lookup: lookup4,
enableCache
}
20 changes: 20 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

35 changes: 35 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{
"name": "fast-geoip",
"version": "1.0.0",
"description": "A faster & low-memory replacement for geoip-lite, a node library that maps IPs to geographical information",
"directories": {
"test": "tests"
},
"scripts": {
"build": "tsc",
"test": "node tests/lookup.test.js"
},
"keywords": [
"geoip",
"geolocation",
"ip"
],
"repository": {
"type": "git",
"url": "git+https://github.com/corollari/fast-geoip.git"
},
"author": "corollari",
"license": "MIT",
"files": [
"data/*",
"build/index.js",
"build/index.d.ts",
"build/utils.js"
],
"main": "build/index.js",
"types": "build/index.d.ts",
"devDependencies": {
"@types/node": "^14.0.20",
"typescript": "^3.9.6"
}
}
130 changes: 130 additions & 0 deletions processGeoIpCsv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import csv, os, json, shutil
from math import sqrt, floor, ceil

# Directory from which the GeoLite2 CSV files are read
RAW_DATABASE_DIR = "raw"
# Directory into which the generated files are written (wiped at the start of every run)
DATA_DIR = "data"
BLOCK_SIZE = 2**12 # = 4KB, the storage block size on almost all new OSes
FILE_SIZE = BLOCK_SIZE*12 - 100 # File size is made to be lower than the size of 12 storage blocks (minus a few bytes to account for overheads) in order to make sure that all the file's contents are directly addressed from the file's inode (preventing indirect access to storage blocks)

def removeOldData():
    """Remove the output directory left by a previous run and create it again, empty."""
    # Failures while deleting (for instance, the directory not existing yet) are ignored
    shutil.rmtree(path=DATA_DIR, ignore_errors=True)
    os.mkdir(DATA_DIR)

def jsonify(item):
    """Serialize `item` as JSON and return the result as UTF-8 encoded bytes."""
    serialized = json.dumps(item)
    return bytes(serialized, 'utf-8')

def storeFile(filename, content, binary = False):
    """Write `content` to `filename` inside the output directory.

    filename -- name of the file, relative to DATA_DIR
    content  -- data to write (bytes when `binary` is true, str otherwise)
    binary   -- whether the file is opened in binary mode
    """
    mode = "wb" if binary else "w"
    destination = os.path.join(DATA_DIR, filename)
    with open(destination, mode) as newFile:
        newFile.write(content)

def parseNumber(num, parser):
    """Convert the string `num` with `parser`, treating the empty string as 0."""
    if num == "":
        return 0
    return parser(num)

def extract_location_attrs(row):
    """Pick, from a row of the locations CSV, the fields that are stored in locations.json.

    Returns [country, region, city, metro code, time zone, EU flag].
    """
    # The only data from the locations file that is returned by node-geoip is:
    #  - country_iso_code (column 4)
    #  - subdivision_1_iso_code (column 6)
    #  - city_name (column 10)
    #  - metro_code (column 11), stored as an integer (0 when empty)
    #  - time_zone (column 12)
    #  - is_in_european_union (column 13)
    country, region, city = row[4], row[6], row[10]
    metro_code = parseNumber(row[11], int)
    time_zone, in_eu = row[12], row[13]
    return [country, region, city, metro_code, time_zone, in_eu]

def generateLocationsFile():
    """Build data/locations.json from the GeoLite2 locations CSV.

    Every location is stored as a JSON array left-padded with spaces to a common
    length, so that any record can be read from the file knowing only its position.

    Returns [geoname_ids, record_size] where `geoname_ids` maps each geoname id
    (first CSV column) to the position of its record in locations.json, and
    `record_size` is the padded record length plus one byte for its separator.
    """
    geoname_ids = {}
    location_items = []
    locations_path = os.path.join(RAW_DATABASE_DIR, "GeoLite2-City-Locations-en.csv")
    # Explicit UTF-8 so that non-ASCII names are read the same on every platform;
    # newline="" is what the csv module expects for the files it reads
    with open(locations_path, encoding="utf-8", newline="") as locations_file:
        locations = csv.reader(locations_file, delimiter=',')
        next(locations) # Ignore first line (headers)
        for position, row in enumerate(locations):
            geoname_ids[row[0]] = position
            location_items.append(jsonify(extract_location_attrs(row)))

    max_item_length = max(map(len, location_items), default=0)
    padded_items = (item.rjust(max_item_length, b' ') for item in location_items)
    new_location_file_content = b'[' + b','.join(padded_items) + b']' # Make it into a json even if it will not be used that way
    storeFile("locations.json", new_location_file_content, True)

    return [geoname_ids, max_item_length+1]

def extract_block_attrs(row, geoname_ids):
    """Pick, from a row of the blocks CSV, the fields that are stored in the block files.

    row         -- a row of the blocks CSV
    geoname_ids -- mapping from geoname id to record position in locations.json

    Returns [location position or None, latitude, longitude, accuracy radius].
    """
    # Attrs used by node-geoip:
    #  - range (will be derived from the ip being searched) [0]
    #  - geoname_id (needs to be transformed to match the ids generated before for the locations file) [1]
    #  - latitude [7]
    #  - longitude [8]
    #  - accuracy_radius [9]
    # Explicit membership test instead of a bare `except`, which would also hide
    # unrelated errors (and even KeyboardInterrupt)
    if row[1] in geoname_ids:
        locations_id = geoname_ids[row[1]]
    else:
        # Fall back to the id in column 2; None when that one is unknown too
        locations_id = geoname_ids.get(row[2], None)
    return [ locations_id, parseNumber(row[7], float), parseNumber(row[8], float), parseNumber(row[9], int)]

def storeIps(ips, counter, ipIndex):
    """Write the accumulated block records to `<counter>.json` and index their first IP.

    ips     -- bytes of an unterminated JSON array ending in a trailing comma
    counter -- number used as the name of the file
    ipIndex -- list to which the first IP of the file is appended
    """
    closed_array = ips[:-1] + b']' # Remove the trailing comma and add ]
    first_record = json.loads(closed_array)[0]
    ipIndex.append(first_record[0]) # Store the first IP of the set into the index
    storeFile(f"{counter}.json", closed_array, True)

def ipStr2Int(strIp):
    """Convert a dotted IPv4 string into its integer representation."""
    octets = list(map(int, strIp.split('.')))
    first, second, third, fourth = octets[0], octets[1], octets[2], octets[3]
    # Horner form of first*256**3 + second*256**2 + third*256 + fourth
    return ((first*256 + second)*256 + third)*256 + fourth

def generateBlockFiles(geoname_ids):
    """Split the GeoLite2 IPv4 blocks CSV into numbered JSON files of at most FILE_SIZE bytes.

    geoname_ids -- mapping from geoname id to record position in locations.json

    Returns the list with the first IP stored in each generated file, in file order.
    """
    counter = 0
    ips = b'['
    ipIndex = []
    blocks_path = os.path.join(RAW_DATABASE_DIR, "GeoLite2-City-Blocks-IPv4.csv")
    # newline="" is what the csv module expects for the files it reads
    with open(blocks_path, encoding="utf-8", newline="") as blocks_file:
        blocks = csv.reader(blocks_file, delimiter=',')
        next(blocks) # Skip headers
        for row in blocks:
            # Column 0 is "<address>/<mask>"; only the address is stored
            ip = ipStr2Int(row[0].split('/')[0])
            attrs = jsonify([ip] + extract_block_attrs(row, geoname_ids)) + b','
            # Compare lengths instead of building the concatenation just to measure it
            if len(ips) + len(attrs) > FILE_SIZE:
                storeIps(ips, counter, ipIndex)
                counter += 1
                ips = b'[' + attrs
            else:
                ips += attrs

    # Flush the records accumulated after the last full file
    storeIps(ips, counter, ipIndex)
    return ipIndex

def generateIndexes(ipIndex):
    """Write the two index levels that sit on top of the block files.

    `ipIndex` is cut into consecutive chunks, each stored as a mid-level index
    (`i<N>.json`); the first IP of every chunk goes into the root index
    (`index.json`).

    Returns the number of entries per mid-level index.
    """
    total_files = len(ipIndex)
    ROOT_NODES = floor(sqrt(total_files)) # See readme for the rationale behind this formula
    MID_NODES = ceil(total_files/ROOT_NODES)
    rootIpIndex = []
    for i in range(ROOT_NODES):
        chunk_start = i*MID_NODES
        chunk = ipIndex[chunk_start:chunk_start + MID_NODES]
        rootIpIndex.append(ipIndex[chunk_start])
        storeFile(f"i{i}.json", json.dumps(chunk))

    storeFile("index.json", json.dumps(rootIpIndex))
    return MID_NODES

def storeDynamicParams(location_record_length, num_mid_nodes):
    """Write params.js, a module exporting the values computed during generation.

    location_record_length -- exported as LOCATION_RECORD_SIZE
    num_mid_nodes          -- exported as NUMBER_NODES_PER_MIDINDEX
    """
    params = {
        "LOCATION_RECORD_SIZE": location_record_length,
        "NUMBER_NODES_PER_MIDINDEX": num_mid_nodes
    }
    with open("params.js", "w") as params_file:
        pretty_params = json.dumps(params, indent=4) # Pretty-printed json
        params_file.write(f"module.exports = {pretty_params}")

def main():
    """Regenerate every data file and params.js from the raw CSV files."""
    removeOldData()
    geoname_ids, location_record_length = generateLocationsFile()
    block_first_ips = generateBlockFiles(geoname_ids)
    num_mid_nodes = generateIndexes(block_first_ips)
    storeDynamicParams(location_record_length, num_mid_nodes)

if __name__ == '__main__':
    main()
Loading

0 comments on commit 9b03f9c

Please sign in to comment.