Skip to content

Commit

Permalink
Merge pull request #126 from realratchet/master
Browse files Browse the repository at this point in the history
Added column selector
  • Loading branch information
realratchet authored Jan 25, 2024
2 parents 97af15a + 3523315 commit d8e4662
Show file tree
Hide file tree
Showing 55 changed files with 4,965 additions and 538 deletions.
12 changes: 9 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,14 @@
*.pyc
tests/new.zip
__pycache__
*/_nimlite/nimlite
*/_nimlite/*.so
*/_nimlite/*.pyd
*/_nimlite/*
!*/_nimlite/**.nim
!*/_nimlite/funcs/
!*/_nimlite/includes/
!*/_nimlite/includes/**.nim
!*/_nimlite/funcs/column_selector/**.nim
!*/_nimlite/**.py
!*/_nimlite/**.pyi

# Notebook checkpoints
.ipynb_checkpoints/
Expand All @@ -27,3 +32,4 @@ site/

# local confidential data
tests/ndata/*
tests/data/pages/*.dis
2 changes: 2 additions & 0 deletions build_nim.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ fi
# Compile tablite/_nimlite/nimlite.nim into a shared library; the value of
# $is_release (set earlier in the script) selects the compiler flags.
if [ $is_release = true ]
then
# Release profile: -d:release -d:danger.
nim c --app:lib -d:release -d:danger --out:tablite/_nimlite/nimlite.so tablite/_nimlite/nimlite.nim
echo "Built release."
else
# Debug profile: -d:debug, written to the same output path as the release build.
nim c --app:lib -d:debug --out:tablite/_nimlite/nimlite.so tablite/_nimlite/nimlite.nim
echo "Built debug."
fi
128 changes: 128 additions & 0 deletions tablite/_nimlite/dateutils.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
import std/times
from std/math import splitDecimal
from utils import divmod, extractUnit

const DAYS_PER_MONTH_TABLE* = [
  # Row 0: month lengths of a common (non-leap) year, January first.
  [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31],
  # Row 1: month lengths of a leap year, January first.
  [31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
]

type
  YearRange* = range[1..9999]
  MicrosecondRange* = range[0..999_999]

proc isLeapYear*(year: int): bool {.inline.} =
  ## Returns true when `year` is divisible by 4 and is either not divisible
  ## by 100 or is divisible by 400.
  result =
    if year mod 4 != 0: false
    elif year mod 100 != 0: true
    else: year mod 400 == 0

proc getDaysInMonth*(year, month: int): int {.inline.} =
  ## Looks up the length of `month` (1-based) in `year`, picking the leap or
  ## common row of `DAYS_PER_MONTH_TABLE` according to `isLeapYear(year)`.
  let row = ord(isLeapYear(year))
  let column = month - 1
  result = DAYS_PER_MONTH_TABLE[row][column]

proc toTimedelta*(
  weeks = 0, days = 0, hours = 0, minutes = 0, seconds = 0, milliseconds = 0, microseconds: int = 0
): (int, int, int) {.inline.} =
  ## Collapses a mix of time units into a `(days, seconds, microseconds)`
  ## triple.
  ##
  ## Weeks fold into days, hours and minutes fold into seconds, and
  ## milliseconds fold into microseconds; overflow is then carried upwards
  ## with `divmod`. The sign and range of the returned seconds and
  ## microseconds follow whatever `utils.divmod` yields for its remainder
  ## (presumably floor semantics - confirm against `utils`).
  const
    secondsPerDay = 24*3600
    microsPerSecond = 1000000

  # Coarse units first: whole days and the raw second/microsecond totals.
  var dayTotal = days + weeks*7
  let rawSeconds = seconds + (minutes*60 + hours*3600)
  let rawMicros = microseconds + milliseconds*1000

  # Carry whole days out of the seconds total.
  let (daysFromSeconds, secondsLeft) = divmod(rawSeconds, secondsPerDay)
  dayTotal += daysFromSeconds
  var secondTotal = secondsLeft

  # Carry whole seconds (and possibly whole days) out of the microseconds.
  let (secondsFromMicros, microsLeft) = divmod(rawMicros, microsPerSecond)
  let (daysFromMicros, extraSeconds) = divmod(secondsFromMicros, secondsPerDay)
  dayTotal += daysFromMicros
  secondTotal += extraSeconds

  # Final carry pass: the two partial second counts may add up to a full day.
  let (carriedSeconds, us) = divmod(microsLeft, microsPerSecond)
  secondTotal += carriedSeconds
  let (carriedDays, s) = divmod(secondTotal, secondsPerDay)
  dayTotal += carriedDays

  return (dayTotal, s, us)

proc days2YearsDays(days: int): (int, int) =
  ## Splits a day count into `(year, dayWithinYear)`.
  ##
  ## The input is first shifted by `365*30 + 7` days so that it is relative
  ## to the year 2000 (presumably the input counts days from 1970-01-01 -
  ## confirm with callers), then reduced through 400-, 100-, 4- and 1-year
  ## cycles. The returned day index is zero-based.
  const
    daysPer400Years = 400*365 + 100 - 4 + 1
    daysPer100Years = 100*365 + 25 - 1
    daysPer4Years = 4*365 + 1

  # Re-base onto year 2000 (divisible by 400) and peel off whole 400-year cycles.
  var (year, rest) = extractUnit(days - (365*30 + 7), daysPer400Years)
  year *= 400

  # Locate the year inside the 400-year cycle. The first year of the cycle is
  # a leap year (366 days), hence the +/-1 adjustments below.
  if rest >= 366:
    year += 100 * ((rest - 1) div daysPer100Years)
    rest = (rest - 1) mod daysPer100Years
    if rest >= 365:
      year += 4 * ((rest + 1) div daysPer4Years)
      rest = (rest + 1) mod daysPer4Years
      if rest >= 366:
        year += (rest - 1) div 365
        rest = (rest - 1) mod 365

  return (year + 2000, rest)

proc days2Components(days: int): (int, Month, MonthdayRange) =
  ## Converts a day count into `(year, month, day-of-month)` by resolving the
  ## year with `days2YearsDays` and then walking that year's month lengths.
  ##
  ## Raises `IndexDefect` if the day index does not fit in any of the twelve
  ## months.
  let (year, dayOfYear) = days2YearsDays(days)
  let lengths = DAYS_PER_MONTH_TABLE[int isLeapYear(year)]
  var remaining = dayOfYear

  for monthIndex in 0..11:
    let length = lengths[monthIndex]
    if remaining >= length:
      # Not in this month yet: consume it and keep walking.
      remaining -= length
      continue
    return (year, Month(monthIndex + 1), MonthdayRange(remaining + 1))

  raise newException(IndexDefect, "failed")

proc days2Date*(days: int): DateTime =
  ## Builds a UTC `DateTime` (time of day left at its default) from a day
  ## count, using `days2Components` for the calendar breakdown.
  let parts = days2Components(days)
  result = dateTime(parts[0], parts[1], parts[2], zone = utc())

proc delta2Date*(
  weeks = 0, days = 0, hours = 0, minutes = 0, seconds = 0, milliseconds = 0, microseconds: int = 0
): DateTime =
  ## Normalises the given units with `toTimedelta`, turns the day part into a
  ## UTC date via `days2Date` and adds the leftover seconds and microseconds
  ## as a `Duration`.
  let (dayCount, secs, micros) = toTimedelta(weeks, days, hours, minutes, seconds, milliseconds, microseconds)
  let remainder = initDuration(seconds = secs, microseconds = micros)
  result = days2Date(dayCount) + remainder

proc date2NimDateTime*(year: int, month: int, day: int): DateTime {.inline.} =
  ## Creates a UTC `DateTime` for the given calendar date; `month` is the
  ## 1-based month number and `day` the day of the month.
  let
    monthValue = Month(month)
    dayValue = MonthdayRange(day)
  result = dateTime(year, monthValue, dayValue, zone = utc())

proc datetime2NimDatetime*(year: int, month: int, day: int, hour: int, minute: int, second: int, microsecond: int): DateTime {.inline.} =
  ## Creates a UTC `DateTime` from individual calendar and clock fields.
  ##
  ## `month` is the 1-based month number and `day` the day of the month.
  ## `microsecond` is multiplied by 1000 because `dateTime` takes its
  ## sub-second argument in nanoseconds.
  ##
  ## Every clock field is forwarded in `dateTime`'s positional order
  ## (hour, minute, second, nanosecond); leaving `minute` out would shift
  ## `second` into the minute slot and the nanoseconds into the second slot.
  return dateTime(
    year, Month(month), MonthdayRange(day),
    hour, minute, second, microsecond * 1000,
    zone = utc()
  )

proc time2NimDuration*(hour: int, minute: int, second: int, microsecond: int): Duration {.inline.} =
  ## Expresses a time of day as the `Duration` made of the given hours,
  ## minutes, seconds and microseconds.
  result = initDuration(
    microseconds = microsecond,
    seconds = second,
    minutes = minute,
    hours = hour
  )

proc duration2Time*(self: Duration): Time {.inline.} =
  ## Adds the duration to a default-initialised `Time` and returns the result.
  let origin = Time()
  result = origin + self

proc time2Duration*(self: Time): Duration {.inline.} =
  ## Inverse of `duration2Time`: the span between a default-initialised
  ## `Time` and `self`.
  let origin = Time()
  result = self - origin
proc seconds2Duration*(seconds: float): Duration {.inline.} =
  ## Converts a floating-point number of seconds into a `Duration` with
  ## microsecond resolution.
  ##
  ## The fractional part is rounded to the nearest microsecond (half away
  ## from zero, via `system.toInt`) rather than truncated. Values such as
  ## `2.3` are stored slightly below their decimal value in binary floating
  ## point, so truncation would yield 299999 microseconds instead of 300000.
  let (secs, frac) = seconds.splitDecimal()
  # A fraction that rounds up to a full 1_000_000 microseconds is fine:
  # initDuration normalises the overflow into the seconds component.
  let us = toInt(frac * 1_000_000)
  return initDuration(seconds = int secs, microseconds = us)

proc duration2Seconds*(dur: Duration): float {.inline.} =
  ## Returns the duration as fractional seconds, derived from its whole
  ## microsecond count.
  let micros = dur.inMicroseconds
  result = micros / 1_000_000

proc duration2Date*(dur: Duration): DateTime {.inline.} =
  ## Adds `dur` to 1970-01-01 (UTC) and returns the resulting `DateTime`.
  let epoch = dateTime(1970, mJan, 1, zone = utc())
  result = epoch + dur
proc seconds2Date*(seconds: float): DateTime {.inline.} =
  ## Chains `seconds2Duration` and `duration2Date`: the float second count is
  ## turned into a `Duration`, which is then added to 1970-01-01 UTC.
  let span = seconds2Duration(seconds)
  result = duration2Date(span)
proc datetime2Date*(self: DateTime): DateTime {.inline.} =
  ## Returns a new UTC `DateTime` carrying only the year, month and day of
  ## `self`; the clock fields are left at `dateTime`'s defaults.
  let (y, m, d) = (self.year, self.month, self.monthday)
  result = dateTime(y, m, d, zone = utc())
163 changes: 163 additions & 0 deletions tablite/_nimlite/funcs/column_selector.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
import column_selector/sliceconv
import column_selector/infos
import column_selector/collectinfo

export ColInfo
export toPyObj
export collectColumnSelectInfo
export doSliceConvert
export fromPyObjToDesiredInfos

when isMainModule and appType != "lib":

import std/[os, tables, sugar, sets, sequtils, paths, macros]
import nimpy as nimpy
from ../nimpyext import `!`
import std/options as opt
import ../pymodules as pymodules
import ../numpy
import typetraits

proc columnSelect(table: nimpy.PyObject, cols: nimpy.PyObject, tqdm: nimpy.PyObject, dir_pid: Path, TaskManager: nimpy.PyObject): (nimpy.PyObject, nimpy.PyObject) =
# Runs a column selection over `table` and returns a `(passed, failed)` pair of tablite tables.
# `cols` is forwarded to collectColumnSelectInfo, `tqdm` is called to create the progress bar,
# and `dir_pid` is passed both to collectColumnSelectInfo (as a string) and to doSliceConvert.
# `TaskManager` is accepted but not referenced in this body; pages are converted one by one in a plain loop.
# this is nim-only implementation, the library build doesn't need it because we need TaskManager to be used for slices
var (columns, page_count, is_correct_type, desired_column_map, passed_column_data, failed_column_data, res_cols_pass, res_cols_fail, column_names, reject_reason_name) = collectColumnSelectInfo(table, cols, string dir_pid)

# Fast path: every entry of is_correct_type is true, so doSliceConvert is skipped entirely.
if toSeq(is_correct_type.values).all(proc (x: bool): bool = x):
# The pass table maps each desired name onto the existing column it was selected from.
let tbl_pass_columns = collect(initTable()):
for (desired_name, desired_info) in desired_column_map.pairs():
{desired_name: table[desired_info.original_name]}

# The fail table gets the failed column names, each with an empty sequence as its data.
let tbl_fail_columns = collect(initTable()):
for desired_name in failed_column_data:
{desired_name: newSeq[nimpy.PyObject]()}

let tbl_pass = tablite().Table(columns = tbl_pass_columns)
let tbl_fail = tablite().Table(columns = tbl_fail_columns)

return (tbl_pass, tbl_fail)

# Builds a Python dict with one empty-sequence entry per key, inserted in the order of `keys`.
template ordered2PyDict(keys: seq[string]): nimpy.PyObject =
let dict = pymodules.builtins().dict()

for k in keys:
dict[k] = newSeq[nimpy.PyObject]()

dict

# Slow path: start from empty pass/fail tables and fill them with converted pages below.
var tbl_pass = tablite().Table(columns = passed_column_data.ordered2PyDict())
var tbl_fail = tablite().Table(columns = failed_column_data.ordered2PyDict())

# One task per page index: the i-th entry of every column plus the i-th pass/fail result entries.
var task_list_inp = collect:
for i in 0..<page_count:
let el = collect(initTable()):
for (name, column) in columns.pairs:
{name: column[i]}
(el, res_cols_pass[i], res_cols_fail[i])

var page_size = tabliteConfig().Config.PAGE_SIZE.to(int)
var pbar = tqdm!(total: task_list_inp.len, desc: "column select")
var converted = newSeqOfCap[(seq[(string, nimpy.PyObject)], seq[(string, nimpy.PyObject)])](task_list_inp.len)

# Convert the tasks sequentially, advancing the progress bar once per task.
for (columns, res_pass, res_fail) in task_list_inp:
converted.add(doSliceConvert(dir_pid, page_size, columns, reject_reason_name, res_pass, res_fail, desired_column_map, column_names, is_correct_type))

discard pbar.update(1)

# Appends each (column name, page) pair to the pages of the matching column in `table`.
proc extendTable(table: var nimpy.PyObject, columns: seq[(string, nimpy.PyObject)]): void {.inline.} =
for (col_name, pg) in columns:
let col = table[col_name]

discard col.pages.append(pg) # can't col.extend because nim is dumb :)

# Attach the converted pages in task order to the pass and fail tables.
for (pg_pass, pg_fail) in converted:
tbl_pass.extendTable(pg_pass)
tbl_fail.extendTable(pg_fail)

return (tbl_pass, tbl_fail)

proc newColumnSelectorInfo(column: string, `type`: string, allow_empty: bool, rename: opt.Option[string]): nimpy.PyObject =
# Builds the Python dict describing one column selection, with the keys
# "column", "type", "allow_empty" and "rename".
let pyDict = builtins().dict(
column = column,
type = `type`,
allow_empty = allow_empty
)

# "rename" is always present: nil when no new name was supplied, otherwise the new name.
if rename.isNone():
pyDict["rename"] = nil
else:
pyDict["rename"] = rename.get()

return pyDict

# Demo driver (only compiled when this file is the main module and not a library build).
# Resolve tablite's configured working directory and create <workdir>/nim/pages.
let workdir = Path(pymodules.builtins().str(pymodules.tabliteConfig().Config.workdir).to(string))
let pid = "nim"
let pagedir = workdir / Path(pid) / Path("pages")

createDir(string pagedir)

# Point tablite's Config.pid at the same "nim" name used for the directory above.
pymodules.tabliteConfig().Config.pid = pid
# pymodules.tabliteConfig().Config.PAGE_SIZE = 2
# pymodules.tabliteConfig().Config.MULTIPROCESSING_MODE = pymodules.tabliteConfig().Config.FALSE

# Sample inputs: exactly one `columns` definition is active, the rest are kept as alternative fixtures.
# let columns = pymodules.builtins().dict({"A ": @[nimValueToPy(0), nimValueToPy(nil), nimValueToPy(10), nimValueToPy(200)]}.toTable)
# let columns = pymodules.builtins().dict({"A ": @[1, 22, 333]}.toTable)
# let columns = pymodules.builtins().dict({"A ": @["1", "22", "333", "", "abc"]}.toTable)
# let columns = pymodules.builtins().dict({"A ": @[nimValueToPy("1"), nimValueToPy("222"), nimValueToPy("333"), nimValueToPy(nil), nimValueToPy("abc")]}.toTable)
let columns = pymodules.builtins().dict({"A ": @[nimValueToPy(1), nimValueToPy(2.0), nimValueToPy("333"), nimValueToPy("abc")]}.toTable)
# let columns = pymodules.builtins().dict({"A": @[nimValueToPy("0"), nimValueToPy(nil), nimValueToPy("2")], "B": @[nimValueToPy("3"), nimValueToPy(nil), nimValueToPy("4")]}.toTable)
# let columns = pymodules.builtins().dict({"str": @["1", "0"]})
# let columns = pymodules.builtins().dict({"float": @[1.0, 0.0]})
# let columns = pymodules.builtins().dict({"date": @[
# datetime().date(2000, 1, 1),
# datetime().date(2000, 1, 2),
# ]})
# let columns = pymodules.builtins().dict({"str": @[nimValueToPy("abc"), nimValueToPy("efg"), nimValueToPy(nil)]}.toTable)
# Build the input table from the active fixture and print it with dtypes.
let table = pymodules.tablite().Table(columns = columns)

discard table.show(dtype = true)

# Selection spec passed to columnSelect: only the "A " -> "float" entry is active,
# the commented entries are alternative scenarios for the fixtures above.
let select_cols = builtins().list(@[
# newColumnSelectorInfo("A ", "int", true, opt.none[string]()),
newColumnSelectorInfo("A ", "float", true, opt.none[string]()),
# newColumnSelectorInfo("A ", "float", false, opt.none[string]()),
# newColumnSelectorInfo("A ", "bool", false, opt.none[string]()),
# newColumnSelectorInfo("A ", "str", false, opt.none[string]()),
# newColumnSelectorInfo("A ", "date", false, opt.none[string]()),
# newColumnSelectorInfo("A ", "datetime", false, opt.none[string]()),
# newColumnSelectorInfo("A ", "time", false, opt.none[string]()),
# newColumnSelectorInfo("A", "int", true, opt.none[string]()),
# newColumnSelectorInfo("B", "str", true, opt.none[string]()),

# newColumnSelectorInfo("str", "bool", false, opt.some("bool")),
# newColumnSelectorInfo("str", "int", false, opt.some("int")),
# newColumnSelectorInfo("str", "float", false, opt.some("float")),
# newColumnSelectorInfo("str", "str", false, opt.some("str")),

# newColumnSelectorInfo("float", "bool", false, opt.some("bool")),
# newColumnSelectorInfo("float", "int", false, opt.some("int")),
# newColumnSelectorInfo("float", "float", false, opt.some("float")),
# newColumnSelectorInfo("float", "str", false, opt.some("str")),
# newColumnSelectorInfo("float", "date", false, opt.some("date")),
# newColumnSelectorInfo("float", "time", false, opt.some("time")),
# newColumnSelectorInfo("float", "datetime", false, opt.some("datetime")),

# newColumnSelectorInfo("date", "bool", false, opt.some("bool")),
# newColumnSelectorInfo("date", "int", false, opt.some("int")),
# newColumnSelectorInfo("date", "float", false, opt.some("float")),
# newColumnSelectorInfo("date", "str", false, opt.some("str")),
# newColumnSelectorInfo("date", "date", false, opt.some("date")),
# newColumnSelectorInfo("date", "time", false, opt.some("time")),
# newColumnSelectorInfo("date", "datetime", false, opt.some("datetime")),

# newColumnSelectorInfo("str", "str", true, opt.some("str")),
])

# Run the selection; dir_pid is <workdir>/nim, the parent of the pages directory created earlier.
# NOTE(review): `Taskmanager` binds to the `TaskManager` parameter only through Nim's
# style-insensitive identifier matching - consider spelling it identically.
let (select_pass, select_fail) = table.columnSelect(
select_cols,
nimpy.pyImport("tqdm").tqdm,
dir_pid = workdir / Path(pid),
Taskmanager = mplite().TaskManager
)

# Print both resulting tables with their dtypes.
discard select_pass.show(dtype = true)
discard select_fail.show(dtype = true)
Loading

0 comments on commit d8e4662

Please sign in to comment.