-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #126 from realratchet/master
Added column selector
- Loading branch information
Showing
55 changed files
with
4,965 additions
and
538 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
import std/times | ||
from std/math import splitDecimal | ||
from utils import divmod, extractUnit | ||
|
||
# Month lengths in days. The first index selects the row (0 = not a leap year,
# 1 = leap year); the second index is the zero-based month (0 = January).
const DAYS_PER_MONTH_TABLE* = [
  [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31], # not leap
  [31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31] # leap
]
|
||
type
  YearRange* = range[1..9999]
    ## Years accepted by the helpers in this module.
  MicrosecondRange* = range[0..999_999]
    ## Sub-second part expressed in whole microseconds.
|
||
proc isLeapYear*(year: int): bool {.inline.} =
  ## Returns true when `year` is divisible by 400, or divisible by 4 but not
  ## by 100.
  if year mod 400 == 0:
    true
  elif year mod 100 == 0:
    false
  else:
    year mod 4 == 0
proc getDaysInMonth*(year, month: int): int {.inline.} =
  ## Looks up the length of `month` (1 = January) for `year` in
  ## `DAYS_PER_MONTH_TABLE`, picking the leap-year row when `isLeapYear(year)`
  ## holds.
  let tableRow = ord(isLeapYear(year))
  let monthIndex = month - 1
  DAYS_PER_MONTH_TABLE[tableRow][monthIndex]
|
||
proc toTimedelta*(
  weeks = 0, days = 0, hours = 0, minutes = 0, seconds = 0, milliseconds = 0, microseconds: int = 0
): (int, int, int) {.inline.} =
  ## Folds the given time components into a `(days, seconds, microseconds)`
  ## triple. Weeks are folded into days, hours and minutes into seconds, and
  ## milliseconds into microseconds; the overflow of each unit is then carried
  ## into the next larger one with `divmod`.
  const secondsPerDay = 24*3600
  const microsPerSecond = 1000000

  # Normalize everything to days, seconds, microseconds.
  let totalDays = days + weeks*7
  let totalSeconds = seconds + (minutes*60 + hours*3600)
  let totalMicros = microseconds + milliseconds*1000

  # Whole days hidden in the seconds component.
  let (daysFromSeconds, secondsLeft) = divmod(totalSeconds, secondsPerDay)

  # Whole seconds (and possibly days) hidden in the microseconds component.
  let (secondsFromMicros, microsLeft) = divmod(totalMicros, microsPerSecond)
  let (daysFromMicros, microSecondsLeft) = divmod(secondsFromMicros, secondsPerDay)

  # Just a little bit of carrying possible for microseconds and seconds.
  let (carriedSeconds, us) = divmod(microsLeft, microsPerSecond)
  let (carriedDays, s) = divmod(
    int(secondsLeft) + microSecondsLeft + carriedSeconds, secondsPerDay)

  return (totalDays + daysFromSeconds + daysFromMicros + carriedDays, s, us)
|
||
proc days2YearsDays(days: int): (int, int) =
  ## Splits a day count into `(year, dayWithinYear)`.
  ##
  ## `days` is first shifted by `365*30 + 7` so it becomes relative to the
  ## year 2000 (presumably the input counts days from 1970-01-01 -- confirm
  ## against callers). The returned day is used as a zero-based offset into
  ## the returned year by `days2Components`.
  # Number of days in a full 400-year cycle.
  let days_per_400years = (400*365 + 100 - 4 + 1)
  # Adjust so it's relative to the year 2000 (divisible by 400)
  var tdays = days - (365*30 + 7)

  # Break down the 400 year cycle to get the year and day within the year
  # NOTE(review): extractUnit comes from utils; it is assumed to return the
  # number of whole cycles plus a non-negative remainder -- confirm.
  var year: int
  (year, tdays) = extractUnit(tdays, days_per_400years)
  year = 400 * year

  # Work out the year/day within the 400 year cycle
  # The three checks are nested: each refinement (100-year block, 4-year
  # block, single year) only runs inside the previous one.
  if (tdays >= 366):
    year = year + (100 * int ((tdays-1) / (100*365 + 25 - 1)))
    tdays = (tdays-1) mod (100*365 + 25 - 1)
    if (tdays >= 365):
      year += 4 * int ((tdays+1) / (4*365 + 1))
      tdays = (tdays+1) mod (4*365 + 1)
      if (tdays >= 366):
        year = year + int ((tdays-1) / 365)
        tdays = (tdays-1) mod 365

  # Shift the year back from "relative to 2000" to an absolute year.
  return (year + 2000, tdays)
|
||
proc days2Components(days: int): (int, Month, MonthdayRange) =
  ## Converts a day count into `(year, month, day-of-month)`.
  ##
  ## The year and the zero-based day within it come from `days2YearsDays`;
  ## the month is found by subtracting month lengths until the remainder fits.
  ## Raises `IndexDefect` if the remainder fits in no month.
  let (year, dayOfYear) = days2YearsDays(days)
  var remaining = dayOfYear

  for monthIndex, monthLength in DAYS_PER_MONTH_TABLE[int isLeapYear(year)]:
    if remaining >= monthLength:
      # Not in this month: consume it and look at the next one.
      remaining -= monthLength
      continue

    return (year, Month(monthIndex + 1), MonthdayRange(remaining + 1))

  raise newException(IndexDefect, "failed")
|
||
proc days2Date*(days: int): DateTime =
  ## Builds a UTC `DateTime` for the calendar date that `days2Components`
  ## derives from `days`.
  let parts = days2Components(days)
  result = dateTime(parts[0], parts[1], parts[2], zone = utc())
|
||
proc delta2Date*(
  weeks = 0, days = 0, hours = 0, minutes = 0, seconds = 0, milliseconds = 0, microseconds: int = 0
): DateTime =
  ## Normalizes the components with `toTimedelta`, converts the day part to a
  ## date with `days2Date`, then adds the remaining seconds and microseconds.
  let (wholeDays, secs, micros) = toTimedelta(
    weeks, days, hours, minutes, seconds, milliseconds, microseconds)
  let midnight = days2Date(wholeDays)
  result = midnight + initDuration(seconds = secs, microseconds = micros)
|
||
proc date2NimDateTime*(year: int, month: int, day: int): DateTime {.inline.} =
  ## Builds a UTC `DateTime` at midnight from numeric year, month (1..12) and
  ## day-of-month.
  let monthEnum = Month(month)
  let dayOfMonth = MonthdayRange(day)
  result = dateTime(year, monthEnum, dayOfMonth, zone = utc())
|
||
proc datetime2NimDatetime*(year: int, month: int, day: int, hour: int, minute: int, second: int, microsecond: int): DateTime {.inline.} =
  ## Builds a UTC `DateTime` from numeric components.
  ##
  ## `month` is 1..12 and `day` is the day of the month. `microsecond` is
  ## converted to the nanoseconds that `dateTime` expects.
  # Named arguments keep every component in its own slot; passing them
  # positionally without `minute` shifted `second` into the minute field.
  return dateTime(
    year, Month(month), MonthdayRange(day),
    hour = hour,
    minute = minute,
    second = second,
    nanosecond = microsecond * 1000,
    zone = utc()
  )
|
||
proc time2NimDuration*(hour: int, minute: int, second: int, microsecond: int): Duration {.inline.} =
  ## Expresses a time of day as the `Duration` elapsed since midnight.
  result = initDuration(
    microseconds = microsecond,
    seconds = second,
    minutes = minute,
    hours = hour
  )
|
||
proc duration2Time*(self: Duration): Time {.inline.} =
  ## Returns the `Time` that lies `self` after the zero `Time` value.
  let zeroTime = fromUnix(0)
  result = zeroTime + self
proc time2Duration*(self: Time): Duration {.inline.} =
  ## Returns the `Duration` between the zero `Time` value and `self`.
  let zeroTime = fromUnix(0)
  result = self - zeroTime
proc seconds2Duration*(seconds: float): Duration {.inline.} =
  ## Converts fractional seconds into a `Duration` with microsecond
  ## resolution.
  ##
  ## The fractional part is rounded to the nearest microsecond. Truncating it
  ## instead loses a microsecond whenever the float sits just below the
  ## intended value (4.35 s would become 4_349_999 us).
  let (wholePart, fracPart) = seconds.splitDecimal()
  # `math` is reachable by qualified name through the module's existing
  # `from std/math import splitDecimal`.
  let micros = math.round(fracPart * 1_000_000)
  return initDuration(seconds = int wholePart, microseconds = int micros)
|
||
proc duration2Seconds*(dur: Duration): float {.inline.} =
  ## Returns `dur` as fractional seconds, computed from its whole
  ## microseconds.
  let micros = dur.inMicroseconds
  result = float(micros) / 1_000_000.0
|
||
proc duration2Date*(dur: Duration): DateTime {.inline.} =
  ## Returns the UTC `DateTime` that lies `dur` after 1970-01-01T00:00:00.
  let epoch = dateTime(1970, mJan, 1, zone = utc())
  result = epoch + dur
proc seconds2Date*(seconds: float): DateTime {.inline.} =
  ## Converts fractional seconds into a `Duration` with `seconds2Duration`
  ## and then into a UTC `DateTime` with `duration2Date`.
  let elapsed = seconds2Duration(seconds)
  result = duration2Date(elapsed)
proc datetime2Date*(self: DateTime): DateTime {.inline.} =
  ## Drops the time of day: returns a UTC `DateTime` at midnight carrying the
  ## year, month and day-of-month fields of `self`.
  let (y, m, d) = (self.year, self.month, self.monthday)
  result = dateTime(y, m, d, zone = utc())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,163 @@ | ||
import column_selector/sliceconv | ||
import column_selector/infos | ||
import column_selector/collectinfo | ||
|
||
export ColInfo | ||
export toPyObj | ||
export collectColumnSelectInfo | ||
export doSliceConvert | ||
export fromPyObjToDesiredInfos | ||
|
||
when isMainModule and appType != "lib":
  # Stand-alone driver: only compiled when this module is the main module and
  # the build is not a library. It builds a small tablite table through nimpy
  # and runs the column selector on it.

  import std/[os, tables, sugar, sets, sequtils, paths, macros]
  import nimpy as nimpy
  from ../nimpyext import `!`
  import std/options as opt
  import ../pymodules as pymodules
  import ../numpy
  import typetraits

  proc columnSelect(table: nimpy.PyObject, cols: nimpy.PyObject, tqdm: nimpy.PyObject, dir_pid: Path, TaskManager: nimpy.PyObject): (nimpy.PyObject, nimpy.PyObject) =
    ## Runs the column selection for `table` and returns the
    ## `(passed, failed)` tables.
    ##
    ## `cols` is forwarded to `collectColumnSelectInfo`, `tqdm` is used to
    ## create the progress bar and `dir_pid` is passed on to the collection
    ## and slice conversion steps. `TaskManager` is not referenced in this
    ## body: every page is converted sequentially in this process.
    # this is nim-only implementation, the library build doesn't need it because we need TaskManager to be used for slices
    var (columns, page_count, is_correct_type, desired_column_map, passed_column_data, failed_column_data, res_cols_pass, res_cols_fail, column_names, reject_reason_name) = collectColumnSelectInfo(table, cols, string dir_pid)

    # Fast path: when every entry of is_correct_type is true, nothing is
    # converted. The original columns are re-used under their desired names
    # and the "fail" table only gets empty columns.
    if toSeq(is_correct_type.values).all(proc (x: bool): bool = x):
      let tbl_pass_columns = collect(initTable()):
        for (desired_name, desired_info) in desired_column_map.pairs():
          {desired_name: table[desired_info.original_name]}

      let tbl_fail_columns = collect(initTable()):
        for desired_name in failed_column_data:
          {desired_name: newSeq[nimpy.PyObject]()}

      let tbl_pass = tablite().Table(columns = tbl_pass_columns)
      let tbl_fail = tablite().Table(columns = tbl_fail_columns)

      return (tbl_pass, tbl_fail)

    # Builds a Python dict with one empty list per key, inserting the keys in
    # the order of `keys`.
    template ordered2PyDict(keys: seq[string]): nimpy.PyObject =
      let dict = pymodules.builtins().dict()

      for k in keys:
        dict[k] = newSeq[nimpy.PyObject]()

      dict

    # Result tables start with empty columns; converted pages are appended
    # further down.
    var tbl_pass = tablite().Table(columns = passed_column_data.ordered2PyDict())
    var tbl_fail = tablite().Table(columns = failed_column_data.ordered2PyDict())

    # One task per page index: the i-th entry of every input column plus the
    # i-th entries of res_cols_pass and res_cols_fail.
    var task_list_inp = collect:
      for i in 0..<page_count:
        let el = collect(initTable()):
          for (name, column) in columns.pairs:
            {name: column[i]}
        (el, res_cols_pass[i], res_cols_fail[i])

    var page_size = tabliteConfig().Config.PAGE_SIZE.to(int)
    var pbar = tqdm!(total: task_list_inp.len, desc: "column select")
    var converted = newSeqOfCap[(seq[(string, nimpy.PyObject)], seq[(string, nimpy.PyObject)])](task_list_inp.len)

    # Convert the tasks one after another, advancing the progress bar once
    # per task.
    for (columns, res_pass, res_fail) in task_list_inp:
      converted.add(doSliceConvert(dir_pid, page_size, columns, reject_reason_name, res_pass, res_fail, desired_column_map, column_names, is_correct_type))

      discard pbar.update(1)

    # Appends each `(column name, page)` pair to the `pages` of the matching
    # column of `table`.
    proc extendTable(table: var nimpy.PyObject, columns: seq[(string, nimpy.PyObject)]): void {.inline.} =
      for (col_name, pg) in columns:
        let col = table[col_name]

        discard col.pages.append(pg) # can't col.extend because nim is dumb :)

    for (pg_pass, pg_fail) in converted:
      tbl_pass.extendTable(pg_pass)
      tbl_fail.extendTable(pg_fail)

    return (tbl_pass, tbl_fail)

  proc newColumnSelectorInfo(column: string, `type`: string, allow_empty: bool, rename: opt.Option[string]): nimpy.PyObject =
    ## Builds the Python dict describing one column selection, with the keys
    ## `column`, `type`, `allow_empty` and `rename`. `rename` is stored as
    ## nil when the option is empty.
    let pyDict = builtins().dict(
      column = column,
      type = `type`,
      allow_empty = allow_empty
    )

    if rename.isNone():
      pyDict["rename"] = nil
    else:
      pyDict["rename"] = rename.get()

    return pyDict

  # Page files go to <tablite workdir>/<pid>/pages; the directory is created
  # up front.
  let workdir = Path(pymodules.builtins().str(pymodules.tabliteConfig().Config.workdir).to(string))
  let pid = "nim"
  let pagedir = workdir / Path(pid) / Path("pages")

  createDir(string pagedir)

  pymodules.tabliteConfig().Config.pid = pid
  # pymodules.tabliteConfig().Config.PAGE_SIZE = 2
  # pymodules.tabliteConfig().Config.MULTIPROCESSING_MODE = pymodules.tabliteConfig().Config.FALSE

  # Sample input. The commented-out lines are alternative data sets kept for
  # manual experiments.
  # let columns = pymodules.builtins().dict({"A ": @[nimValueToPy(0), nimValueToPy(nil), nimValueToPy(10), nimValueToPy(200)]}.toTable)
  # let columns = pymodules.builtins().dict({"A ": @[1, 22, 333]}.toTable)
  # let columns = pymodules.builtins().dict({"A ": @["1", "22", "333", "", "abc"]}.toTable)
  # let columns = pymodules.builtins().dict({"A ": @[nimValueToPy("1"), nimValueToPy("222"), nimValueToPy("333"), nimValueToPy(nil), nimValueToPy("abc")]}.toTable)
  let columns = pymodules.builtins().dict({"A ": @[nimValueToPy(1), nimValueToPy(2.0), nimValueToPy("333"), nimValueToPy("abc")]}.toTable)
  # let columns = pymodules.builtins().dict({"A": @[nimValueToPy("0"), nimValueToPy(nil), nimValueToPy("2")], "B": @[nimValueToPy("3"), nimValueToPy(nil), nimValueToPy("4")]}.toTable)
  # let columns = pymodules.builtins().dict({"str": @["1", "0"]})
  # let columns = pymodules.builtins().dict({"float": @[1.0, 0.0]})
  # let columns = pymodules.builtins().dict({"date": @[
  #     datetime().date(2000, 1, 1),
  #     datetime().date(2000, 1, 2),
  # ]})
  # let columns = pymodules.builtins().dict({"str": @[nimValueToPy("abc"), nimValueToPy("efg"), nimValueToPy(nil)]}.toTable)
  let table = pymodules.tablite().Table(columns = columns)

  discard table.show(dtype = true)

  # Selection under test. The commented-out entries are alternative
  # conversions kept for manual experiments.
  let select_cols = builtins().list(@[
    # newColumnSelectorInfo("A ", "int", true, opt.none[string]()),
    newColumnSelectorInfo("A ", "float", true, opt.none[string]()),
    # newColumnSelectorInfo("A ", "float", false, opt.none[string]()),
    # newColumnSelectorInfo("A ", "bool", false, opt.none[string]()),
    # newColumnSelectorInfo("A ", "str", false, opt.none[string]()),
    # newColumnSelectorInfo("A ", "date", false, opt.none[string]()),
    # newColumnSelectorInfo("A ", "datetime", false, opt.none[string]()),
    # newColumnSelectorInfo("A ", "time", false, opt.none[string]()),
    # newColumnSelectorInfo("A", "int", true, opt.none[string]()),
    # newColumnSelectorInfo("B", "str", true, opt.none[string]()),

    # newColumnSelectorInfo("str", "bool", false, opt.some("bool")),
    # newColumnSelectorInfo("str", "int", false, opt.some("int")),
    # newColumnSelectorInfo("str", "float", false, opt.some("float")),
    # newColumnSelectorInfo("str", "str", false, opt.some("str")),

    # newColumnSelectorInfo("float", "bool", false, opt.some("bool")),
    # newColumnSelectorInfo("float", "int", false, opt.some("int")),
    # newColumnSelectorInfo("float", "float", false, opt.some("float")),
    # newColumnSelectorInfo("float", "str", false, opt.some("str")),
    # newColumnSelectorInfo("float", "date", false, opt.some("date")),
    # newColumnSelectorInfo("float", "time", false, opt.some("time")),
    # newColumnSelectorInfo("float", "datetime", false, opt.some("datetime")),

    # newColumnSelectorInfo("date", "bool", false, opt.some("bool")),
    # newColumnSelectorInfo("date", "int", false, opt.some("int")),
    # newColumnSelectorInfo("date", "float", false, opt.some("float")),
    # newColumnSelectorInfo("date", "str", false, opt.some("str")),
    # newColumnSelectorInfo("date", "date", false, opt.some("date")),
    # newColumnSelectorInfo("date", "time", false, opt.some("time")),
    # newColumnSelectorInfo("date", "datetime", false, opt.some("datetime")),

    # newColumnSelectorInfo("str", "str", true, opt.some("str")),
  ])

  # Run the selection and print both result tables.
  let (select_pass, select_fail) = table.columnSelect(
    select_cols,
    nimpy.pyImport("tqdm").tqdm,
    dir_pid = workdir / Path(pid),
    Taskmanager = mplite().TaskManager
  )

  discard select_pass.show(dtype = true)
  discard select_fail.show(dtype = true)
Oops, something went wrong.