Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 34 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ A very simple Python database with time as the primary method of querying.
- [**log**](#log)
- [**get\_data\_at\_time**](#get_data_at_time)
- [**get\_data\_at\_range**](#get_data_at_range)
- [**get\_data\_at\_field\_threshold**](#get_data_at_field_threshold)

## How it works.

RexDB works in a very straightforward manner: it uses the operating system's file structure. The database is stored in a directory called db\_\<number\> so that multiple databases can be stored in the same directory. Inside the database folder is another set of folders, and within those folders are the files that contain your entries. However, these files are not human-readable, as they are just structs packed into bytes.
Expand Down Expand Up @@ -87,7 +89,7 @@ Will log your data in the database and mark it with an automatically generated t
<u>arguments</u>

- time
- time.struct_time
- `time.struct_time`
- the time of the entry which you want to retrieve.

<u>functionality</u>
Expand All @@ -103,12 +105,40 @@ Given a time, this function will return the data entry logged at that time. If t
<u>arguments</u>

- start_time
- time.struct_time
- `time.struct_time`
- the start of your specified range
- end_time
- time.struct_time
- `time.struct_time`
- the end of your specified range

<u>functionality</u>

Will return all entries within a specified time range, if there are no entries within the specified range, will return an empty list.
Will return all entries within a specified time range, if there are no entries within the specified range, will return an empty list.

### **get_data_at_field_threshold**

<u>type</u>

- `str * 'a * int set -> ('a * 'a -> {-1, 0, 1}) -> time.struct_time -> time.struct_time`

<u>arguments</u>

- field
- `str`
- the field name you are querying on
- threshold
- `'a`
- the cutoff you are using
- goal
- The integers used in this set should be -1, 0, 1.
- -1 corresponds to less than, 0 corresponds to equal to, and 1 corresponds to greater than. Put each operation you would like to include in your set.
- cmp_fn
- `'a * 'a -> {-1, 0, 1}`
- the comparison function you will use
- must return some integer x in the set {-1, 0, 1}
- start_time
- `time.struct_time`
- when you want to start looking
- end_time
- `time.struct_time`
- when you want to end looking
62 changes: 62 additions & 0 deletions src/rexdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,3 +146,65 @@ def get_data_at_range(self, start_time: time.struct_time, end_time: time.struct_
print(f"could not search file: {e}")

return entries

def get_field_filtered(self,
                       field: str,
                       filter_fn,
                       start_time=None,
                       end_time=None,
                       limit=None):
    """
    Return the stored entries whose `field` value satisfies `filter_fn`.

    field
        - str
        - the name of the field you want to query on. It is matched
          case-insensitively against the database's field names.

    filter_fn
        - 'a -> bool
        - called with the value of `field` for every entry read. The entry
          is included in the result when the return value is truthy.

    start_time, end_time
        - optional bounds that limit which files are searched. Each bound is
          applied independently: a missing start defaults to the database's
          start time and a missing end defaults to the current time.

    limit
        - optional int
        - maximum number of entries to return. The search stops as soon as
          this many entries have matched, which bounds the memory used by
          the result. None (the default) means no cap.

    Returns a list of unpacked entries in the order they were read. If
    `field` is not one of the database's field names, a message is printed
    and an empty list is returned. A file that cannot be searched is reported
    with print and skipped.

    The complexity of this function is O(n). This function does not benefit
    from the speed increase that the map files provide.
    """
    if limit is not None and limit <= 0:
        return []

    # get the correct field index for comparison
    field_index = None
    wanted = field.lower()
    for i, name in enumerate(self._field_names):
        if name.lower() == wanted:
            field_index = i
    if field_index is None:
        # Fail once, up front, instead of hitting an unbound index inside the
        # per-file try block and reporting it as a file error for every file.
        print(f"could not search: unknown field {field}")
        return []

    # Resolve each bound on its own so that an end_time given without a
    # start_time is still honoured. `is None` (not truthiness) keeps a valid
    # but falsy bound such as 0 from being treated as missing.
    # NOTE(review): the defaults are float seconds (time.mktime) while caller
    # supplied bounds are passed through unchanged -- confirm which type
    # locations_from_range expects.
    start = self._init_time if start_time is None else start_time
    end = time.mktime(self._timer_function()) if end_time is None else end_time
    filepaths = self._file_manager.locations_from_range(start, end)

    entries = []
    line_size = self._packer.line_size
    # access every file
    for filepath in filepaths:
        try:
            with open(filepath, "rb") as file:
                for _ in range(self._file_manager.lines_per_file):
                    raw_data = file.read(line_size)
                    if len(raw_data) != line_size:
                        # short read: no further complete entries in this file
                        break
                    data = self._packer.unpack(raw_data)
                    # use filter function
                    if filter_fn(data[field_index]):
                        entries.append(data)
                        if limit is not None and len(entries) >= limit:
                            return entries
        except Exception as e:
            # best effort: report the problem and carry on with the next file
            print(f"could not search file: {e}")

    return entries
82 changes: 82 additions & 0 deletions tests/test_threshold_query.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
from pyfakefs import fake_filesystem_unittest
from src.rexdb import RexDB
import random
from tests.faketime import FakeTime


class QueryTests(fake_filesystem_unittest.TestCase):
    """Tests for RexDB.get_field_filtered, run against a fake filesystem."""

    def setUp(self) -> None:
        self.setUpPyfakefs()
        self.time = FakeTime()
        # Fixed seed so that a failing run can be reproduced exactly.
        random.seed(0)

    def less_than(self, x):
        """Return a predicate that is true for values strictly below x."""
        return lambda y: y < x

    def greater_than(self, x):
        """Return a predicate that is true for values strictly above x."""
        return lambda y: y > x

    def test_query_basic(self):
        """Filter a small database on an integer field and on a float field."""
        db = RexDB("ifc", ("index", "super precise number", "initial"), 26, 2)
        rand_test_answer_key = []
        f = self.greater_than(0.75)
        g = self.greater_than(40)
        for i in range(50):
            num = random.random()
            db.log((i, num, bytes(chr(i % 26 + 65), "utf-8")))
            if f(num):
                rand_test_answer_key.append(i)
            self.time.sleep(0.5)

        # entry[1] is compared against the logged "index" value throughout
        # (presumably entry[0] is the auto-generated timestamp -- confirm).
        data = db.get_field_filtered("index", g)

        self.assertEqual(len(data), 9)
        for offset, entry in enumerate(data):
            self.assertEqual(entry[1], 41 + offset)

        data = db.get_field_filtered("super precise number", f)

        # Check the count first so missing or extra entries fail clearly
        # instead of raising IndexError or passing unnoticed.
        self.assertEqual(len(data), len(rand_test_answer_key))
        for entry, expected_index in zip(data, rand_test_answer_key):
            self.assertEqual(entry[1], expected_index)

    def test_verbose(self):
        """Filter a 5000-entry database from both ends and on random values."""
        db = RexDB("if", ("index", "rand_num"))
        random_low_answer_key = []
        random_high_answer_key = []
        l_10p = self.less_than(0.1)
        g_90p = self.greater_than(0.9)
        l_100 = self.less_than(100)
        g_4899 = self.greater_than(4899)

        for i in range(5000):
            num = random.random()
            db.log((i, num))
            if num < 0.1:
                random_low_answer_key.append((i, num))
            elif num > 0.9:
                random_high_answer_key.append((i, num))

            self.time.sleep(1)

        # test that the first entries can be retrieved successfully and less than operation
        data = db.get_field_filtered("index", l_100)
        self.assertEqual(len(data), 100)
        for offset, entry in enumerate(data):
            self.assertEqual(entry[1], offset)

        # test that the last entries can be retrieved successfully and greater than operation
        data = db.get_field_filtered("index", g_4899)
        self.assertEqual(len(data), 100)
        for offset, entry in enumerate(data):
            self.assertEqual(entry[1], offset + 4900)

        # test against the random_low_answer_key
        data = db.get_field_filtered("rand_num", l_10p)
        self.assertEqual(len(data), len(random_low_answer_key))
        for entry, expected in zip(data, random_low_answer_key):
            self.assertEqual(entry[1], expected[0])

        # test against the random_high_answer_key
        data = db.get_field_filtered("rand_num", g_90p)
        self.assertEqual(len(data), len(random_high_answer_key))
        for entry, expected in zip(data, random_high_answer_key):
            self.assertEqual(entry[1], expected[0])