diff --git a/README.md b/README.md index 364a7f1..f318bbd 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,8 @@ A very simple Python database with time as the primary method of querying. - [**log**](#log) - [**get\_data\_at\_time**](#get_data_at_time) - [**get\_data\_at\_range**](#get_data_at_range) + - [**get\_data\_at\_field\_threshold**](#get_data_at_field_threshold) + ## How it works. RexDB works in a very straightforward manner. It works through the operating system file structure. The database is stored in a directory called db\_\, this is so that multiple databases could be stored in the same directory. inside the database folder is another set of folders and within those folders are the files that contain your entries. However, these files are unreadable as they are just structs packed into bytes. @@ -87,7 +89,7 @@ Will log your data in the database and mark it with an automatically generated t arguments - time - - time.struct_time + - `time.struct_time` - the time of the entry which you want to retrieve. functionality @@ -103,12 +105,40 @@ Given a time, this function will return the data entry logged at that time. If t arguments - start_time - - time.struct_time + - `time.struct_time` - the start of your specified range - end_time - - time.struct_time + - `time.struct_time` - the end of your specified range functionality -Will return all entries within a specified time range, if there are no entries within the specified range, will return an empty list. \ No newline at end of file +Will return all entries within a specified time range, if there are no entries within the specified range, will return an empty list. + +### **get_data_at_field_threshold** + +type + +- `str * 'a * int set -> ('a * 'a -> {-1, 0, 1}) -> time.struct_time -> time.struct_time` + +arguments + +- field + - `str` + - the field name you are querying on +- threshold + - `'a` + - the cutoff you are using +- goal + - The integers used in this set should be -1, 0, 1. 
def get_field_filtered(self,
                       field: str,
                       filter_fn,
                       start_time=None,
                       end_time=None):
    """
    str * ('a -> bool) * struct_time * struct_time -> list

    Return every stored entry whose `field` value satisfies `filter_fn`.

    field
    - str
    - name of the field you want to query on. Matched case-insensitively
      against the database's field names.

    filter_fn
    - 'a -> bool
    - called with the value of `field` from each entry; the entry is part of
      the returned list when the result is truthy.

    start_time / end_time
    - time.struct_time (plain numbers are passed through unchanged)
    - optional bounds that limit the search. A missing start falls back to
      the database's init time, a missing end falls back to the current
      time. The bounds only select which files are scanned; entries inside
      those files are not individually checked against the range.

    Raises ValueError when `field` is not one of the database's field names.

    Any exception raised while reading a file (including one raised by
    `filter_fn`) is printed and the rest of that file is skipped.

    Every entry of every selected file is read and tested, so the cost grows
    linearly with the number of stored entries. This function does not
    benefit from the speed increase that the map files provide.
    """

    def as_epoch(moment):
        # Numbers are taken to be epoch seconds already; anything else is
        # treated as a struct_time and converted.
        if isinstance(moment, (int, float)):
            return moment
        return time.mktime(moment)

    # Resolve the field first so a misspelled name fails loudly instead of
    # surfacing as one "could not search file" message per file.
    wanted = field.lower()
    matches = [i for i, name in enumerate(self._field_names)
               if name.lower() == wanted]
    if not matches:
        raise ValueError(
            f"unknown field {field!r}; expected one of {list(self._field_names)}")
    # With names that differ only by case, the last match wins.
    field_index = matches[-1]

    # Each bound is resolved independently so that giving only end_time is
    # honoured rather than ignored.
    # NOTE(review): locations_from_range is assumed to expect epoch seconds
    # for both bounds, since the default end is a time.mktime() value.
    # Confirm against the file manager.
    if start_time is not None:
        start = as_epoch(start_time)
    else:
        start = self._init_time
    if end_time is not None:
        end = as_epoch(end_time)
    else:
        end = time.mktime(self._timer_function())

    filepaths = self._file_manager.locations_from_range(start, end)

    entries = []
    for filepath in filepaths:
        try:
            with open(filepath, "rb") as file:
                for _ in range(self._file_manager.lines_per_file):
                    raw_data = file.read(self._packer.line_size)
                    if len(raw_data) != self._packer.line_size:
                        # A short read means the file has no more full entries.
                        break
                    data = self._packer.unpack(raw_data)
                    if filter_fn(data[field_index]):
                        entries.append(data)
        except Exception as e:
            # Best effort: report the problem and move on to the next file.
            print(f"could not search file: {e}")

    return entries
def test_verbose(self):
    """
    Log 5000 (index, random number) entries and check field filtering.

    Covers less-than and greater-than filters on the integer index (first
    and last hundred entries) and on the random float (values below 0.1
    and above 0.9). Position 1 of each returned entry is compared against
    the logged index.
    """
    db = RexDB("if", ("index", "rand_num"))

    below_tenth = self.less_than(0.1)
    above_nine_tenths = self.greater_than(0.9)
    below_100 = self.less_than(100)
    above_4899 = self.greater_than(4899)

    expected_low = []
    expected_high = []
    for idx in range(5000):
        sample = random.random()
        db.log((idx, sample))
        if sample < 0.1:
            expected_low.append((idx, sample))
        elif sample > 0.9:
            expected_high.append((idx, sample))

        self.time.sleep(1)

    # the first hundred entries come back, in order, for a less-than filter
    first_hundred = db.get_field_filtered("index", below_100)
    self.assertEqual(len(first_hundred), 100)
    for expected_index, entry in enumerate(first_hundred):
        self.assertEqual(entry[1], expected_index)

    # the last hundred entries come back, in order, for a greater-than filter
    last_hundred = db.get_field_filtered("index", above_4899)
    self.assertEqual(len(last_hundred), 100)
    for expected_index, entry in enumerate(last_hundred, start=4900):
        self.assertEqual(entry[1], expected_index)

    # low random values match the answer key collected while logging
    low_entries = db.get_field_filtered("rand_num", below_tenth)
    self.assertEqual(len(low_entries), len(expected_low))
    for entry, (expected_index, _) in zip(low_entries, expected_low):
        self.assertEqual(entry[1], expected_index)

    # high random values match the answer key collected while logging
    high_entries = db.get_field_filtered("rand_num", above_nine_tenths)
    self.assertEqual(len(high_entries), len(expected_high))
    for entry, (expected_index, _) in zip(high_entries, expected_high):
        self.assertEqual(entry[1], expected_index)