diff --git a/.gitattributes b/.gitattributes index 6c1f9b1f5..69bbb1a72 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,5 +1,7 @@ # Auto detect text files and perform LF normalization * text=auto +*.sqlite filter=lfs diff=lfs merge=lfs -text +*.bson filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.db filter=lfs diff=lfs merge=lfs -text *.sql filter=lfs diff=lfs merge=lfs -text @@ -9,3 +11,4 @@ query_crmarenapro/query_dataset/*.sql filter=lfs diff=lfs merge=lfs -text query_crmarenapro/query_dataset/hidden/*.db filter=lfs diff=lfs merge=lfs -text query_crmarenapro/query_dataset/hidden/*.duckdb filter=lfs diff=lfs merge=lfs -text query_crmarenapro/query_dataset/hidden/*.sql filter=lfs diff=lfs merge=lfs -text +query_krama/query_dataset/misc_files/* filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore index d44030790..44c7fe845 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,10 @@ .DS_Store # general py __pycache__/ +.history/ + +# runtime +claude_projects/ # results results/ @@ -35,6 +39,15 @@ common_scaffold/tools/backup/ common_scaffold/*_backup/ # datasets +query_imdb/query_dataset/imdb* +query_imdb/data_raw/ +query_imdb/job_queries/ +query_imdb/job_results/ +query_imdb/query*/query.sql +query_imdb/scripts/ +query_krama/query_dataset/topic_files/topic_files_db/files.bson.bak +query_krama/query*/ground_truth.py +query_krama/scripts/ query_civic_unstructured/ query_paper_unstructured/ query_notice_unstructured/ diff --git a/query_imdb/db_config.yaml b/query_imdb/db_config.yaml new file mode 100644 index 000000000..23dcb8475 --- /dev/null +++ b/query_imdb/db_config.yaml @@ -0,0 +1,8 @@ +db_clients: + movies_database: + db_type: postgres + db_name: movies_db + sql_file: query_dataset/movies.sql + people_database: + db_type: sqlite + db_path: query_dataset/people.sqlite \ No newline at end of file diff --git a/query_imdb/db_description.txt b/query_imdb/db_description.txt new file mode 100644 index 
000000000..5f6b67a80 --- /dev/null +++ b/query_imdb/db_description.txt @@ -0,0 +1,182 @@ +You are working with two databases to solve this query. + +Here are the descriptions of these two databases: + +1. movies_database + - This database is stored in a PostgreSQL database and contains movie-centric information. It covers titles, production companies, keywords, genres, ratings, and structural relationships between titles. + - This database consists of 15 tables: + - title + - This table is the central table of the movies_database. Each row represents a unique title entry (movie, TV series, episode, video game, etc.). + - Fields: + - id (str): Unique identifier for the title. + - title (str): The title string of the movie or show. + - imdb_index (str): Disambiguation suffix used by IMDB (e.g., Roman numerals). + - kind_id (int): Foreign key referencing kind_type.id; indicates the type of title. + - production_year (int): Year in which the title was produced or released. + - imdb_id (int): Original numeric IMDB identifier. + - phonetic_code (str): Phonetic encoding of the title for search purposes. + - episode_of_id (int): For TV episodes, references the id of the parent series in this table. + - season_nr (int): Season number (for TV episodes). + - episode_nr (int): Episode number within a season. + - series_years (str): Year span for TV series (e.g., "1990-1995"). + - md5sum (str): MD5 checksum of the record. + - aka_title + - This table stores alternate or foreign-language titles for movies and shows. + - Fields: + - id (int): Unique identifier for the alternate title record. + - movie_id (int): Foreign key referencing title.id. + - title (str): The alternate title string. + - imdb_index (str): Disambiguation suffix. + - kind_id (int): Foreign key referencing kind_type.id. + - production_year (int): Production year for this alternate release. + - phonetic_code (str): Phonetic encoding of the alternate title. 
+ - episode_of_id (int): Parent series ID for episode alternate titles. + - season_nr (int): Season number. + - episode_nr (int): Episode number. + - note (str): Notes on context (e.g., country, dubbed version). + - md5sum (str): MD5 checksum. + - kind_type + - This lookup table enumerates the types of titles (e.g., movie, TV series, episode). + - Fields: + - id (int): Unique identifier. + - kind (str): Label for the title type (e.g., "movie", "tv series", "episode"). + - movie_info + - This table stores free-text metadata about movies, such as genres, languages, countries, plot summaries, and technical specifications. + - Fields: + - id (int): Unique identifier for the info record. + - movie_id (int): Foreign key referencing title.id. + - info_type_id (str): Foreign key referencing info_type.id. + - info (str): The metadata value (e.g., "English", "USA", "Drama"). + - note (str): Supplemental notes. + - movie_info_idx + - This table stores indexed numeric metadata about movies, primarily ratings and vote counts. + - Fields: + - id (int): Unique identifier for the index record. + - movie_id (int): Foreign key referencing title.id. + - info_type_id (int): Foreign key referencing info_type.id. + - info (str): The numeric metadata value (e.g., a rating score). + - note (str): Supplemental notes. + - info_type + - This lookup table enumerates the categories of metadata stored in movie_info and movie_info_idx (e.g., "genres", "rating", "votes"). + - Fields: + - id (str): Unique identifier. + - info (str): Description of the info category (e.g., "genres", "rating", "languages"). + - movie_keyword + - This table records the association between movies and descriptive keywords. + - Fields: + - id (int): Unique identifier for the movie-keyword association. + - movie_id (int): Foreign key referencing title.id. + - keyword_id (str): Foreign key referencing keyword.id. + - keyword + - This lookup table contains all descriptive keywords (tags) used to annotate movies. 
+ - Fields: + - id (str): Unique identifier. + - keyword (str): The keyword text (e.g., "murder", "based-on-novel"). + - phonetic_code (str): Phonetic encoding of the keyword. + - movie_companies + - This table links movies to the production or distribution companies involved. + - Fields: + - id (str): Unique identifier. + - movie_id (int): Foreign key referencing title.id. + - company_id (int): Foreign key referencing company_name.id. + - company_type_id (int): Foreign key referencing company_type.id. + - note (str): Additional notes on the company's role. + - company_name + - This lookup table contains names and metadata for production and distribution companies. + - Fields: + - id (str): Unique identifier. + - name (str): Full name of the company. + - country_code (str): Country code of the company (e.g., "[us]", "[gb]"). + - imdb_id (int): Original IMDB numeric identifier. + - name_pcode_nf (str): Phonetic code for the company name. + - name_pcode_sf (str): Alternate phonetic code. + - md5sum (str): MD5 checksum. + - company_type + - This lookup table enumerates the roles a company can have (e.g., production, distribution). + - Fields: + - id (int): Unique identifier. + - kind (str): Company role label (e.g., "production companies", "distributors"). + - movie_link + - This table records directional relationships between titles (e.g., sequel, remake, spin-off). + - Fields: + - id (int): Unique identifier. + - movie_id (int): Foreign key referencing title.id (the source title). + - linked_movie_id (int): Foreign key referencing title.id (the related title). + - link_type_id (int): Foreign key referencing link_type.id. + - link_type + - This lookup table enumerates the types of relationships between titles. + - Fields: + - id (int): Unique identifier. + - link (str): Relationship description (e.g., "follows", "sequel", "remake of"). + - complete_cast + - This table records the completeness status of cast and crew information for a movie. 
+ - Fields: + - id (int): Unique identifier. + - movie_id (int): Foreign key referencing title.id. + - subject_id (int): Foreign key referencing comp_cast_type.id; indicates whether cast or crew is being described. + - status_id (int): Foreign key referencing comp_cast_type.id; indicates the completeness status. + - comp_cast_type + - This lookup table enumerates the subjects and statuses used in complete_cast. + - Fields: + - id (int): Unique identifier. + - kind (str): Label for the subject or status (e.g., "cast", "crew", "complete", "complete+verified"). + +2. people_database + - This database is stored in a SQLite database and contains people-centric information from IMDB. It covers individuals who worked on movies (actors, directors, writers, etc.), their alternate names, roles, biographical information, and their casting associations with titles. + - This database consists of 6 tables: + - name + - This table is the central table of the people_database. Each row represents a unique person in the IMDB database. + - Fields: + - id (str): Unique identifier. + - name (str): Full name of the person (last name, first name format). + - imdb_index (str): Disambiguation suffix for persons sharing the same name. + - imdb_id (int): Original IMDB numeric identifier. + - gender (str): Gender of the person ("m" or "f"). + - name_pcode_cf (str): Phonetic code for the full name. + - name_pcode_nf (str): Alternate phonetic code. + - surname_pcode (str): Phonetic code for the surname. + - md5sum (str): MD5 checksum. + - aka_name + - This table stores alternate names or pseudonyms for people. + - Fields: + - id (int): Unique identifier. + - person_id (int): Foreign key referencing name.id. + - name (str): The alternate name or pseudonym. + - imdb_index (str): Disambiguation suffix. + - name_pcode_cf (str): Phonetic code. + - name_pcode_nf (str): Alternate phonetic code. + - surname_pcode (str): Phonetic code for the surname. + - md5sum (str): MD5 checksum. 
+ - cast_info + - This table records the association between a person and a movie in a specific role. It is the central join table linking people_database to movies_database. + - Fields: + - id (int): Unique identifier. + - person_id (str): Foreign key referencing name.id. + - movie_id (int): Foreign key referencing title.id in movies_database. + - person_role_id (int): Foreign key referencing char_name.id; the character played. + - note (str): Additional credit notes (e.g., "uncredited"). + - nr_order (int): Billing order of the credit. + - role_id (int): Foreign key referencing role_type.id. + - char_name + - This table contains character names that appear in movie credits. + - Fields: + - id (int): Unique identifier. + - name (str): The character name. + - imdb_index (str): Disambiguation suffix. + - imdb_id (int): Original IMDB numeric identifier. + - name_pcode_nf (str): Phonetic code. + - surname_pcode (str): Phonetic code for the surname portion. + - md5sum (str): MD5 checksum. + - person_info + - This table stores biographical and career metadata about people (e.g., birth date, birthplace, trivia). + - Fields: + - id (int): Unique identifier. + - person_id (int): Foreign key referencing name.id. + - info_type_id (int): Foreign key referencing info_type.id in movies_database. + - info (str): The metadata value. + - note (str): Supplemental notes. + - role_type + - This lookup table enumerates the types of roles a person can have in a movie (e.g., actor, director, writer). + - Fields: + - id (str): Unique identifier. + - role (str): Role description (e.g., "actor", "director", "writer", "producer"). diff --git a/query_imdb/db_description_withhint.txt b/query_imdb/db_description_withhint.txt new file mode 100644 index 000000000..326796b33 --- /dev/null +++ b/query_imdb/db_description_withhint.txt @@ -0,0 +1,2 @@ +HINTS: +- Many identifier columns in both databases use non-standard string encodings rather than plain integers. 
To join across tables, you must extract the embedded numeric ID by stripping the alphabetic prefix, any punctuation padding characters, and any leading zeros. For example, `tt0000042` → `42`, `nm001` → `1`, `InfT~~7` → `7`. This applies to the following columns: title.id, company_name.id, movie_companies.id, name.id, cast_info.person_id, keyword.id, movie_keyword.keyword_id, info_type.id, movie_info.info_type_id, and role_type.id. \ No newline at end of file diff --git a/query_imdb/query1/ground_truth.csv b/query_imdb/query1/ground_truth.csv new file mode 100644 index 000000000..4bfcab02f --- /dev/null +++ b/query_imdb/query1/ground_truth.csv @@ -0,0 +1,2 @@ +movie_kind,complete_us_internet_movie +movie,Dirt Merchant diff --git a/query_imdb/query1/query.json b/query_imdb/query1/query.json new file mode 100644 index 000000000..892c52830 --- /dev/null +++ b/query_imdb/query1/query.json @@ -0,0 +1 @@ +"Among movies, TV movies, video movies, and video games produced after 1990 that are tagged with at least one keyword, are associated with a US company, have a US internet release date recorded in the 1990s or 2000s, and have a complete and verified cast listing — what is the alphabetically first title, and what type of title is it?" \ No newline at end of file diff --git a/query_imdb/query1/validate.py b/query_imdb/query1/validate.py new file mode 100644 index 000000000..9840778dd --- /dev/null +++ b/query_imdb/query1/validate.py @@ -0,0 +1,22 @@ +import re + + +def normalize(text): + """Lower-case text, replace every character that is not a-z, 0-9 or whitespace with a space, and collapse whitespace runs.""" + text = re.sub(r"[^a-z0-9\s]", " ", text.lower()) + return re.sub(r"\s+", " ", text).strip() + + +def validate(llm_output: str): + """Return (passed, message); passed is True only if every ground-truth value is found in llm_output.""" + llm_norm = normalize(llm_output) + + # movie_kind: 'movie' — must stand alone; 'tv movie', 'video movie' and the plural 'movies' are different answers + if not re.search(r"(?<!tv )(?<!video )\bmovie\b", llm_norm): + return False, "Kind 'movie' not found in LLM output." + + # complete_us_internet_movie: 'Dirt Merchant' + if normalize("Dirt Merchant") not in llm_norm: + return False, "Title 'Dirt Merchant' not found in LLM output." + + return True, "Ground truth found in LLM output."
diff --git a/query_imdb/query10/ground_truth.csv b/query_imdb/query10/ground_truth.csv new file mode 100644 index 000000000..f6f20caf5 --- /dev/null +++ b/query_imdb/query10/ground_truth.csv @@ -0,0 +1,2 @@ +cast_member,complete_dynamic_hero_movie +"Abell, Alistair",...And Then I... diff --git a/query_imdb/query10/query.json b/query_imdb/query10/query.json new file mode 100644 index 000000000..f8905f550 --- /dev/null +++ b/query_imdb/query10/query.json @@ -0,0 +1 @@ +"Among cast members who played a character whose name is not null and contains 'man' or 'Man', in titles of kind 'movie' produced after 2000, tagged with at least one of the keywords 'superhero', 'marvel-comics', 'based-on-comic', 'tv-special', 'fight', 'violence', 'magnet', 'web', 'claw', or 'laser', and with a cast listing whose completeness status contains 'complete', where the keyword tag, cast credit, and cast completeness record all refer to the same movie — what are the alphabetically first cast member name and the alphabetically first title?" \ No newline at end of file diff --git a/query_imdb/query10/validate.py b/query_imdb/query10/validate.py new file mode 100644 index 000000000..f783f6e7d --- /dev/null +++ b/query_imdb/query10/validate.py @@ -0,0 +1,20 @@ +import re + + +def normalize(text): + text = re.sub(r"[^a-z0-9\s]", " ", text.lower()) + return re.sub(r"\s+", " ", text).strip() + + +def validate(llm_output: str): + llm_norm = normalize(llm_output) + + # cast_member: 'Abell, Alistair' + if normalize("Abell, Alistair") not in llm_norm: + return False, "Cast member 'Abell, Alistair' not found in LLM output." + + # complete_dynamic_hero_movie: '...And Then I...' — normalize to 'and then i' + if normalize("And Then I") not in llm_norm: + return False, "Title '...And Then I...' not found in LLM output." + + return True, "Ground truth found in LLM output." 
diff --git a/query_imdb/query2/ground_truth.csv b/query_imdb/query2/ground_truth.csv new file mode 100644 index 000000000..1c8cc0871 --- /dev/null +++ b/query_imdb/query2/ground_truth.csv @@ -0,0 +1,2 @@ +writer_pseudo_name,movie_title +"""A.J.""",#1 Cheerleader Camp diff --git a/query_imdb/query2/query.json b/query_imdb/query2/query.json new file mode 100644 index 000000000..de4165455 --- /dev/null +++ b/query_imdb/query2/query.json @@ -0,0 +1 @@ +"Among writers who have a registered pseudonym and are credited on movies associated with a US company, what is the alphabetically first pseudonym and the alphabetically first movie title?" \ No newline at end of file diff --git a/query_imdb/query2/validate.py b/query_imdb/query2/validate.py new file mode 100644 index 000000000..d2268b7af --- /dev/null +++ b/query_imdb/query2/validate.py @@ -0,0 +1,23 @@ +import re + + +def normalize(text): + """Lower-case text, replace every character that is not a-z, 0-9 or whitespace with a space, and collapse whitespace runs.""" + text = re.sub(r"[^a-z0-9\s]", " ", text.lower()) + return re.sub(r"\s+", " ", text).strip() + + +def validate(llm_output: str): + """Return (passed, message); passed is True only if every ground-truth value is found in llm_output.""" + llm_norm = normalize(llm_output) + llm_lower = llm_output.lower() + + # writer_pseudo_name: '"A.J."' — accept 'a.j.' verbatim, or the standalone token 'aj' / 'a j' (never a fragment of a longer word such as 'major') + if "a.j." not in llm_lower and not re.search(r"\ba ?j\b", llm_norm): + return False, "Pseudonym 'A.J.' not found in LLM output." + + # movie_title: '#1 Cheerleader Camp' + if normalize("#1 Cheerleader Camp") not in llm_norm: + return False, "Title '#1 Cheerleader Camp' not found in LLM output." + + return True, "Ground truth found in LLM output."
diff --git a/query_imdb/query3/ground_truth.csv b/query_imdb/query3/ground_truth.csv new file mode 100644 index 000000000..1ae1afe29 --- /dev/null +++ b/query_imdb/query3/ground_truth.csv @@ -0,0 +1,2 @@ +voicing_actress,jap_engl_voiced_movie +"Aaron, Caroline",$9.99 diff --git a/query_imdb/query3/query.json b/query_imdb/query3/query.json new file mode 100644 index 000000000..c6463a40f --- /dev/null +++ b/query_imdb/query3/query.json @@ -0,0 +1 @@ +"Among female actresses credited with a voice role — including general voice, uncredited voice, Japanese dubbed version, or English dubbed version — in movies produced after 2000 that are associated with a US company and have a release date on record, where the actress has a registered alternate name and played a named character, what is the alphabetically first actress name and the alphabetically first movie title?" \ No newline at end of file diff --git a/query_imdb/query3/validate.py b/query_imdb/query3/validate.py new file mode 100644 index 000000000..274a8c374 --- /dev/null +++ b/query_imdb/query3/validate.py @@ -0,0 +1,23 @@ +import re + + +def normalize(text): + """Lower-case text, replace every character that is not a-z, 0-9 or whitespace with a space, and collapse whitespace runs.""" + text = re.sub(r"[^a-z0-9\s]", " ", text.lower()) + return re.sub(r"\s+", " ", text).strip() + + +def validate(llm_output: str): + """Return (passed, message); passed is True only if every ground-truth value is found in llm_output.""" + llm_norm = normalize(llm_output) + + # voicing_actress: 'Aaron, Caroline' + if normalize("Aaron, Caroline") not in llm_norm: + return False, "Actress name 'Aaron, Caroline' not found in LLM output." + + # jap_engl_voiced_movie: '$9.99' — a title, not a measurement: require the literal token '9.99' (a float tolerance also let '9.98' and '10.00' through) + matches = re.findall(r"(?<![\d.])9\.99(?!\d)", llm_output) + if not matches: + return False, "Movie title '$9.99' not found in LLM output." + + return True, "Ground truth found in LLM output."
diff --git a/query_imdb/query4/ground_truth.csv b/query_imdb/query4/ground_truth.csv new file mode 100644 index 000000000..d40713424 --- /dev/null +++ b/query_imdb/query4/ground_truth.csv @@ -0,0 +1,2 @@ +member_in_charnamed_movie,a1 +"Z'Dar, Robert","Z'Dar, Robert" diff --git a/query_imdb/query4/query.json b/query_imdb/query4/query.json new file mode 100644 index 000000000..5e2f6b90c --- /dev/null +++ b/query_imdb/query4/query.json @@ -0,0 +1 @@ +"Among cast and crew members whose name starts with 'Z' and who are credited in movies tagged with the keyword 'character-name-in-title' — indicating the movie's title contains a character's name — that are associated with at least one company, what is the alphabetically first name?" \ No newline at end of file diff --git a/query_imdb/query4/validate.py b/query_imdb/query4/validate.py new file mode 100644 index 000000000..0b60f0db6 --- /dev/null +++ b/query_imdb/query4/validate.py @@ -0,0 +1,18 @@ +import re + + +def normalize(text): + """Lower-case text, replace every character that is not a-z, 0-9 or whitespace with a space, and collapse whitespace runs.""" + text = re.sub(r"[^a-z0-9\s]", " ", text.lower()) + return re.sub(r"\s+", " ", text).strip() + + +def validate(llm_output: str): + """Return (passed, message); passed is True only if the ground-truth name is found in llm_output.""" + llm_norm = normalize(llm_output) + + # member_in_charnamed_movie / a1: "Z'Dar, Robert" — normalizes to 'z dar robert' (the apostrophe becomes a space) + if normalize("Z'Dar, Robert") not in llm_norm: + return False, "Name 'Z'Dar, Robert' not found in LLM output." + + return True, "Ground truth found in LLM output."
diff --git a/query_imdb/query5/ground_truth.csv b/query_imdb/query5/ground_truth.csv new file mode 100644 index 000000000..af17cd41a --- /dev/null +++ b/query_imdb/query5/ground_truth.csv @@ -0,0 +1,2 @@ +producing_company,rating,movie +"""O"" Films",1.0,#54 Meets #47 diff --git a/query_imdb/query5/query.json b/query_imdb/query5/query.json new file mode 100644 index 000000000..6f07e4879 --- /dev/null +++ b/query_imdb/query5/query.json @@ -0,0 +1 @@ +"Among titles of kind 'movie' (excluding TV movies, video movies, and other title types) that have a release date and a rating on record and are associated with a US production company, what are the alphabetically first company name, the alphabetically first rating value, and the alphabetically first title?" \ No newline at end of file diff --git a/query_imdb/query5/validate.py b/query_imdb/query5/validate.py new file mode 100644 index 000000000..2916413bf --- /dev/null +++ b/query_imdb/query5/validate.py @@ -0,0 +1,25 @@ +import re + + +def normalize(text): + text = re.sub(r"[^a-z0-9\s]", " ", text.lower()) + return re.sub(r"\s+", " ", text).strip() + + +def validate(llm_output: str): + llm_norm = normalize(llm_output) + + # producing_company: '"O" Films' — normalize to 'o films' + if normalize('"O" Films') not in llm_norm: + return False, "Company '\"O\" Films' not found in LLM output." + + # rating: '1.0' — numeric check + matches = re.findall(r"\d+\.\d+", llm_output) + if not any(abs(float(m) - 1.0) < 0.01 for m in matches): + return False, "Rating '1.0' not found in LLM output." + + # movie: '#54 Meets #47' — normalize to '54 meets 47' + if normalize("#54 Meets #47") not in llm_norm: + return False, "Title '#54 Meets #47' not found in LLM output." + + return True, "Ground truth found in LLM output." 
diff --git a/query_imdb/query6/ground_truth.csv b/query_imdb/query6/ground_truth.csv new file mode 100644 index 000000000..148e7e565 --- /dev/null +++ b/query_imdb/query6/ground_truth.csv @@ -0,0 +1,2 @@ +voiced_char,voicing_actress,voiced_animation +Lola,"Andrews, Julie",Hoodwinked! diff --git a/query_imdb/query6/query.json b/query_imdb/query6/query.json new file mode 100644 index 000000000..116c5e401 --- /dev/null +++ b/query_imdb/query6/query.json @@ -0,0 +1 @@ +"Among female actresses whose name contains 'An', who have a registered alternate name, played a named character, and have trivia on record, and who are credited with a voice role (general, uncredited, Japanese version, or English version) in a computer-animated film produced between 2000 and 2010 with a Japan or USA release in the 2000s, associated with a US company, and with a complete and verified cast listing — what are the alphabetically first character name, the alphabetically first actress name, and the alphabetically first title?" \ No newline at end of file diff --git a/query_imdb/query6/validate.py b/query_imdb/query6/validate.py new file mode 100644 index 000000000..34dd8ff5f --- /dev/null +++ b/query_imdb/query6/validate.py @@ -0,0 +1,24 @@ +import re + + +def normalize(text): + text = re.sub(r"[^a-z0-9\s]", " ", text.lower()) + return re.sub(r"\s+", " ", text).strip() + + +def validate(llm_output: str): + llm_norm = normalize(llm_output) + + # voiced_char: 'Lola' + if "lola" not in llm_norm: + return False, "Character name 'Lola' not found in LLM output." + + # voicing_actress: 'Andrews, Julie' + if normalize("Andrews, Julie") not in llm_norm: + return False, "Actress name 'Andrews, Julie' not found in LLM output." + + # voiced_animation: 'Hoodwinked!' + if "hoodwinked" not in llm_norm: + return False, "Title 'Hoodwinked!' not found in LLM output." + + return True, "Ground truth found in LLM output." 
diff --git a/query_imdb/query7/ground_truth.csv b/query_imdb/query7/ground_truth.csv new file mode 100644 index 000000000..f6a4aeeb7 --- /dev/null +++ b/query_imdb/query7/ground_truth.csv @@ -0,0 +1,2 @@ +cool_actor_pseudonym,series_named_after_char +"!!!, Toy",& Teller diff --git a/query_imdb/query7/query.json b/query_imdb/query7/query.json new file mode 100644 index 000000000..9c790a0a9 --- /dev/null +++ b/query_imdb/query7/query.json @@ -0,0 +1 @@ +"Among cast and crew members who have a registered pseudonym and are credited in movies tagged with the keyword 'character-name-in-title' — indicating the movie's title contains a character's name — that are associated with a US company, what are the alphabetically first pseudonym and the alphabetically first title?" \ No newline at end of file diff --git a/query_imdb/query7/validate.py b/query_imdb/query7/validate.py new file mode 100644 index 000000000..7e48264de --- /dev/null +++ b/query_imdb/query7/validate.py @@ -0,0 +1,20 @@ +import re + + +def normalize(text): + text = re.sub(r"[^a-z0-9\s]", " ", text.lower()) + return re.sub(r"\s+", " ", text).strip() + + +def validate(llm_output: str): + llm_norm = normalize(llm_output) + + # cool_actor_pseudonym: '!!!, Toy' — normalize to 'toy' + if not re.search(r"\btoy\b", llm_norm): + return False, "Pseudonym '!!!, Toy' not found in LLM output." + + # series_named_after_char: '& Teller' — normalize to 'teller' + if not re.search(r"\bteller\b", llm_norm): + return False, "Title '& Teller' not found in LLM output." + + return True, "Ground truth found in LLM output." 
diff --git a/query_imdb/query8/ground_truth.csv b/query_imdb/query8/ground_truth.csv new file mode 100644 index 000000000..a4f26d601 --- /dev/null +++ b/query_imdb/query8/ground_truth.csv @@ -0,0 +1,2 @@ +alternative_name,voiced_char_name,voicing_actress,american_movie +"!!!, Toy","""Cockamamie's"" Salesgirl","Aaron, Caroline","$15,000.00 Error" diff --git a/query_imdb/query8/query.json b/query_imdb/query8/query.json new file mode 100644 index 000000000..a4c918cbe --- /dev/null +++ b/query_imdb/query8/query.json @@ -0,0 +1 @@ +"Among female actresses credited with a voice role — including general voice, uncredited voice, Japanese version, or English version — who have a registered alternate name (with the alternate name belonging to the same person holding the credit) and played a named character, in movies associated with a US company, what are the alphabetically first alternate name, the alphabetically first character name, the alphabetically first actress name, and the alphabetically first title?" \ No newline at end of file diff --git a/query_imdb/query8/validate.py b/query_imdb/query8/validate.py new file mode 100644 index 000000000..5a497e099 --- /dev/null +++ b/query_imdb/query8/validate.py @@ -0,0 +1,32 @@ +import re + + +def normalize(text): + """Drop thousands-separator commas, lower-case, replace every character that is not a-z, 0-9 or whitespace with a space, and collapse whitespace runs.""" + # strip thousands-separator commas, then remove non-alphanumeric + text = re.sub(r"(?<=\d),(?=\d{3}\b)", "", text) + text = re.sub(r"[^a-z0-9\s]", " ", text.lower()) + return re.sub(r"\s+", " ", text).strip() + + +def validate(llm_output: str): + """Return (passed, message); passed is True only if every ground-truth value is found in llm_output.""" + llm_norm = normalize(llm_output) + + # alternative_name: '!!!, Toy' — normalize to 'toy' + if not re.search(r"\btoy\b", llm_norm): + return False, "Alternate name '!!!, Toy' not found in LLM output." + + # voiced_char_name: '"Cockamamie\'s" Salesgirl' — normalizes to 'cockamamie s salesgirl' (the apostrophe becomes a space); 'cockamamies salesgirl' is accepted too + if not re.search(r"\bcockamamie ?s salesgirl\b", llm_norm): + return False, "Character name 'Cockamamie's Salesgirl' not found in LLM output."
+ + # voicing_actress: 'Aaron, Caroline' + if normalize("Aaron, Caroline") not in llm_norm: + return False, "Actress name 'Aaron, Caroline' not found in LLM output." + + # american_movie: '$15,000.00 Error' — normalize to '15000 00 error' + if "15000" not in llm_norm or "error" not in llm_norm: + return False, "Title '$15,000.00 Error' not found in LLM output." + + return True, "Ground truth found in LLM output." diff --git a/query_imdb/query9/ground_truth.csv b/query_imdb/query9/ground_truth.csv new file mode 100644 index 000000000..954366dc3 --- /dev/null +++ b/query_imdb/query9/ground_truth.csv @@ -0,0 +1,2 @@ +movie_budget,movie_votes,writer,violent_liongate_movie +Horror,1040,"Agnew, Jim",2001 Maniacs diff --git a/query_imdb/query9/query.json b/query_imdb/query9/query.json new file mode 100644 index 000000000..945b9e242 --- /dev/null +++ b/query_imdb/query9/query.json @@ -0,0 +1 @@ +"Among male writers credited in a writing role — writer, head writer, written by, story, or story editor — on Horror or Thriller movies that are tagged with at least one of the keywords 'murder', 'violence', 'blood', 'gore', 'death', 'female-nudity', or 'hospital', have a vote count on record, and are associated with a Lionsgate company, where the genre, vote count, keyword, company, and writer credit all refer to the same movie — what are the alphabetically first genre, the alphabetically first vote count, the alphabetically first writer name, and the alphabetically first title?"
\ No newline at end of file diff --git a/query_imdb/query9/validate.py b/query_imdb/query9/validate.py new file mode 100644 index 000000000..fc41e642d --- /dev/null +++ b/query_imdb/query9/validate.py @@ -0,0 +1,31 @@ +import re + + +def normalize(text): + """Lower-case text, replace every character that is not a-z, 0-9 or whitespace with a space, and collapse whitespace runs.""" + text = re.sub(r"[^a-z0-9\s]", " ", text.lower()) + return re.sub(r"\s+", " ", text).strip() + + +def validate(llm_output: str): + """Return (passed, message); passed is True only if every ground-truth value is found in llm_output.""" + llm_norm = normalize(llm_output) + + # movie_budget (genre): 'Horror' + if "horror" not in llm_norm: + return False, "Genre 'Horror' not found in LLM output." + + # movie_votes: '1040' — integer check; thousands-separator commas are dropped first so '1,040' also counts + matches = re.findall(r"\b\d+\b", re.sub(r"(?<=\d),(?=\d{3}\b)", "", llm_output)) + if not any(int(m) == 1040 for m in matches): + return False, "Vote count '1040' not found in LLM output." + + # writer: 'Agnew, Jim' + if normalize("Agnew, Jim") not in llm_norm: + return False, "Writer name 'Agnew, Jim' not found in LLM output." + + # violent_liongate_movie: '2001 Maniacs' + if normalize("2001 Maniacs") not in llm_norm: + return False, "Title '2001 Maniacs' not found in LLM output." + + return True, "Ground truth found in LLM output."
diff --git a/query_imdb/query_dataset/movies.sql b/query_imdb/query_dataset/movies.sql new file mode 100644 index 000000000..7be2bed72 --- /dev/null +++ b/query_imdb/query_dataset/movies.sql @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ef5452628ec8d07fbdc283fdb420168bd97b718dda25648087f2fe178bb1daa +size 1622189538 diff --git a/query_imdb/query_dataset/people.sqlite b/query_imdb/query_dataset/people.sqlite new file mode 100644 index 000000000..4d1eff0c0 --- /dev/null +++ b/query_imdb/query_dataset/people.sqlite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba583402740f2f5ef42b5be7ac1dd4c47a0cac990ddf7332575e203b976c1b10 +size 2639507456 diff --git a/query_krama/db_config.yaml b/query_krama/db_config.yaml new file mode 100644 index 000000000..3fe8c90c4 --- /dev/null +++ b/query_krama/db_config.yaml @@ -0,0 +1,13 @@ +db_clients: + domain_docs: + db_type: mongo # Database type: MongoDB + db_name: domain_docs_db # Name of the MongoDB database + dump_folder: query_dataset/domain_docs # Folder with the MongoDB dump to initialize + + geo_database: + db_type: sqlite # Database type: SQLite + db_path: query_dataset/us_geo.db # Path to the SQLite file + + domain_assets: + db_type: sqlite # Database type: SQLite + db_path: query_dataset/domain_assets.db # Path to the SQLite file diff --git a/query_krama/db_description.txt b/query_krama/db_description.txt new file mode 100644 index 000000000..70a72c8d9 --- /dev/null +++ b/query_krama/db_description.txt @@ -0,0 +1,68 @@ +You are working with three data sources to solve this query. + +Here are the descriptions of these three data sources: + +1. domain_docs + - This database is stored in a MongoDB database and contains files from multiple domains including wildfire, environment, legal, biomedical, astronomy, and archeology. 
+ - This database consists of one collection: + - files + - Every document has one field: + - content (str): Full content as a plain string; may contain multiple files concatenated together + + +2. geo_database + - This database is stored in a SQLite database and contains geospatial and environmental data. Geometry columns store data in GeoPackage binary format. + - This database consists of three tables: + - gacc_2025 + - This table contains the geographic boundaries and metadata of NIFC Geographic Area Coordination Centers (GACCs). + - Fields: + - OBJECTID (int): Row identifier + - SHAPE (blob): Polygon geometry of the GACC boundary (GeoPackage binary format) + - GeometryID (text): Geometry identifier + - GACCUnitID (text): Unit identifier + - ContactPhone (text): Contact phone number + - Comments (text): Additional notes + - DateCurrent (text): Date last updated + - MapMethod (text): Mapping methodology used + - GlobalID (text): Global unique identifier + - GACCDescription (text): Natural-language description of the GACC name, abbreviation, and location + + - usa_adm2 + - This table contains the geographic boundaries and metadata of US counties. 
+ - Fields: + - fid (int): Row identifier + - geom (blob): Polygon geometry of the county (GeoPackage binary format) + - adm2_id (text): County identifier + - adm2_src (text): County source identifier + - adm2_name1, adm2_name2 (text): Alternate county names + - adm1_id (text): State identifier + - adm1_src (text): State source identifier + - adm1_name1, adm1_name2 (text): Alternate state names + - adm0_id (text): Country identifier + - adm0_src (text): Country source identifier + - adm0_name1, adm0_name2 (text): Alternate country names + - src_lvl (int): Source level + - src_lang (text): Source language + - src_lang1, src_lang2 (text): Alternate source languages + - src_date, src_update (text): Source date and last update date + - src_name, src_lic, src_url, src_grp (text): Source metadata + - adm_description (text): Natural-language description combining county, state, and country names + + - beach_water_temperature + - This table contains water and air temperature readings for Massachusetts beaches, with one row per measurement. + - Fields: + - community_code (text): Community code + - county_code (text): County code + - beach_name (text): Beach name + - beach_type_description (text): Beach type + - sample_date (text): Sampling date + - water_temp (text): Water temperature + - air_temp (text): Air temperature + +3. domain_assets + - This database is stored in a SQLite database and contains binary files from multiple domains stored as blobs. 
+ - This database consists of one table: + - files + - Every row has two fields: + - file_description (text): Natural-language description of the file name, file type, and domain + - content (blob): Full file content as a binary blob diff --git a/query_krama/db_description_withhint.txt b/query_krama/db_description_withhint.txt new file mode 100644 index 000000000..5f9055dff --- /dev/null +++ b/query_krama/db_description_withhint.txt @@ -0,0 +1,16 @@ +HINTS: +- Domain contents: + - wildfire (NIFC stats, fire weather, AQI, suppression costs, demographics, Census population estimates); + - environment (MA beach water quality records 2002–2023, datasheets, EJ populations, precipitation); + - legal (FTC CSN 2024 CSVs on identity theft/fraud by MSA, age, type; HTML MSA population table); + - biomedical (hyperactivated proteins per UCEC patient CSV; CPTAC3 mmc1–mmc7 tables); + - astronomy (sunspot numbers, Starlink OMM, NOAA geomagnetic forecasts, Swarm-B density, OMNI2 solar wind/Kp 2023–2024, TLE for NORAD 43180/48445, SP3 orbit files, CDF); + - archeology (Brecke conflict dataset, Roman/world cities, climate records, radiocarbon database). +- Bacterial exceedance rate: (violations / total samples) × 100. +- Beach join key: (community_code, county_code, beach_name); CSV files use plain integers, beach_water_temperature table (in geo_database) uses prefixed strings (e.g., "COM-001"). +- MSA name matching: use city-and-state portion only; treat em dashes, double hyphens, and hyphens as equivalent. +- UCEC patient ID join: spreadsheet uses zero-padded uppercase (e.g., S001); hyperactivated CSV uses lowercase with underscore (e.g., s_1). +- TLE BSTAR: compact notation ±NNNNN±E = ±0.NNNNN × 10^E (e.g., 20621-3 = 2.0621 × 10⁻⁴). +- beach_water_temperature.sample_date is stored in varied formats (e.g., "2002-10-20", "10/20/2002", "October 20, 2002", "20/10/2002") +- Temperature unit conversion: °F = °C × 9/5 + 32; K = °C + 273.15. 
+- In domain_assets, different file_type or file_domain values may refer to the same type or domain (e.g., "python script", "Python", and "PYTHON CODE" all mean the same file type; "astrophysics", "Space Science", and "ASTRONOMY" all mean the same domain). diff --git a/query_krama/query1/ground_truth.csv b/query_krama/query1/ground_truth.csv new file mode 100644 index 000000000..ea0f91e2b --- /dev/null +++ b/query_krama/query1/ground_truth.csv @@ -0,0 +1 @@ +NWCC \ No newline at end of file diff --git a/query_krama/query1/query.json b/query_krama/query1/query.json new file mode 100644 index 000000000..0a48926f1 --- /dev/null +++ b/query_krama/query1/query.json @@ -0,0 +1 @@ +"According to NIFC, among the regions that have a corresponding NIFC Geographic Area Coordination Center, which GACC accumulated the most human-caused wildfire acres from 2020 to 2023 (inclusive at both ends)? Give the GACC abbreviation." \ No newline at end of file diff --git a/query_krama/query1/validate.py b/query_krama/query1/validate.py new file mode 100644 index 000000000..7dd2d86be --- /dev/null +++ b/query_krama/query1/validate.py @@ -0,0 +1,14 @@ +import re + + +def validate(llm_output: str): + expected = "NWCC" + + if expected.upper() in llm_output.upper(): + return True, f"Found expected GACC abbreviation: {expected}" + + gacc_abbrevs = ["AICC", "NRCC", "NWCC", "RMCC", "SACC", "SWCC", "ONCC", "OSCC", "EACC", "GBCC"] + found = [a for a in gacc_abbrevs if re.search(r'\b' + a + r'\b', llm_output, re.IGNORECASE)] + if found: + return False, f"Found GACC abbreviations {found}, but expected '{expected}'" + return False, "No GACC abbreviation found in LLM output" diff --git a/query_krama/query10/ground_truth.csv b/query_krama/query10/ground_truth.csv new file mode 100644 index 000000000..7b0489f58 --- /dev/null +++ b/query_krama/query10/ground_truth.csv @@ -0,0 +1 @@ +0.000748 diff --git a/query_krama/query10/query.json b/query_krama/query10/query.json new file mode 100644 index 
000000000..7938f1271 --- /dev/null +++ b/query_krama/query10/query.json @@ -0,0 +1 @@ +"During the geomagnetic storm period captured in the space-track orbital observations for satellite NORAD 58214, what is the mean BSTAR drag coefficient of satellite NORAD 43180 as recorded in its TLE data? Round to 6 decimal places." \ No newline at end of file diff --git a/query_krama/query10/validate.py b/query_krama/query10/validate.py new file mode 100644 index 000000000..9c511ede7 --- /dev/null +++ b/query_krama/query10/validate.py @@ -0,0 +1,18 @@ +import re + + +def validate(llm_output: str): + expected = 0.000748 + tolerance = 5e-7 + + matches = re.findall(r"[-+]?\d+\.?\d*(?:[eE][-+]?\d+)?", llm_output) + for m in matches: + try: + if abs(float(m) - expected) <= tolerance: + return True, f"Found expected value: {expected}" + except ValueError: + continue + + if matches: + return False, f"Found numbers {matches[:5]}, but expected {expected} (±{tolerance})" + return False, "No numeric value found in LLM output" diff --git a/query_krama/query2/ground_truth.csv b/query_krama/query2/ground_truth.csv new file mode 100644 index 000000000..9d9831f4b --- /dev/null +++ b/query_krama/query2/ground_truth.csv @@ -0,0 +1 @@ +Idaho \ No newline at end of file diff --git a/query_krama/query2/query.json b/query_krama/query2/query.json new file mode 100644 index 000000000..7e96f3f25 --- /dev/null +++ b/query_krama/query2/query.json @@ -0,0 +1 @@ +"According to NIFC, from 2014 to 2024 (inclusive at both ends), which US state has the highest total number of lightning-caused wildfires across all NIFC Geographic Area Coordination Centers whose geographic boundaries intersect with that state? Give the full name of the state."
\ No newline at end of file diff --git a/query_krama/query2/validate.py b/query_krama/query2/validate.py new file mode 100644 index 000000000..9e2ed5bc5 --- /dev/null +++ b/query_krama/query2/validate.py @@ -0,0 +1,23 @@ +import re + + +def validate(llm_output: str): + expected = "Idaho" + + if re.search(r'\b' + re.escape(expected) + r'\b', llm_output, re.IGNORECASE): + return True, f"Found expected state: {expected}" + + us_states = [ + "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", + "Delaware", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", + "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", + "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", + "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", + "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", + "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia", + "Wisconsin", "Wyoming" + ] + found = [s for s in us_states if re.search(r'\b' + re.escape(s) + r'\b', llm_output, re.IGNORECASE)] + if found: + return False, f"Found states {found}, but expected '{expected}'" + return False, "No US state name found in LLM output" diff --git a/query_krama/query3/ground_truth.csv b/query_krama/query3/ground_truth.csv new file mode 100644 index 000000000..8ecd0b524 --- /dev/null +++ b/query_krama/query3/ground_truth.csv @@ -0,0 +1 @@ +12964.8727 diff --git a/query_krama/query3/query.json b/query_krama/query3/query.json new file mode 100644 index 000000000..c4c92ff5d --- /dev/null +++ b/query_krama/query3/query.json @@ -0,0 +1 @@ +"What is the average number of reported identity thefts across all US metropolitan statistical areas with a 2023 population exceeding one million? Estimate each MSA's 2023 population by linearly interpolating between its 2020 census count and 2024 estimate. 
Only include MSAs that appear in both the population source and the identity theft source. Round to 4 decimal places." \ No newline at end of file diff --git a/query_krama/query3/validate.py b/query_krama/query3/validate.py new file mode 100644 index 000000000..dd475a547 --- /dev/null +++ b/query_krama/query3/validate.py @@ -0,0 +1,18 @@ +import re + + +def validate(llm_output: str): + expected = 12964.8727 + tolerance = 0.00005 + + matches = re.findall(r"[-+]?\d+\.?\d*", llm_output) + for m in matches: + try: + if abs(float(m) - expected) <= tolerance: + return True, f"Found expected value: {expected}" + except ValueError: + continue + + if matches: + return False, f"Found numbers {matches[:5]}, but expected {expected} (±{tolerance})" + return False, "No numeric value found in LLM output" diff --git a/query_krama/query4/ground_truth.csv b/query_krama/query4/ground_truth.csv new file mode 100644 index 000000000..4f0337c24 --- /dev/null +++ b/query_krama/query4/ground_truth.csv @@ -0,0 +1 @@ +319655 diff --git a/query_krama/query4/query.json b/query_krama/query4/query.json new file mode 100644 index 000000000..267ce66ce --- /dev/null +++ b/query_krama/query4/query.json @@ -0,0 +1 @@ +"What is the absolute population change (in number of people) from 2020 to 2024 of the metropolitan statistical area with the highest reported identity theft rate per 100K population?" 
\ No newline at end of file diff --git a/query_krama/query4/validate.py b/query_krama/query4/validate.py new file mode 100644 index 000000000..3785ecf11 --- /dev/null +++ b/query_krama/query4/validate.py @@ -0,0 +1,17 @@ +import re + + +def validate(llm_output: str): + expected = 319655 + + matches = re.findall(r"[-+]?\d+", llm_output.replace(",", "")) + for m in matches: + try: + if int(m) == expected: + return True, f"Found expected value: {expected}" + except ValueError: + continue + + if matches: + return False, f"Found numbers {matches[:5]}, but expected {expected}" + return False, "No numeric value found in LLM output" diff --git a/query_krama/query5/ground_truth.csv b/query_krama/query5/ground_truth.csv new file mode 100644 index 000000000..f56a06635 --- /dev/null +++ b/query_krama/query5/ground_truth.csv @@ -0,0 +1 @@ +20.02 diff --git a/query_krama/query5/query.json b/query_krama/query5/query.json new file mode 100644 index 000000000..458b49db6 --- /dev/null +++ b/query_krama/query5/query.json @@ -0,0 +1 @@ +"From 2012 to 2018 (inclusive at both ends), what is the average water temperature (in °C) of the beach that had the most water samples that failed to meet bacterial water quality requirements? Consider only temperature readings collected within the same period. Round to 2 decimal places." 
\ No newline at end of file diff --git a/query_krama/query5/validate.py b/query_krama/query5/validate.py new file mode 100644 index 000000000..e4d20be61 --- /dev/null +++ b/query_krama/query5/validate.py @@ -0,0 +1,18 @@ +import re + + +def validate(llm_output: str): + expected = 20.02 + tolerance = 0.1 + + matches = re.findall(r"[-+]?\d+\.?\d*", llm_output) + for m in matches: + try: + if abs(float(m) - expected) <= tolerance: + return True, f"Found expected value: {expected}" + except ValueError: + continue + + if matches: + return False, f"Found numbers {matches[:5]}, but expected {expected} (±{tolerance})" + return False, "No numeric value found in LLM output" diff --git a/query_krama/query6/ground_truth.csv b/query_krama/query6/ground_truth.csv new file mode 100644 index 000000000..3db206e01 --- /dev/null +++ b/query_krama/query6/ground_truth.csv @@ -0,0 +1 @@ +3.88 diff --git a/query_krama/query6/query.json b/query_krama/query6/query.json new file mode 100644 index 000000000..0f2a78058 --- /dev/null +++ b/query_krama/query6/query.json @@ -0,0 +1 @@ +"From 2008 to 2016 (inclusive at both ends), what is the bacterial exceedance rate (as a percentage, rounded to 2 decimal places) of the beach with the lowest average water temperature among all temperature readings collected in that period?" 
\ No newline at end of file diff --git a/query_krama/query6/validate.py b/query_krama/query6/validate.py new file mode 100644 index 000000000..268cfd42c --- /dev/null +++ b/query_krama/query6/validate.py @@ -0,0 +1,18 @@ +import re + + +def validate(llm_output: str): + expected = 3.88 + tolerance = 0.1 + + matches = re.findall(r"[-+]?\d+\.?\d*", llm_output) + for m in matches: + try: + if abs(float(m) - expected) <= tolerance: + return True, f"Found expected value: {expected}" + except ValueError: + continue + + if matches: + return False, f"Found numbers {matches[:5]}, but expected {expected} (±{tolerance})" + return False, "No numeric value found in LLM output" diff --git a/query_krama/query7/ground_truth.csv b/query_krama/query7/ground_truth.csv new file mode 100644 index 000000000..1f8d809e7 --- /dev/null +++ b/query_krama/query7/ground_truth.csv @@ -0,0 +1 @@ +60.0 diff --git a/query_krama/query7/query.json b/query_krama/query7/query.json new file mode 100644 index 000000000..c64088e86 --- /dev/null +++ b/query_krama/query7/query.json @@ -0,0 +1 @@ +"Using the patient metadata recorded in the UCEC_CPTAC3_meta_table_V2.1 sheet, what is the average age of patients whose hyperactivated protein is CDK12? Round to 2 decimal places." 
\ No newline at end of file diff --git a/query_krama/query7/validate.py b/query_krama/query7/validate.py new file mode 100644 index 000000000..6927e407e --- /dev/null +++ b/query_krama/query7/validate.py @@ -0,0 +1,18 @@ +import re + + +def validate(llm_output: str): + expected = 60.0 + tolerance = 0.005 + + matches = re.findall(r"[-+]?\d+\.?\d*", llm_output) + for m in matches: + try: + if abs(float(m) - expected) <= tolerance: + return True, f"Found expected value: {expected}" + except ValueError: + continue + + if matches: + return False, f"Found numbers {matches[:5]}, but expected {expected} (±{tolerance})" + return False, "No numeric value found in LLM output" diff --git a/query_krama/query8/ground_truth.csv b/query_krama/query8/ground_truth.csv new file mode 100644 index 000000000..959eed85c --- /dev/null +++ b/query_krama/query8/ground_truth.csv @@ -0,0 +1 @@ +PFN1 diff --git a/query_krama/query8/query.json b/query_krama/query8/query.json new file mode 100644 index 000000000..0044fa300 --- /dev/null +++ b/query_krama/query8/query.json @@ -0,0 +1 @@ +"Among patients with Serous histologic type who were not excluded from the study, as recorded in the UCEC_CPTAC3_meta_table_V2.1 sheet, what is the hyperactivated protein of the oldest patient?" 
\ No newline at end of file diff --git a/query_krama/query8/validate.py b/query_krama/query8/validate.py new file mode 100644 index 000000000..614083c1a --- /dev/null +++ b/query_krama/query8/validate.py @@ -0,0 +1,14 @@ +import re + + +def validate(llm_output: str): + expected = "PFN1" + + if re.search(r'\b' + re.escape(expected) + r'\b', llm_output, re.IGNORECASE): + return True, f"Found expected protein: {expected}" + + protein_pattern = r'\b[A-Z][A-Z0-9]{1,9}\b' + found = re.findall(protein_pattern, llm_output) + if found: + return False, f"Found protein-like tokens {found[:5]}, but expected '{expected}'" + return False, "No protein identifier found in LLM output" diff --git a/query_krama/query9/ground_truth.csv b/query_krama/query9/ground_truth.csv new file mode 100644 index 000000000..9bf49e7b5 --- /dev/null +++ b/query_krama/query9/ground_truth.csv @@ -0,0 +1 @@ +2.4431 diff --git a/query_krama/query9/query.json b/query_krama/query9/query.json new file mode 100644 index 000000000..83392a95c --- /dev/null +++ b/query_krama/query9/query.json @@ -0,0 +1 @@ +"What is the mean Kp index, according to the 2024 OMNI2 low-resolution hourly data, for the calendar month in 2024 during which Swarm-B recorded its highest mean thermospheric density? Round to 4 decimal places." 
\ No newline at end of file diff --git a/query_krama/query9/validate.py b/query_krama/query9/validate.py new file mode 100644 index 000000000..46704c811 --- /dev/null +++ b/query_krama/query9/validate.py @@ -0,0 +1,18 @@ +import re + + +def validate(llm_output: str): + expected = 2.4431 + tolerance = 0.00005 + + matches = re.findall(r"[-+]?\d+\.?\d*", llm_output) + for m in matches: + try: + if abs(float(m) - expected) <= tolerance: + return True, f"Found expected value: {expected}" + except ValueError: + continue + + if matches: + return False, f"Found numbers {matches[:5]}, but expected {expected} (±{tolerance})" + return False, "No numeric value found in LLM output" diff --git a/query_krama/query_dataset/domain_assets.db b/query_krama/query_dataset/domain_assets.db new file mode 100644 index 000000000..59f47913f --- /dev/null +++ b/query_krama/query_dataset/domain_assets.db @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65ab088f0e0807b5fb9b927d016c6325fe7395f0ce61d7dc81c2af07a5d1be44 +size 160579584 diff --git a/query_krama/query_dataset/domain_docs/domain_docs_db/files.bson b/query_krama/query_dataset/domain_docs/domain_docs_db/files.bson new file mode 100644 index 000000000..a0a6674b9 --- /dev/null +++ b/query_krama/query_dataset/domain_docs/domain_docs_db/files.bson @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0570115c19f0e0c13011d84670550812c91e9e980ebc67b79ca11668ffa7c436 +size 477697706 diff --git a/query_krama/query_dataset/us_geo.db b/query_krama/query_dataset/us_geo.db new file mode 100644 index 000000000..d55f05c84 --- /dev/null +++ b/query_krama/query_dataset/us_geo.db @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf442d08de0ffc396d8b750597dba7d0df6659fac44af98ec53b73ae7a31514c +size 28504064