diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d0e7e45 --- /dev/null +++ b/.gitignore @@ -0,0 +1,23 @@ +# This .gitignore file specified the files to exclude from the git project. +# + +# Pycharm Files +.idea + +# VS Code Files +.vscode + +# Python Files +*.pyc +*.pyo + +# Pyinstaller Files +/build +/dist + +# Packing files +/sqlite_dissect.egg-info + +# Other +/output +/log diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..338d00f --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,11 @@ +# Change Log + +v0.0.6 (2021-07-29) +------------------ + +- Initial external release of application and source code +- Parsing and recovery of SQLite database and WAL files +- Started documentation of classes in README.md files with Mermaid +- Added PyInstaller scripts and builds for windows and linux +- Incorporated output options for SQLite, XLSX, and CSV +- Added initial beta carving of journal files diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..134856b --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,23 @@ +DC3 SQLite Dissect Open Source License + +DC3 SQLite Dissect software was developed by the Department of Defense Cyber +Crime Center (DC3). By delegated authority pursuant to Section 801(b) of Public Law +113-66, DC3 grants the following license for this software: + +Permission is hereby granted, free of charge, to any person obtaining a copy of this +software and associated documentation files (the "Software"), to deal in the Software +without restriction, including without limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons +to whom the Software is furnished to do so, subject to the following condition: + +The above permission notice and the below warranty notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE DEVELOPERS, OR LICENSORS +BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..8e0e8f4 --- /dev/null +++ b/README.md @@ -0,0 +1,333 @@ +# DC3 SQLite Dissect + +#### Version 0.0.6 + +usage:
+ + sqlite_dissect [-h] [-v] [-d OUTPUT_DIRECTORY] [-p FILE_PREFIX] + [-e EXPORT_TYPE] [-n | -w WAL | -j ROLLBACK_JOURNAL] [-r EXEMPTED_TABLES] + [-s | -t] [-g] [-c] [-f] [-b TABLES] [-k] [-l LOG_LEVEL] [-i LOG_FILE] [--warnings] + SQLITE_FILE + +SQLite Dissect is a SQLite parser with recovery abilities over SQLite databases +and their accompanying journal files. If no options are set other than the file +name, the default behaviour is to check for any journal files and print the +output of the SQLite files to the console. The directory of the specified SQLite +file will be searched to find the associated journal files. If they are not in +the same directory as the specified file, they will not be found and their +locations will need to be specified in the command. SQLite carving is not done +by default. Please see the options below to enable carving. + +#### positional arguments: + +SQLITE_FILE +            +The SQLite database file +

+ +#### optional arguments: + +-h, --help +           +show this help message and exit +
+ +-v, --version +           +display the version of SQLite Dissect +
+ +-d OUTPUT_DIRECTORY, --directory OUTPUT_DIRECTORY +           +directory to write output to (must be specified for outputs other than console text) +
+ +-p FILE_PREFIX, --file-prefix FILE_PREFIX +           +the file prefix to use on output files, default is the name of the SQLite file +(the directory for output must be specified) +
+ +-e EXPORT_TYPE, --export EXPORT_TYPE +           +the format to export to {text, csv, sqlite, xlsx} +(text written to console if -d is not specified) +
+ +-n, --no-journal +           +turn off automatic detection of journal files +
+ +-w WAL, --wal WAL +           +the WAL file to use instead of searching the SQLite file directory by default +
+ +-j ROLLBACK_JOURNAL, --rollback-journal ROLLBACK_JOURNAL +           +the rollback journal file to use instead of searching the SQLite file directory by default +(under development, currently only outputs to csv, output directory needs to be specified) +
+ +-r EXEMPTED_TABLES, --exempted-tables EXEMPTED_TABLES +           +comma-delimited string of tables \[table1,table2,table3\] to exempt +(only implemented and allowed for rollback journal parsing currently) ex.) table1,table2,table3 +
+ +-s, --schema +           +output the schema to console, the initial schema found in the main database file +
+ +-t, --schema-history +            +output the schema history to console, prints the --schema information and write-ahead log changes +
+ +-g, --signatures +           +output the signatures generated to console +
+ +-c, --carve +           +carves and recovers table data +
+ +-f, --carve-freelists +           +carves freelist pages (carving must be enabled, under development) +
+ +-b TABLES, --tables TABLES +            +specify a comma-delimited string of tables \[table1,table2,table3\] to carve +ex.) table1,table2,table3 +
+ +-k, --disable-strict-format-checking +           +disable strict format checks for SQLite databases +(this may result in improperly parsed SQLite files) +
+ +-l LOG_LEVEL, --log-level LOG_LEVEL +           +level to log messages at {critical, error, warning, info, debug, off} +
+ +-i LOG_FILE, --log-file LOG_FILE +            +log file to write to, default is to write to console, ignored +if the log level is set to off (appends if the file already exists) +
+ +--warnings +           +enable runtime warnings +

+ +### Example Usage: + +1. Print the version: + + + sqlite_dissect --version + +2. Parse a SQLite database and print the output to the screen: + + + sqlite_dissect [SQLITE_FILE] + + +3. Parse a SQLite database and print the schema history to a SQLite output file: + + + sqlite_dissect [SQLITE_FILE] --schema-history -d [OUTPUT_DIRECTORY] -e sqlite + +4. Parse a SQLite database and print the output to a SQLite file along with printing signatures and carving entries: + + + sqlite_dissect [SQLITE_FILE] --signatures -d [OUTPUT_DIRECTORY] -e sqlite --carve + +5. Parse a SQLite database and print the output to a SQLite file while carving entries, including freelists, for specific tables: + + + sqlite_dissect [SQLITE_FILE] -d [OUTPUT_DIRECTORY] -e sqlite --carve --carve-freelists -b [TABLES] + +6. Parse a SQLite database file and print the output to an xlsx workbook along with generating signatures and + carving entries. The schema history (schema updates throughout the WAL are included if a WAL file is detected) and + signatures will be printed to standard output. The log level will be set to debug and all log messages will be + output to the specified log file. + + + sqlite_dissect [SQLITE_FILE] -d [OUTPUT_DIRECTORY] -e xlsx --schema-history --carve --signatures --log-level debug -i [LOG_FILE] + +7. Parse a SQLite database file along with a specified rollback journal file and send the output to CSV files. + (CSV is the only output option currently implemented for rollback journal files.) + + + sqlite_dissect [SQLITE_FILE] -d [OUTPUT_DIRECTORY] -e csv --carve -j [ROLLBACK_JOURNAL] + +### Description + +This application focuses on carving by analyzing the allocated content within each of the SQLite +database tables and creating signatures. Where there is no content in the table, the signature +is based on analyzing the create table statement in the master schema table. The signature +contains the series of possible serial types that can be stored within the file for that table. +This signature is then applied to the unallocated content and freeblocks of the table b-tree in +the file. This includes both interior and leaf table b-tree pages for that table. The signatures +are only applied to the pages belonging to the particular b-tree they were generated from, since +initial research showed that pages, when created or pulled from the freelist set, are +overwritten with zeros in their unallocated portions. Fragments within the pages can be reported +on but, due to their size (<4 bytes), are not carved. Because entries are added to SQLite tables +starting from the end of the page and moving toward the beginning, the carving works +in the same direction in order to better detect previously partially overwritten entries. This +carving can also be applied to the set of freelist pages within the SQLite file if specified, +but the freelist pages are currently treated as sets of unallocated data, with the exception +of the freelist page metadata. + +The carving process does not currently account for index b-trees, as the more pertinent information +is included in the table b-trees. Additionally, there are some table b-trees that are not currently +supported. This includes tables that are "without row_id", virtual, or internal schema objects. +These are rarer use cases that generally do not offer as much as the +main tables do. By default all tables will be carved if they do not fall into one of these cases. +You can send in a specific list of tables to be carved.
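+
+The same carving flow is available programmatically through sqlite_dissect.interface. Below is a minimal sketch using
+the interface functions exercised in example.py later in this repository (create_database, create_table_signature,
+and carve_table); the file and table names are placeholders and WAL handling is omitted for brevity:
+
+    from sqlite_dissect.interface import create_database, create_table_signature, carve_table
+
+    # Placeholders: point these at the SQLite file and table of interest
+    database = create_database("FILE_NAME")
+    signature = create_table_signature("TABLE_NAME", database)
+
+    # Signature generation is skipped for "without rowid" and virtual tables
+    if signature:
+        carved_cells = carve_table("TABLE_NAME", signature, database)
+        print("Recovered {} carved cells.".format(len(carved_cells)))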
+ +This application is written in the hopes that many of the use cases mentioned above can be addressed in the future +and is scalable to those use cases. Although one specific type of signature is preferred by default +in the application, SQLite Dissect generates multiple versions of a signature and can eventually +support carving by specifying other signatures or providing your own. Since SQLite Dissect generates +the signature automatically from existing data within the SQLite files, there is no need to +supply SQLite Dissect with a signature for a particular schema or application. This could be implemented, +though, to allow more specific/targeted carving of SQLite files through this application. + +Journal carving is supported primarily for WAL files. If a WAL file is found, this application will +parse through each of the commit records in sequence and assign a version to them. This is the same +concept that some applications refer to as timelining. Rollback journals are currently treated as a full +unallocated block and only support export to csv files. + +SQLite Dissect supports output in various forms: text, csv, xlsx, and sqlite. Due to certain +constraints on what can be written to some file types, certain modifications need to be made. For +instance, when exporting to a SQLite file, columns such as row_id will already exist in the destination +table. In cases like these, the columns are prefaced with "sd_" so +they will not conflict with the actual row_id column. This also applies to internal schema objects, +so if certain SQLite tables are requested to be written to a SQLite file, then these will be prefaced +with "iso_" so they will not conflict with similar internal schema objects that may already exist +in the SQLite file being written to. In xlsx or csv, because a leading "=" symbol indicates a formula, +such values are prefaced with a " " character to avoid this issue. More details can be found in the +code documentation of the export classes themselves. + +SQLite Dissect opens the file as read only and acts as a read-only interpreter when parsing and carving +the SQLite file. This is to ensure no changes are made to the files being analyzed. The only use +of the sqlite3 library in Python is to write the output to a SQLite file if that option is +specified for output. + +#### Additional Notes: +1. SQLite Dissect currently only works on a SQLite database or a SQLite database along with a journal + (WAL or rollback) file. Journal files by themselves are not supported yet. + +#### Currently not implemented: +1. Signatures and carving are not implemented for "without rowid" tables or indexes. This will not cause an error + but will skip signature generation and carving processes. +2. Signatures and carving are not implemented for virtual tables. This will not cause an error but will skip + signature generation and carving processes. `Note: Even though virtual tables are skipped, virtual tables may + create other non-virtual tables which are not skipped. Currently nothing ties these tables back to the + virtual table that created them.` +3. Invalidated frames in WAL files are currently skipped and not parsed. `Note: This applies to WAL records + that were previously written to the SQLite database.` +4. Signatures generated are only reflective of the base/initial schema in the SQLite database. + +#### Known issues and errors: +1. 
A table with very few columns may generate a very small signature, resulting in many + false positives and longer parsing time. +2. Due to the current queuing of data objects to be printed, in addition to #1 above, a memory issue may occur when + carving some tables. + +#### Future implementation: +1. Export binary objects to separate files during export instead of being written to text files. +2. Print out sets of data that were unallocated or in freeblocks that did not have successful carvings. +3. Fix issues with schemas with comments. +4. Handle "altered column" table signatures where detected. +5. Implement handling of invalidated WAL frames. +6. The ability to de-dupe carved entries against those in allocated space (in cases such as those where the b-tree was migrated). + +# Library Scripts + +High-level scripts that are used to access the rest of the library and provide the base application for executing +SQLite Dissect when built. + +- api_usage.py +- example.py +- setup.py +- sqlite_dissect.py +
+ +### api_usage.py + +This script shows an example of the API usage for a specific test file. + +TODO: +- [ ] Documentation improvements. +
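+
+The SQLite file, table, and column names are hardcoded near the top of the script (file_name, table_name, and
+column_names) and need to be edited before running. Once those placeholders are set, the script can be run directly:
+
+    python api_usage.py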
+ +### example.py + +This script shows examples of how this library can be used. + +TODO: +- [ ] Documentation improvements. +- [ ] Implement additional export methods. + +
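+
+Based on the getopt options defined in the script (-f for the SQLite file, -e for the export directory, and -t for
+the export type, with csv being the export type exercised in this example), an invocation might look like:
+
+    python example.py -f [SQLITE_FILE] -e [OUTPUT_DIRECTORY] -t csv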
+ +### setup.py + +This script will be used to set up the sqlite_dissect package for use in Python environments. + +>Note: To compile a distribution for the project run "python setup.py sdist" in the directory this file is located in. + +>Note: openpyxl is needed for the xlsx export and will install jdcal, et-xmlfile \["openpyxl>=2.4.0b1"\] + +>Note: PyInstaller is used for generation of executables but is not included in this setup.py script and will +> install altgraph, dis3, macholib, pefile, pypiwin32, pywin32 as dependencies. \[pyinstaller==3.6 needs to be used +> for Python 2.7 since newer versions of PyInstaller (4.0+) require Python 3.6\] Information on how to run +> PyInstaller is included in the spec files under the pyinstaller directory. Four files are provided, two for Windows +> and two for Linux, both for x64 platforms. The two files for each platform allow you to build either a single +> executable or a directory of decompressed files. Since the single-file build extracts to a temp directory in order to +> run, which may be blocked on some systems, the directory of files is preferred.
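+
+For example, to build a source distribution and install it into a local Python 2.7 environment (the exact archive
+name under dist/ depends on the name and version defined in setup.py; 0.0.6 is assumed here from _version.py):
+
+    python setup.py sdist
+    pip install dist/sqlite_dissect-0.0.6.tar.gz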
+ +### sqlite_dissect.py + +This script will act as the command line script to run this library as a stand-alone application. + +TODO: +- [ ] Documentation improvements. +- [ ] Implement append, overwrite, etc. options for the log file if specified. +- [ ] Incorporate signature generation input and output files once implemented. +- [ ] Incorporate "store in memory" arguments (currently set to False, more in depth operations may want it True). +- [ ] Support for multiple export types simultaneously. +- [ ] Implement multiple passes/depths. +- [ ] Update string comparisons. +- [ ] Test use cases for exempted tables with rollback journal and when combined with specified tables. +- [ ] Check on name vs table_name properties of the master schema entry. +- [ ] Test cases where the schema changes throughout the WAL file. +- [ ] Investigate handling of virtual and "without rowid" tables when creating table signatures through the interface. +- [ ] Documentation on "without rowid" tables and indexes in references to carving in help documentation. +- [ ] Make sure to address/print unallocated space (especially uncarved) from updated page numbers in commit records. +- [ ] Research if there can be journal files with a zero length database file or zero-length journal files. +- [ ] Research if there can be combinations and of multiple rollback journal and WAL files with the SQLite database. +- [ ] Validate initial research that allocation of freelist pages to a b-tree results in a wipe of the page data. +- [ ] Add additional logging messages to the master schema entries skipped in signature generation. +- [ ] Integrate in the SQLite Forensic Corpus into tests. +- [ ] Look into updating terminology for versioning to timelining. +- [ ] Update code for compatibility with Python 3. +- [ ] Create a pip distribution. +- [ ] Create PyUnit tests. +- [ ] Create a GUI. diff --git a/_version.py b/_version.py new file mode 100644 index 0000000..a3fed4c --- /dev/null +++ b/_version.py @@ -0,0 +1,10 @@ + +""" + +_version.py + +This script identifies the version of the sqlite dissect library. + +""" + +__version__ = "0.0.6" diff --git a/api_usage.py b/api_usage.py new file mode 100644 index 0000000..320449a --- /dev/null +++ b/api_usage.py @@ -0,0 +1,81 @@ +import logging +import os +import sqlite_dissect.constants as sqlite_constants +import sqlite_dissect.interface as sqlite_interface + +""" + +api-usage.py + +This script shows an example of the api usage for a specific test file. + +""" + +# Setup logging +logging_level = logging.ERROR +logging_format = '%(levelname)s %(asctime)s [%(pathname)s] %(funcName)s at line %(lineno)d: %(message)s' +logging_date_format = '%d %b %Y %H:%M:%S' +logging.basicConfig(level=logging_level, format=logging_format, datefmt=logging_date_format) + +# Setup console logging +console_logger = logging.StreamHandler() +console_logger.setLevel(logging_level) +console_logger.setFormatter(logging.Formatter(logging_format, logging_date_format)) +logging.getLogger(sqlite_constants.LOGGER_NAME).addHandler(console_logger) + +""" + +API Usage + +The three fields below need to be filled in and are currently hardcoded: +file_name: The SQLite file to investigate (and associated WAL file if it exists in the same directory) +table_name: The table in the file to create a signature of and carve against the SQLite file with. +column_names: The columns in the table we are interested in printing out carved data from. + +Note: Below will carve entries from the b-tree page of the table and the freelists. 
The use case of cross b-tree + carving is not yet implemented yet in SQLite Dissect. + +""" + +# Specify the file details +file_name = "FILE_NAME" +table_name = "TABLE_NAME" +column_names = ["COLUMN_ONE", "COLUMN_TWO"] + +# Create the database +database = sqlite_interface.create_database(file_name) + +# Create the write ahead log +wal_file_name = file_name + sqlite_constants.WAL_FILE_POSTFIX +write_ahead_log = sqlite_interface.create_write_ahead_log(wal_file_name) if os.path.exists(wal_file_name) else None + +# Create the version history +version_history = sqlite_interface.create_version_history(database, write_ahead_log) + +# Create the signature we are interested in carving +table_signature = sqlite_interface.create_table_signature(table_name, database, version_history) + +# Account for "without rowid"/virtual table signatures until supported +if not table_signature: + print("Table signature not supported (\"without rowid\" table or virtual table)") + exit(0) + +# Get the column indices of the columns we are interested in +column_name_indices = {} +for column_name in column_names: + column_name_indices[column_name] = sqlite_interface.get_column_index(column_name, table_name, version_history) + +# Get a version history iterator for the table +carve_freelists = True +table_history_iterator = sqlite_interface.get_version_history_iterator(table_name, version_history, + table_signature, carve_freelists) +# Iterate through the commits in the history for this table +for commit in table_history_iterator: + # The table was only modified if the commit was updated for this table and make sure there were carved cells + if commit.updated and commit.carved_cells: + carved_cells = commit.carved_cells + for carved_cell in carved_cells.itervalues(): + for column_name in column_name_indices.keys(): + record_column = carved_cell.payload.record_columns[column_name_indices.get(column_name)] + print("Commit version: %s table record column: %s has serial type: %s with value of: \"%s\"." 
%\ + (commit.version_number, column_name, record_column.serial_type, record_column.value)) diff --git a/example.py b/example.py new file mode 100644 index 0000000..ea71833 --- /dev/null +++ b/example.py @@ -0,0 +1,546 @@ +from getopt import getopt +from logging import WARNING +from logging import basicConfig +from os import makedirs +from os.path import basename +from os.path import exists +from os.path import normpath +from os.path import sep +from re import sub +from sys import argv +from sqlite_dissect.carving.carver import SignatureCarver +from sqlite_dissect.carving.signature import Signature +from sqlite_dissect.constants import BASE_VERSION_NUMBER +from sqlite_dissect.constants import CELL_LOCATION +from sqlite_dissect.constants import CELL_SOURCE +from sqlite_dissect.constants import EXPORT_TYPES +from sqlite_dissect.constants import MASTER_SCHEMA_ROW_TYPE +from sqlite_dissect.constants import ROLLBACK_JOURNAL_POSTFIX +from sqlite_dissect.constants import WAL_FILE_POSTFIX +from sqlite_dissect.constants import WAL_INDEX_POSTFIX +from sqlite_dissect.export.csv_export import CommitCsvExporter +from sqlite_dissect.file.database.database import Database +from sqlite_dissect.file.database.page import BTreePage +from sqlite_dissect.file.database.utilities import get_pages_from_b_tree_page +from sqlite_dissect.file.journal.jounal import RollbackJournal +from sqlite_dissect.file.schema.master import OrdinaryTableRow +from sqlite_dissect.file.schema.master import VirtualTableRow +from sqlite_dissect.file.utilities import validate_page_version_history +from sqlite_dissect.file.wal.wal import WriteAheadLog +from sqlite_dissect.file.wal_index.wal_index import WriteAheadLogIndex +from sqlite_dissect.interface import carve_table +from sqlite_dissect.interface import create_database +from sqlite_dissect.interface import create_table_signature +from sqlite_dissect.interface import create_version_history +from sqlite_dissect.interface import create_write_ahead_log +from sqlite_dissect.interface import export_table_or_index_version_history_to_csv +from sqlite_dissect.interface import export_table_or_index_version_history_to_sqlite +from sqlite_dissect.interface import export_version_history_to_csv +from sqlite_dissect.interface import export_version_history_to_sqlite +from sqlite_dissect.interface import get_index_names +from sqlite_dissect.interface import get_table_names +from sqlite_dissect.interface import get_version_history_iterator +from sqlite_dissect.interface import select_all_from_index +from sqlite_dissect.interface import select_all_from_table +from sqlite_dissect.output import stringify_cell_records +from sqlite_dissect.output import stringify_master_schema_versions +from sqlite_dissect.output import stringify_page_information +from sqlite_dissect.output import stringify_unallocated_space +from sqlite_dissect.version_history import VersionHistory +from sqlite_dissect.version_history import VersionHistoryParser + +""" + +example.py + +This script shows examples of how this library can be used. 
+ +""" + +# Setup logging +logging_level = WARNING +logging_format = '%(levelname)s %(asctime)s [%(pathname)s] %(funcName)s at line %(lineno)d: %(message)s' +logging_data_format = '%d %b %Y %H:%M:%S' +basicConfig(level=logging_level, format=logging_format, datefmt=logging_data_format) + +file_name = None +export_directory = None +export_type = None +opts, args = getopt(argv[1:], "f:e:t:") +for opt, arg in opts: + if opt == "-f": + file_name = arg + elif opt == "-e": + export_directory = arg + elif opt == "-t": + export_type = arg + +""" + +Note: Currently only the csv export_type is supported in this example. The csv and sqlite export_types are used in + the API example below. Other specified types are currently ignored. + +""" + +if (export_directory and not export_type) or (not export_directory and export_type): + print("The export directory (-e) and export type (-t) both need to be defined if either one is specified.") + print("Export types are: {}.".format([export_type for export_type in EXPORT_TYPES])) + exit(1) + +if export_type and export_type.upper() not in EXPORT_TYPES: + print("Invalid export type: {}.".format(export_type)) + print("Export types are: {}.".format(",".join([export_type.lower() for export_type in EXPORT_TYPES]))) + exit(1) + +if not file_name: + print("Please execute the application specifying the file name.") + exit(1) +elif not exists(file_name): + print("File: {} does not exist.".format(file_name)) + exit(1) +else: + print("Starting to parse and carve: {}.\n".format(file_name)) + +file_prefix = basename(normpath(file_name)) +padding = "\t" + +""" + +Load the Database File. + +""" + +database_file = Database(file_name) +print("Database File:\n{}\n".format(database_file.stringify(padding, False, False))) +print("Page Information:\n{}\n".format(stringify_page_information(database_file, padding))) + +""" + +Check if the Write-Ahead Log File exists and load it if it does. + +""" + +wal_file = None +wal_file_name = file_name + WAL_FILE_POSTFIX +if exists(wal_file_name): + wal_file = WriteAheadLog(wal_file_name) + print("WAL File:\n{}\n".format(wal_file.stringify(padding, False))) +else: + print("No WAL File Found.\n") + +""" + +Check if the Write-Ahead Log Index File exists and load it if it does. + +""" + +wal_index_file = None +wal_index_file_name = file_name + WAL_INDEX_POSTFIX +if exists(wal_index_file_name): + wal_index_file = WriteAheadLogIndex(wal_index_file_name) + print("WAL Index File:\n{}\n".format(wal_index_file.stringify(padding))) +else: + print("No WAL Index File Found.\n") + +""" + +Check if the Rollback Journal File exists and load it if it does. + +""" + +rollback_journal_file = None +rollback_journal_file_name = file_name + ROLLBACK_JOURNAL_POSTFIX +if exists(rollback_journal_file_name): + rollback_journal_file = RollbackJournal(rollback_journal_file_name) + print("Rollback Journal File:\n{}\n".format(rollback_journal_file.stringify(padding))) +else: + print("No Rollback Journal File Found.\n") + +""" + +Print Unallocated Non-Zero Space from the Database File. + +""" + +unallocated_non_zero_space = stringify_unallocated_space(database_file, padding, False) +print("Unallocated Non-Zero Space from the Database File:\n{}\n".format(unallocated_non_zero_space)) + +""" + +Create the version history from the database and WAL file (even if the WAL file was not found). 
+ +""" + +version_history = VersionHistory(database_file, wal_file) + +print("Number of versions: {}\n".format(version_history.number_of_versions)) + +print("Validating Page Version History...") +page_version_history_validated = validate_page_version_history(version_history) +print("Validating Page Version History (Check): {}\n".format(page_version_history_validated)) +if not page_version_history_validated: + print("Error in validating page version history.") + exit(1) + +print("Version History of Master Schemas:\n") +for version_number, version in version_history.versions.iteritems(): + if version.master_schema_modified: + master_schema_entries = version.master_schema.master_schema_entries + if master_schema_entries: + print("Version {} Master Schema Entries:".format(version_number)) + for master_schema_entry in master_schema_entries: + string = padding + "Master Schema Entry: Root Page Number: {} Type: {} Name: {} " \ + "Table Name: {} SQL: {}." + print(string.format(master_schema_entry.root_page_number, master_schema_entry.row_type, + master_schema_entry.name, master_schema_entry.table_name, + master_schema_entry.sql)) + +print("Version History:\n") +for version_number, version in version_history.versions.iteritems(): + print("Version: {} has updated page numbers: {}.".format(version_number, version.updated_page_numbers)) + print("Page Information:\n{}\n".format(stringify_page_information(version, padding))) + +last_version = version_history.number_of_versions - 1 +print("Version: {} has updated page numbers: {}.".format(version_history.number_of_versions - 1, + last_version.updated_page_numbers)) +print("Page Information:\n{}\n".format(stringify_page_information(last_version, padding))) + +print("Version History of Master Schemas:\n{}\n".format(stringify_master_schema_versions(version_history))) + +print("Master Schema B-Trees (Index and Table) Version Histories:") +for master_schema_entry in database_file.master_schema.master_schema_entries: + if master_schema_entry.row_type in [MASTER_SCHEMA_ROW_TYPE.INDEX, MASTER_SCHEMA_ROW_TYPE.TABLE] and \ + not isinstance(master_schema_entry, VirtualTableRow) and \ + not (isinstance(master_schema_entry, OrdinaryTableRow) and master_schema_entry.without_row_id): + version_history_parser = VersionHistoryParser(version_history, master_schema_entry) + page_type = version_history_parser.page_type + string = "Master schema entry: {} type: {} on page type: {}:" + string = string.format(version_history_parser.row_type, master_schema_entry.name, page_type, + version_history_parser.root_page_number_version_index) + + print(string) + for commit in version_history_parser: + if commit.updated: + string = "Updated in version: {} with root page number: {} on b-tree page numbers: {} " \ + "and updated root b-tree page numbers: {}:" + string = string.format(commit.version_number, commit.root_page_number, commit.b_tree_page_numbers, + commit.updated_b_tree_page_numbers) + print(string) + for added_cell_string in stringify_cell_records(commit.added_cells.values(), + database_file.database_text_encoding, page_type): + print("Added: {}".format(added_cell_string)) + for updated_cell_string in stringify_cell_records(commit.updated_cells.values(), + database_file.database_text_encoding, page_type): + print("Updated: {}".format(updated_cell_string)) + for deleted_cell_string in stringify_cell_records(commit.deleted_cells.values(), + database_file.database_text_encoding, page_type): + print("Deleted: {}".format(deleted_cell_string)) + for carved_cell_string in 
stringify_cell_records(commit.carved_cells.values(), + database_file.database_text_encoding, page_type): + print("Carved: {}".format(carved_cell_string)) + print("\n") + +signatures = {} +for master_schema_entry in database_file.master_schema.master_schema_entries: + + """ + + Due to current implementation limitations we are restricting signature generation to table row types. + + """ + + if master_schema_entry.row_type == MASTER_SCHEMA_ROW_TYPE.TABLE: + signature = Signature(version_history, master_schema_entry) + signatures[master_schema_entry.name] = signature + print("Signature:\n{}\n".format(signature.stringify(padding + "\t", False, False, False))) + else: + string = "No signature will be generated for master schema entry type: {} with name: {} on " \ + "table name: {} and sql: {}" + string = string.format(master_schema_entry.row_type, master_schema_entry.name, master_schema_entry.table_name, + master_schema_entry.sql) + print(string + "\n") + +print("Carving base version (main SQLite database file):") +version = version_history.versions[BASE_VERSION_NUMBER] + +carved_records = {} +for master_schema_entry in database_file.master_schema.master_schema_entries: + + """ + + Due to current implementation limitations we are restricting carving to table row types. + + Note: This is not allowing "without rowid" or virtual tables until further testing is done. (Virtual tables + tend to have a root page number of 0 with no data stored in the main table. Further investigation + is needed.) + + """ + + if master_schema_entry.row_type == MASTER_SCHEMA_ROW_TYPE.TABLE \ + and not isinstance(master_schema_entry, VirtualTableRow) and not master_schema_entry.without_row_id: + + b_tree_pages = get_pages_from_b_tree_page(version.get_b_tree_root_page(master_schema_entry.root_page_number)) + b_tree_page_numbers = [b_tree_page.number for b_tree_page in b_tree_pages] + + string = "Carving Table Entry: Name: {} root page: {} on page numbers: {}" + print(string.format(master_schema_entry.name, master_schema_entry.root_page_number, b_tree_page_numbers)) + + carved_records[master_schema_entry.name] = [] + for b_tree_page_number in b_tree_page_numbers: + page = database_file.pages[b_tree_page_number] + source = CELL_SOURCE.B_TREE + + # For carving freeblocks make sure the page is a b-tree page and not overflow + if isinstance(page, BTreePage): + carved_cells = SignatureCarver.carve_freeblocks(version, source, page.freeblocks, + signatures[master_schema_entry.name]) + carved_records[master_schema_entry.name].extend(carved_cells) + carved_cells = SignatureCarver.carve_unallocated_space(version, source, b_tree_page_number, + page.unallocated_space_start_offset, + page.unallocated_space, + signatures[master_schema_entry.name]) + + carved_records[master_schema_entry.name].extend(carved_cells) + + else: + string = "Not carving master schema entry row type: {} name: {} table name: {} and sql: {} since it is not " \ + "a normal table." 
+ string = string.format(master_schema_entry.row_type, master_schema_entry.name, master_schema_entry.table_name, + master_schema_entry.sql) + print(string) +print("\n") + +print("Carved Entries:\n") +for master_schema_entry_name, carved_cells in carved_records.iteritems(): + + print("Table Master Schema Entry Name {}:".format(master_schema_entry_name)) + + carved_freeblock_records_total = len([carved_cell for carved_cell in carved_cells + if carved_cell.location == CELL_LOCATION.FREEBLOCK]) + + print("Recovered {} entries from freeblocks:".format(carved_freeblock_records_total)) + + for carved_cell in carved_cells: + if carved_cell.location == CELL_LOCATION.FREEBLOCK: + payload = carved_cell.payload + cell_record_column_values = [str(record_column.value) if record_column.value else "NULL" + for record_column in payload.record_columns] + string = "{}: {} Index: ({}, {}, {}, {}): ({})" + string = string.format(carved_cell.page_number, carved_cell.index, carved_cell.file_offset, + payload.serial_type_definition_start_offset, + payload.serial_type_definition_end_offset, + payload.cutoff_offset, " , ".join(cell_record_column_values)) + print(string) + + carved_unallocated_space_records_total = len([carved_cell for carved_cell in carved_cells + if carved_cell.location == CELL_LOCATION.UNALLOCATED_SPACE]) + print("Recovered {} entries from unallocated space:".format(carved_unallocated_space_records_total)) + + for carved_cell in carved_cells: + if carved_cell.location == CELL_LOCATION.UNALLOCATED_SPACE: + payload = carved_cell.payload + cell_record_column_values = [str(record_column.value) if record_column.value else "NULL" + for record_column in payload.record_columns] + string = "{}: {} Index: ({}, {}, {}, {}): ({})" + string = string.format(carved_cell.page_number, carved_cell.index, carved_cell.file_offset, + payload.serial_type_definition_start_offset, + payload.serial_type_definition_end_offset, + payload.cutoff_offset, " , ".join(cell_record_column_values)) + print(string) + + print("\n") +print("\n") + +print("Master Schema B-Trees (Index and Table) Version Histories Including Carvings:") +for master_schema_entry in database_file.master_schema.master_schema_entries: + if master_schema_entry.row_type in [MASTER_SCHEMA_ROW_TYPE.INDEX, MASTER_SCHEMA_ROW_TYPE.TABLE]: + + # We only have signatures of the tables (not indexes) + signature = signatures[master_schema_entry.name] \ + if master_schema_entry.row_type == MASTER_SCHEMA_ROW_TYPE.TABLE else None + + version_history_parser = VersionHistoryParser(version_history, master_schema_entry, None, None, signature) + page_type = version_history_parser.page_type + string = "Master schema entry: {} type: {} on page type: {}:" + string = string.format(master_schema_entry.name, version_history_parser.row_type, page_type, + version_history_parser.root_page_number_version_index) + print(string) + for commit in version_history_parser: + if commit.updated: + string = "Updated in version: {} with root page number: {} on b-tree page numbers: {} " \ + "and updated root b-tree page numbers: {}:" + string = string.format(commit.version_number, commit.root_page_number, commit.b_tree_page_numbers, + commit.updated_b_tree_page_numbers) + print(string) + for added_cell_string in stringify_cell_records(commit.added_cells.values(), + database_file.database_text_encoding, page_type): + print("Added: {}".format(added_cell_string)) + for updated_cell_string in stringify_cell_records(commit.updated_cells.values(), + database_file.database_text_encoding, page_type): + 
print("Updated: {}".format(updated_cell_string)) + for deleted_cell_string in stringify_cell_records(commit.deleted_cells.values(), + database_file.database_text_encoding, page_type): + print("Deleted: {}".format(deleted_cell_string)) + for carved_cell_string in stringify_cell_records(commit.carved_cells.values(), + database_file.database_text_encoding, page_type): + print("Carved: {}".format(carved_cell_string)) + print("\n") + +if export_type and export_type.upper() == EXPORT_TYPES.CSV: + csv_prefix_file_name = basename(normpath(file_prefix)) + commit_csv_exporter = CommitCsvExporter(export_directory, csv_prefix_file_name) + print("Exporting SQLite Master Schema B-Trees (Index and Table) Version Histories " + "(Including Carvings) to CSV Directory: {}.".format(export_directory)) + for master_schema_entry in database_file.master_schema.master_schema_entries: + if master_schema_entry.row_type in [MASTER_SCHEMA_ROW_TYPE.INDEX, MASTER_SCHEMA_ROW_TYPE.TABLE]: + + # We only have signatures of the tables (not indexes) + signature = signatures[master_schema_entry.name] \ + if master_schema_entry.row_type == MASTER_SCHEMA_ROW_TYPE.TABLE else None + + carve_freelist_pages = True if signature else False + + version_history_parser = VersionHistoryParser(version_history, master_schema_entry, + None, None, signature, carve_freelist_pages) + page_type = version_history_parser.page_type + for commit in version_history_parser: + commit_csv_exporter.write_commit(master_schema_entry, commit) +print("\n") + +""" + +Below are examples on using the interface. + +The functions used from the interface script are documented below (taken from documentation in the interface script): +create_database(file_name, file_object=None, store_in_memory=False, strict_format_checking=True) +create_write_ahead_log(file_name, file_object=None) +create_version_history(database, write_ahead_log=None) +get_table_names(database) +get_index_names(database) +select_all_from_table(table_name, version) +select_all_from_index(index_name, version) +create_table_signature(table_name, version, version_history=None) +carve_table(table_name, signature, version) +get_version_history_iterator(table_or_index_name, version_history, signature=None) +export_table_or_index_version_history_to_csv(export_directory, version_history, + table_or_index_name, signature=None, carve_freelist_pages=False) +export_version_history_to_csv(export_directory, version_history, signatures=None, carve_freelist_pages=False) + +""" + +print("Example interface usage:\n") + +# Create the database +database = create_database(file_name) + +# Create the write ahead log +write_ahead_log = create_write_ahead_log(file_name + WAL_FILE_POSTFIX) if exists(file_name + WAL_FILE_POSTFIX) else None + +# Create the version history +version_history = create_version_history(database, write_ahead_log) + +# Get all of the table names +table_names = get_table_names(database) +print("Table Names: {}\n".format(table_names)) + +# Get all of the cells in each table and print the number of cells (rows) for each table +for table_name in table_names: + select_all_data = select_all_from_table(table_name, database) + print("Table: {} has {} rows in the database file.".format(table_name, len(select_all_data))) +print("\n") + +# Get all of the index names +index_names = get_index_names(database) +print("Index Names: {}".format(index_names)) +print("\n") + +# Get all of the cells in each index and print the number of cells (rows) for each index +for index_name in index_names: + select_all_data = 
select_all_from_index(index_name, database) + print("Index: {} has {} rows in the database file.".format(index_name, len(select_all_data))) +print("\n") + +# Get all of the signatures (for tables only - not including "without rowid" and virtual tables) +signatures = {} +for table_name in table_names: + # Specify the version history here to parse through all versions for signature generation + table_signature = create_table_signature(table_name, database, version_history) + # Account for "without rowid" table signatures until supported + if table_signature: + signatures[table_name] = table_signature + +# Carve each table with the generated signature and print the number of carved cells (rows) per table +for table_name in table_names: + if table_name in signatures: + carved_cells = carve_table(table_name, signatures[table_name], database) + print("Found {} carved cells for table: {} in the database file.".format(len(carved_cells), table_name)) +print("\n") + +# Combine names for index and tables (they are unique) and get the version history iterator for each +names = [] +names.extend(table_names) +names.extend(index_names) +for name in names: + signature = signatures[name] if name in signatures else None + version_history_iterator = get_version_history_iterator(name, version_history, signature) + for commit in version_history_iterator: + string = "For: {} commit: {} for version: {}.".format(name, commit.updated, commit.version_number) + if commit.updated: + string += " Carved Cells: {}.".format(True if commit.carved_cells else False) + print(string) +print("\n") + +# Check to make sure exporting variables were setup correctly for csv +if export_type and export_type.upper() == EXPORT_TYPES.CSV: + + # Create two directories for the two types csv files can be exported through the interface + export_version_directory = export_directory + sep + "csv_version" + if not exists(export_version_directory): + makedirs(export_version_directory) + export_version_history_directory = export_directory + sep + "csv_version_history" + if not exists(export_version_history_directory): + makedirs(export_version_history_directory) + + # Iterate through all index and table names and export their version history to a csv file (one at a time) + for name in names: + print("Exporting {} to {} as {}.".format(name, export_version_directory, export_type)) + export_table_or_index_version_history_to_csv(export_version_directory, version_history, name, None, False) + print("\n") + + # Export all index and table histories to csv files while supplying signatures to carve tables and carving freelists + print("Exporting history to {} with carvings as {}.".format(export_version_history_directory, export_type)) + export_version_history_to_csv(export_version_history_directory, version_history, signatures.values(), True) + print("\n") + +# Check to make sure exporting variable were setup correctly for SQLite +if export_type and export_type.upper() == EXPORT_TYPES.SQLITE: + + # Create two directories for the two types SQLite files can be exported through the interface + export_version_directory = export_directory + sep + "sqlite_version" + if not exists(export_version_directory): + makedirs(export_version_directory) + export_version_history_directory = export_directory + sep + "sqlite_version_history" + if not exists(export_version_history_directory): + makedirs(export_version_history_directory) + + # Currently the file name is taken from the base version name + sqlite_base_file_name = basename(normpath(file_prefix)) + 
sqlite_file_postfix = "-sqlite-dissect.db3" + + # Iterate through all index and table names and export their version history to a csv file (one at a time) + for name in names: + fixed_master_schema_name = sub(" ", "_", name) + master_schema_entry_file_name = sqlite_base_file_name + "-" + fixed_master_schema_name + sqlite_file_postfix + print("Exporting {} to {} in {} as {}.".format(name, master_schema_entry_file_name, export_version_directory, + export_type)) + export_table_or_index_version_history_to_sqlite(export_version_directory, master_schema_entry_file_name, + version_history, name) + print("\n") + + # Export all index and table histories to csv files while supplying signatures to carve tables and carving freelists + sqlite_file_name = sqlite_base_file_name + sqlite_file_postfix + print("Exporting history to {} in {} with carvings as {}.".format(sqlite_file_name, + export_version_history_directory, export_type)) + export_version_history_to_sqlite(export_version_history_directory, sqlite_file_name, version_history, + signatures.values(), True) + print("\n") diff --git a/main.py b/main.py new file mode 100644 index 0000000..e7090cb --- /dev/null +++ b/main.py @@ -0,0 +1,769 @@ +import warnings +from argparse import ArgumentParser +from logging import CRITICAL +from logging import DEBUG +from logging import ERROR +from logging import INFO +from logging import WARNING +from logging import basicConfig +from logging import getLogger +from os.path import basename +from os.path import exists +from os.path import getsize +from os.path import normpath +from os.path import sep +from time import time +from warnings import warn +from _version import __version__ +from sqlite_dissect.carving.rollback_journal_carver import RollBackJournalCarver +from sqlite_dissect.carving.signature import Signature +from sqlite_dissect.constants import BASE_VERSION_NUMBER +from sqlite_dissect.constants import EXPORT_TYPES +from sqlite_dissect.constants import LOGGER_NAME +from sqlite_dissect.constants import MASTER_SCHEMA_ROW_TYPE +from sqlite_dissect.constants import ROLLBACK_JOURNAL_POSTFIX +from sqlite_dissect.constants import WAL_FILE_POSTFIX +from sqlite_dissect.exception import SqliteError +from sqlite_dissect.export.csv_export import CommitCsvExporter +from sqlite_dissect.export.sqlite_export import CommitSqliteExporter +from sqlite_dissect.export.text_export import CommitConsoleExporter +from sqlite_dissect.export.text_export import CommitTextExporter +from sqlite_dissect.export.xlsx_export import CommitXlsxExporter +from sqlite_dissect.file.database.database import Database +from sqlite_dissect.file.journal.jounal import RollbackJournal +from sqlite_dissect.file.schema.master import OrdinaryTableRow +from sqlite_dissect.file.wal.wal import WriteAheadLog +from sqlite_dissect.output import stringify_master_schema_version +from sqlite_dissect.output import stringify_master_schema_versions +from sqlite_dissect.version_history import VersionHistory +from sqlite_dissect.version_history import VersionHistoryParser + +""" + +sqlite_dissect.py + +This script will act as the command line script to run this library as a stand-alone application. 
+ +""" + + +def main(args): + + # Handle the logging and warning settings + if not args.log_level: + raise SqliteError("Error in setting up logging: no log level determined.") + + # Get the logging level + logging_level_arg = args.log_level + logging_level = logging_level_arg + if logging_level_arg != "off": + if logging_level_arg == "critical": + logging_level = CRITICAL + elif logging_level_arg == "error": + logging_level = ERROR + elif logging_level_arg == "warning": + logging_level = WARNING + elif logging_level_arg == "info": + logging_level = INFO + elif logging_level_arg == "debug": + logging_level = DEBUG + else: + raise SqliteError("Invalid option for logging: {}.".format(logging_level_arg)) + + # Setup logging + logging_format = '%(levelname)s %(asctime)s [%(pathname)s] %(funcName)s at line %(lineno)d: %(message)s' + logging_data_format = '%d %b %Y %H:%M:%S' + basicConfig(level=logging_level, format=logging_format, datefmt=logging_data_format, filename=args.log_file) + + logger = getLogger(LOGGER_NAME) + logger.debug("Setup logging using the log level: {}.".format(logging_level)) + logger.info("Using options: {}".format(args)) + + if args.warnings: + + # Turn warnings on if it was specified + warnings.filterwarnings("always") + + logger.info("Warnings have been turned on.") + + else: + + # Ignore warnings by default + warnings.filterwarnings("ignore") + + # Execute argument checks (inclusive) + if args.carve_freelists and not args.carve: + raise SqliteError("Freelist carving cannot be enabled (--carve-freelists) without enabling " + "general carving (--carve).") + if args.export.upper() != EXPORT_TYPES.TEXT and not args.directory: + raise SqliteError("The directory needs to be specified (--directory) if an export type other than text " + "is specified (--export).") + if args.file_prefix and not args.directory: + raise SqliteError("The directory needs to be specified (--directory) if a file prefix is " + "specified (--file-prefix).") + + # Setup the export type + export_type = EXPORT_TYPES.TEXT + if args.export: + export_type = args.export.upper() + + # Setup the strict format checking + strict_format_checking = True + if args.disable_strict_format_checking: + strict_format_checking = False + + # Setup the file prefix which taken from the base version name unless the file_prefix argument is set + file_prefix = basename(normpath(args.sqlite_file)) + if args.file_prefix: + file_prefix = args.file_prefix + + if not file_prefix: + # The file prefix is taken from the base version name if not specified + file_prefix = basename(normpath(args.sqlite_file)) + + # Setup the directory if specified + output_directory = None + if args.directory: + if not exists(args.directory): + raise SqliteError("Unable to find output directory: {}.".format(args.directory)) + output_directory = args.directory + + logger.debug("Determined export type to be {} with file prefix: {} and output directory: {}" + .format(export_type, file_prefix, output_directory)) + + # Obtain the SQLite file + if not exists(args.sqlite_file): + raise SqliteError("Unable to find SQLite file: {}.".format(args.sqlite_file)) + + """ + + If the file is a zero length file, we set a flag indicating it and check to make sure there are no associated wal + or journal files before just exiting out stating that the file was empty. If a (non-zero length) wal or journal + file is found, an exception will be thrown. However, if the no-journal option is specified, the journal files will + not be checked, and the program will exit. 
+ + Note: It is currently believed that there cannot be a zero length SQLite database file with a wal or journal file. + That is why an exception is thrown here but needs to be investigated to make sure. + + """ + + # See if the SQLite file is zero-length + zero_length_sqlite_file = False + if getsize(args.sqlite_file) == 0: + zero_length_sqlite_file = True + + # Obtain the wal or rollback_journal file if found (or if specified) + wal_file_name = None + rollback_journal_file_name = None + if not args.no_journal: + if args.wal: + if not exists(args.wal): + raise SqliteError("Unable to find wal file: {}.".format(args.wal)) + wal_file_name = args.wal + elif args.rollback_journal: + if not exists(args.rollback_journal): + raise SqliteError("Unable to find rollback journal file: {}.".format(args.rollback_journal)) + rollback_journal_file_name = args.rollback_journal + else: + if exists(args.sqlite_file + WAL_FILE_POSTFIX): + wal_file_name = args.sqlite_file + WAL_FILE_POSTFIX + if exists(args.sqlite_file + ROLLBACK_JOURNAL_POSTFIX): + rollback_journal_file_name = args.sqlite_file + ROLLBACK_JOURNAL_POSTFIX + + # Exempted tables are only supported currently for rollback journal files + rollback_journal_exempted_tables = [] + if args.exempted_tables: + if not rollback_journal_file_name: + raise SqliteError("Exempted tables are only supported for use with rollback journal parsing.") + rollback_journal_exempted_tables = args.exempted_tables.split(",") + + # See if the wal file is zero-length + zero_length_wal_file = False + if wal_file_name and getsize(wal_file_name) == 0: + zero_length_wal_file = True + + # See if the rollback journal file is zero-length + zero_length_rollback_journal_file = False + if rollback_journal_file_name and getsize(rollback_journal_file_name) == 0: + zero_length_rollback_journal_file = True + + # Check if the SQLite file is zero length + if zero_length_sqlite_file: + + if wal_file_name and not zero_length_wal_file: + + """ + + Here we throw an exception if we find a wal file with content with no content in the original SQLite file. + It is not certain this use case can occur and investigation needs to be done to make certain. There have + been scenarios where there will be a database header with no schema or content in a database file with a + WAL file that has all the schema entries and content but this is handled differently. + + """ + + raise SqliteError("Found a zero length SQLite file with a wal file: {}. Unable to parse.".format(args.wal)) + + elif zero_length_wal_file: + print("File: {} with wal file: {} has no content. Nothing to parse." + .format(args.sqlite_file, wal_file_name)) + exit(0) + + elif rollback_journal_file_name and not zero_length_rollback_journal_file: + + """ + + Here we will only have a rollback journal file. Currently, since we need to have the database file to parse + signatures from, we cannot solely carve on the journal file alone. + + """ + + raise SqliteError("Found a zero length SQLite file with a rollback journal file: {}. Unable to parse." + .format(args.rollback_journal)) + + elif zero_length_rollback_journal_file: + print("File: {} with rollback journal file: {} has no content. Nothing to parse." + .format(args.sqlite_file, rollback_journal_file_name)) + exit(0) + + else: + print("File: {} has no content. 
Nothing to parse.".format(args.sqlite_file)) + exit(0) + + # Make sure that both of the journal files are not found + if rollback_journal_file_name and wal_file_name: + + """ + + Since the arguments have you specify the journal file in a way that you can only set the wal or rollback journal + file name, this case can only occur from finding both of the files on the file system for both wal and rollback + journal when there is no journal options specified. Since the SQLite database cannot be set to use both wal and + journal files in the same running, we determine this to be an error and throw and exception up. + + There may be a case where the mode was changed at some point and there is a single SQLite file with one or more + journal files in combination of rollback journal and WAL files. More research would have to take place in this + scenario and also take into the account of this actually occurring since in most cases it is set statically + by the application SQLite database owner. + + """ + + raise SqliteError("Found both a rollback journal: {} and wal file: {}. Only one journal file should exist. " + "Unable to parse.".format(args.rollback_journal, args.wal)) + + # Print a message parsing is starting and log the start time for reporting at the end on amount of time to run + print("\nParsing: {}...".format(args.sqlite_file)) + start_time = time() + + # Create the database and wal/rollback journal file (if existent) + database = Database(args.sqlite_file, strict_format_checking=strict_format_checking) + + write_ahead_log = None + if wal_file_name and not zero_length_wal_file: + write_ahead_log = WriteAheadLog(wal_file_name, strict_format_checking=strict_format_checking) + + rollback_journal_file = None + if rollback_journal_file_name and not zero_length_rollback_journal_file: + rollback_journal_file = RollbackJournal(rollback_journal_file_name) + + # Create the version history (this is currently only supported for the WAL) + version_history = VersionHistory(database, write_ahead_log) + + # Check if the master schema was asked for + if args.schema: + + # print the master schema of the database + print("\nDatabase Master Schema:\n{}".format(stringify_master_schema_version(database))) + print("Continuing to parse...") + + # Check if the schema history was asked for + if args.schema_history: + + # print the master schema version history + print("\nVersion History of Master Schemas:\n{}".format(stringify_master_schema_versions(version_history))) + print("Continuing to parse...") + + # Get the signature options + print_signatures = args.signatures + + # Get the carving options + carve = args.carve + carve_freelists = args.carve_freelists + + # Check to see if carve freelists was set without setting carve + if not carve and carve_freelists: + log_message = "The carve option was not set but the carve_freelists option was. Disabling carve_freelists. " \ + "Please specify the carve option to enable." + logger.warn(log_message) + warn(log_message, RuntimeWarning) + + # Specific tables to be carved + specified_tables_to_carve = [] + if args.tables: + specified_tables_to_carve = args.tables.split(",") + + if rollback_journal_exempted_tables and specified_tables_to_carve: + for table in rollback_journal_exempted_tables: + if table in specified_tables_to_carve: + print("Table: {} found in both exempted and specified tables. Please update the arguments correctly." 
+ .format(table)) + exit(0) + + # See if we need to generate signatures + generate_signatures = True if (carve or print_signatures) else False + signatures = None + + # Get all of the signatures (for tables only - not including "without rowid" and virtual tables) + if generate_signatures: + + signatures = {} + logger.debug("Generating table signatures.") + + for master_schema_entry in database.master_schema.master_schema_entries: + + # Only account for the specified tables + if specified_tables_to_carve and master_schema_entry.name not in specified_tables_to_carve: + continue + + """ + + Due to current implementation limitations we are restricting carving to table row types. + + Note: This is not allowing "without rowid" or virtual tables until further testing is done. + (Virtual tables tend to have a root page number of 0 with no data stored in the main table. Further + investigation is needed.) + + Note: Table internal schema objects will not be accounted for. These are tables that start with "sqlite_" + and are used for internal use to SQLite itself. These have never known to produce any forensic + pertinent data. + + """ + + if isinstance(master_schema_entry, OrdinaryTableRow): + + if master_schema_entry.without_row_id: + log_message = "A `without row_id` table was found: {} and will not have a signature generated " \ + "for carving since it is not supported yet.".format(master_schema_entry.table_name) + logger.info(log_message) + continue + + if master_schema_entry.internal_schema_object: + log_message = "A `internal schema` table was found: {} and will not have a signature generated " \ + "for carving since it is not supported yet.".format(master_schema_entry.table_name) + logger.info(log_message) + continue + + signatures[master_schema_entry.name] = Signature(version_history, master_schema_entry) + + if print_signatures: + print("\nSignature:\n{}".format(signatures[master_schema_entry.name] + .stringify("\t", False, False, False))) + + """ + + Note: Master schema entries (schema) are all pulled from the base version (the SQLite database file). Currently, + the master schema entries are taken from the base version. Even though schema additions are handled in the + WAL file for existing tables, tables added in the WAL have not been accounted for yet. + + """ + + # Export to text + if export_type == EXPORT_TYPES.TEXT: + print_text(output_directory, file_prefix, export_type, carve, carve_freelists, + specified_tables_to_carve, version_history, signatures, logger) + + # Export to csv + elif export_type == EXPORT_TYPES.CSV: + print_csv(output_directory, file_prefix, export_type, carve, carve_freelists, + specified_tables_to_carve, version_history, signatures, logger) + + # Export to sqlite + elif export_type == EXPORT_TYPES.SQLITE: + print_sqlite(output_directory, file_prefix, export_type, carve, carve_freelists, + specified_tables_to_carve, version_history, signatures, logger) + + # Export to xlsx + elif export_type == EXPORT_TYPES.XLSX: + print_xlsx(output_directory, file_prefix, export_type, carve, carve_freelists, + specified_tables_to_carve, version_history, signatures, logger) + + # The export type was not found (this should not occur due to the checking of argparse) + else: + raise SqliteError("Invalid option for export type: {}.".format(export_type)) + + # Carve the rollback journal if found and carving is not specified + if rollback_journal_file and not carve: + print("Rollback journal file found: {}. 
Rollback journal file parsing is under development and " + "currently only supports carving. Please rerun with the --carve option for this output.") + + # Carve the rollback journal if found and carving is specified + if rollback_journal_file and carve: + + if not output_directory: + + print("Rollback journal file found: {}. Rollback journal file carving is under development and " + "currently only outputs to CSV. Due to this, the output directory needs to be specified. Please" + "rerun with a output directory specified in order for this to complete.") + + else: + + print("Carving rollback journal file: {}. Rollback journal file carving is under development and " + "currently only outputs to CSV. Any export type specified will be overridden for this.") + + carve_rollback_journal(output_directory, rollback_journal_file, rollback_journal_file_name, + specified_tables_to_carve, rollback_journal_exempted_tables, + version_history, signatures, logger) + + print("Finished in {} seconds.".format(round(time() - start_time, 2))) + + +def print_text(output_directory, file_prefix, export_type, carve, carve_freelists, specified_tables_to_carve, + version_history, signatures, logger): + + if output_directory: + + file_postfix = ".txt" + text_file_name = file_prefix + file_postfix + + # Export all index and table histories to a text file while supplying signature to carve with + print("\nExporting history as {} to {}{}{}...".format(export_type, output_directory, sep, text_file_name)) + logger.debug("Exporting history as {} to {}{}{}." + .format(export_type, output_directory, sep, text_file_name)) + + with CommitTextExporter(output_directory, text_file_name) as commit_text_exporter: + + for master_schema_entry in version_history.versions[BASE_VERSION_NUMBER]\ + .master_schema.master_schema_entries: + + # Only account for the specified tables + if specified_tables_to_carve and master_schema_entry.name not in specified_tables_to_carve: + continue + + if master_schema_entry.row_type in [MASTER_SCHEMA_ROW_TYPE.INDEX, MASTER_SCHEMA_ROW_TYPE.TABLE]: + + signature = None + if carve: + signature = signatures[master_schema_entry.name] if master_schema_entry.name in signatures\ + else None + + if not signature and master_schema_entry.row_type is MASTER_SCHEMA_ROW_TYPE.TABLE \ + and not master_schema_entry.without_row_id \ + and not master_schema_entry.internal_schema_object: + print("Unable to find signature for: {}. This table will not be carved." + .format(master_schema_entry.name)) + logger.error("Unable to find signature for: {}. This table will not be carved." 
+ .format(master_schema_entry.name)) + + if signature: + version_history_parser = VersionHistoryParser(version_history, master_schema_entry, None, None, + signature, carve_freelists) + else: + version_history_parser = VersionHistoryParser(version_history, master_schema_entry) + + page_type = version_history_parser.page_type + commit_text_exporter.write_header(master_schema_entry, page_type) + + for commit in version_history_parser: + commit_text_exporter.write_commit(commit) + + else: + + # Export all index and table histories to csv files while supplying signature to carve with + logger.debug("Exporting history to {} as {}.".format("console", export_type)) + + for master_schema_entry in version_history.versions[BASE_VERSION_NUMBER].master_schema.master_schema_entries: + + # Only account for the specified tables + if specified_tables_to_carve and master_schema_entry.name not in specified_tables_to_carve: + continue + + if master_schema_entry.row_type in [MASTER_SCHEMA_ROW_TYPE.INDEX, MASTER_SCHEMA_ROW_TYPE.TABLE]: + + signature = None + if carve: + signature = signatures[master_schema_entry.name] if master_schema_entry.name in signatures else None + + if not signature and master_schema_entry.row_type is MASTER_SCHEMA_ROW_TYPE.TABLE \ + and not master_schema_entry.without_row_id \ + and not master_schema_entry.internal_schema_object: + print("Unable to find signature for: {}. This table will not be carved." + .format(master_schema_entry.name)) + logger.error("Unable to find signature for: {}. This table will not be carved." + .format(master_schema_entry.name)) + + if signature: + version_history_parser = VersionHistoryParser(version_history, master_schema_entry, None, None, + signature, carve_freelists) + else: + version_history_parser = VersionHistoryParser(version_history, master_schema_entry) + + page_type = version_history_parser.page_type + CommitConsoleExporter.write_header(master_schema_entry, page_type) + + for commit in version_history_parser: + CommitConsoleExporter.write_commit(commit) + + +def print_csv(output_directory, file_prefix, export_type, carve, carve_freelists, specified_tables_to_carve, + version_history, signatures, logger): + + # Export all index and table histories to csv files while supplying signature to carve with + print("\nExporting history as {} to {}...".format(export_type, output_directory)) + logger.debug("Exporting history to {} as {}.".format(output_directory, export_type)) + + commit_csv_exporter = CommitCsvExporter(output_directory, file_prefix) + + for master_schema_entry in version_history.versions[BASE_VERSION_NUMBER].master_schema.master_schema_entries: + + # Only account for the specified tables + if specified_tables_to_carve and master_schema_entry.name not in specified_tables_to_carve: + continue + + if master_schema_entry.row_type in [MASTER_SCHEMA_ROW_TYPE.INDEX, MASTER_SCHEMA_ROW_TYPE.TABLE]: + + signature = None + if carve: + signature = signatures[master_schema_entry.name] if master_schema_entry.name in signatures else None + + if not signature and master_schema_entry.row_type is MASTER_SCHEMA_ROW_TYPE.TABLE \ + and not master_schema_entry.without_row_id \ + and not master_schema_entry.internal_schema_object: + print("Unable to find signature for: {}. This table will not be carved." + .format(master_schema_entry.name)) + logger.error("Unable to find signature for: {}. This table will not be carved." 
+ .format(master_schema_entry.name)) + + if signature: + version_history_parser = VersionHistoryParser(version_history, master_schema_entry, None, None, + signature, carve_freelists) + else: + version_history_parser = VersionHistoryParser(version_history, master_schema_entry) + + for commit in version_history_parser: + commit_csv_exporter.write_commit(master_schema_entry, commit) + + +def print_sqlite(output_directory, file_prefix, export_type, carve, carve_freelists, + specified_tables_to_carve, version_history, signatures, logger): + + file_postfix = "-sqlite-dissect.db3" + sqlite_file_name = file_prefix + file_postfix + + print("\nExporting history as {} to {}{}{}...".format(export_type, output_directory, sep, sqlite_file_name)) + logger.debug("Exporting history as {} to {}{}{}.".format(export_type, output_directory, sep, sqlite_file_name)) + + with CommitSqliteExporter(output_directory, sqlite_file_name) as commit_sqlite_exporter: + + for master_schema_entry in version_history.versions[BASE_VERSION_NUMBER].master_schema.master_schema_entries: + + # Only account for the specified tables + if specified_tables_to_carve and master_schema_entry.name not in specified_tables_to_carve: + continue + + if master_schema_entry.row_type in [MASTER_SCHEMA_ROW_TYPE.INDEX, MASTER_SCHEMA_ROW_TYPE.TABLE]: + + signature = None + if carve: + signature = signatures[master_schema_entry.name] if master_schema_entry.name in signatures else None + + if not signature and master_schema_entry.row_type is MASTER_SCHEMA_ROW_TYPE.TABLE \ + and not master_schema_entry.without_row_id \ + and not master_schema_entry.internal_schema_object: + print("Unable to find signature for: {}. This table will not be carved." + .format(master_schema_entry.name)) + logger.error("Unable to find signature for: {}. This table will not be carved." 
+ .format(master_schema_entry.name)) + + if signature: + version_history_parser = VersionHistoryParser(version_history, master_schema_entry, None, None, + signature, carve_freelists) + else: + version_history_parser = VersionHistoryParser(version_history, master_schema_entry) + + for commit in version_history_parser: + commit_sqlite_exporter.write_commit(master_schema_entry, commit) + + +def print_xlsx(output_directory, file_prefix, export_type, carve, carve_freelists, specified_tables_to_carve, + version_history, signatures, logger): + + file_postfix = ".xlsx" + xlsx_file_name = file_prefix + file_postfix + + # Export all index and table histories to a xlsx workbook while supplying signature to carve with + print("\nExporting history as {} to {}{}{}...".format(export_type, output_directory, sep, xlsx_file_name)) + logger.debug("Exporting history as {} to {}{}{}.".format(export_type, output_directory, sep, xlsx_file_name)) + + with CommitXlsxExporter(output_directory, xlsx_file_name) as commit_xlsx_exporter: + + for master_schema_entry in version_history.versions[BASE_VERSION_NUMBER].master_schema.master_schema_entries: + + # Only account for the specified tables + if specified_tables_to_carve and master_schema_entry.name not in specified_tables_to_carve: + continue + + if master_schema_entry.row_type in [MASTER_SCHEMA_ROW_TYPE.INDEX, MASTER_SCHEMA_ROW_TYPE.TABLE]: + + signature = None + if carve: + signature = signatures[master_schema_entry.name] if master_schema_entry.name in signatures else None + + if not signature and master_schema_entry.row_type is MASTER_SCHEMA_ROW_TYPE.TABLE \ + and not master_schema_entry.without_row_id \ + and not master_schema_entry.internal_schema_object: + print("Unable to find signature for: {}. This table will not be carved." + .format(master_schema_entry.name)) + logger.error("Unable to find signature for: {}. This table will not be carved." + .format(master_schema_entry.name)) + + if signature: + version_history_parser = VersionHistoryParser(version_history, master_schema_entry, None, None, + signature, carve_freelists) + else: + version_history_parser = VersionHistoryParser(version_history, master_schema_entry) + + for commit in version_history_parser: + commit_xlsx_exporter.write_commit(master_schema_entry, commit) + + +def carve_rollback_journal(output_directory, rollback_journal_file, rollback_journal_file_name, + specified_tables_to_carve, rollback_journal_exempted_tables, + version_history, signatures, logger): + + """ + + Carve the Rollback Journal file (Under Development) + + Note: Since there is no normal parsing of the rollback journal file implemented yet, this is only done when + carving is specified. Also, since we are blindly carving each page in the rollback journal currently, + we are not checking for pointer map pages, freelist pages, and so on. Therefore, we do not care about the + carve_freelist_pages option here. The rollback journal file is being carved as it were all unallocated space. 
+ + """ + + csv_prefix_rollback_journal_file_name = basename(normpath(rollback_journal_file_name)) + print("Exporting rollback journal carvings as CSV to {}...".format(output_directory)) + logger.debug("Exporting rollback journal carvings as csv to output directory: {}.".format(output_directory)) + + commit_csv_exporter = CommitCsvExporter(output_directory, csv_prefix_rollback_journal_file_name) + + for master_schema_entry in version_history.versions[BASE_VERSION_NUMBER].master_schema.master_schema_entries: + + # Only account for the specified tables + if specified_tables_to_carve and master_schema_entry.name not in specified_tables_to_carve: + continue + + if master_schema_entry.name in rollback_journal_exempted_tables: + logger.debug("Skipping exempted table: {} from rollback journal parsing.".format(master_schema_entry.name)) + continue + + """ + + Only account for OrdinaryTableRow objects (not VirtualTableRow objects) that are not "without rowid" tables. + All signatures generated will not be outside this criteria either. + + """ + + if isinstance(master_schema_entry, OrdinaryTableRow) and not master_schema_entry.without_row_id: + + signature = None + if signatures and master_schema_entry.name in signatures: + signature = signatures[master_schema_entry.name] + + # Make sure we found the error but don't error out if we don't. Alert the user. + if not signature and master_schema_entry.row_type is MASTER_SCHEMA_ROW_TYPE.TABLE \ + and not master_schema_entry.without_row_id \ + and not master_schema_entry.internal_schema_object: + print("Unable to find signature for: {}. This table will not be carved from the rollback journal." + .format(master_schema_entry.name)) + logger.error("Unable to find signature for: {}. This table will not be carved from the " + "rollback journal.".format(master_schema_entry.name)) + + else: + + # Carve the rollback journal with the signature + carved_commits = RollBackJournalCarver.carve(rollback_journal_file, + version_history.versions[BASE_VERSION_NUMBER], + master_schema_entry, signature) + + for commit in carved_commits: + commit_csv_exporter.write_commit(master_schema_entry, commit) + + +if __name__ == "__main__": + + description = "SQLite Dissect is a SQLite parser with recovery abilities over SQLite databases " \ + "and their accompanying journal files. If no options are set other than the file " \ + "name, the default behaviour will be to check for any journal files and print to " \ + "the console the output of the SQLite files. The directory of the SQLite file " \ + "specified will be searched through to find the associated journal files. If " \ + "they are not in the same directory as the specified file, they will not be found " \ + "and their location will need to be specified in the command. SQLite carving " \ + "will not be done by default. Please see the options below to enable carving." 
+ + parser = ArgumentParser(description=description) + + parser.add_argument("sqlite_file", metavar="SQLITE_FILE", help="The SQLite database file") + + parser.add_argument("-v", "--version", action="version", version="version {version}".format(version=__version__), + help="display the version of SQLite Dissect") + parser.add_argument("-d", "--directory", metavar="OUTPUT_DIRECTORY", help="directory to write output to " + "(must be specified for outputs other " + "than console text)") + parser.add_argument("-p", "--file-prefix", default="", metavar="FILE_PREFIX", + help="the file prefix to use on output files, default is the name of the SQLite " + "file (the directory for output must be specified)") + parser.add_argument("-e", "--export", choices=["text", "csv", "sqlite", "xlsx"], default="text", + metavar="EXPORT_TYPE", + help="the format to export to {text, csv, sqlite, xlsx} (text written to console if -d " + "is not specified)") + + journal_group = parser.add_mutually_exclusive_group() + journal_group.add_argument("-n", "--no-journal", action="store_true", default=False, + help="turn off automatic detection of journal files") + journal_group.add_argument("-w", "--wal", + help="the wal file to use instead of searching the SQLite file directory by default") + journal_group.add_argument("-j", "--rollback-journal", + help="the rollback journal file to use in carving instead of searching the SQLite file " + "directory by default (under development, currently only outputs to csv, output " + "directory needs to be specified)") + + parser.add_argument("-r", "--exempted-tables", metavar="EXEMPTED_TABLES", + help="comma-delimited string of tables [table1,table2,table3] to exempt (only implemented " + "and allowed for rollback journal parsing currently) ex.) table1,table2,table3") + + parser.add_argument("-s", "--schema", action="store_true", + help="output the schema to console, the initial schema found in the main database file") + parser.add_argument("-t", "--schema-history", action="store_true", + help="output the schema history to console, prints the --schema information and " + "write-head log changes") + + parser.add_argument("-g", "--signatures", action="store_true", + help="output the signatures generated to console") + + parser.add_argument("-c", "--carve", action="store_true", default=False, + help="carves and recovers table data") + parser.add_argument("-f", "--carve-freelists", action="store_true", default=False, + help="carves freelist pages (carving must be enabled, under development)") + + parser.add_argument("-b", "--tables", metavar="TABLES", + help="specified comma-delimited string of tables [table1,table2,table3] to carve " + "ex.) 
table1,table2,table3") + + parser.add_argument("-k", "--disable-strict-format-checking", action="store_true", default=False, + help="disable strict format checks for SQLite databases " + "(this may result in improperly parsed SQLite files)") + + logging_group = parser.add_mutually_exclusive_group() + logging_group.add_argument("-l", "--log-level", default="off", + choices=["critical", "error", "warning", "info", "debug", "off"], + metavar="LOG_LEVEL", + help="level to log messages at {critical, error, warning, info, debug, off}") + parser.add_argument("-i", "--log-file", default=None, metavar="LOG_FILE", + help="log file to write too, default is to " + "write to console, ignored if log " + "level set to off (appends if file " + "already exists)") + + parser.add_argument("--warnings", action="store_true", default=False, help="enable runtime warnings") + + # Call the main function + main(parser.parse_args()) diff --git a/pyinstaller/sqlite_dissect_linux-x64_onedir.spec b/pyinstaller/sqlite_dissect_linux-x64_onedir.spec new file mode 100644 index 0000000..c400d13 --- /dev/null +++ b/pyinstaller/sqlite_dissect_linux-x64_onedir.spec @@ -0,0 +1,41 @@ +# Initially generated with the "pyinstaller main.py" command. Altered after for minor changes. +# Consecutively run after modifications from the project root directory as: +# pyinstaller pyinstaller\sqlite_dissect_linux-x64_onedir.spec +# Please see https://github.com/pyinstaller/pyinstaller/issues/5540 if errors with the ldconfig are encountered. +# -*- mode: python -*- + +import PyInstaller.config + +PyInstaller.config.CONF['distpath'] = "./dist/linux-x64" + +block_cipher = None + + +a = Analysis(['../main.py'], + pathex=[], + binaries=[], + datas=[], + hiddenimports=[], + hookspath=[], + runtime_hooks=[], + excludes=[], + win_no_prefer_redirects=False, + win_private_assemblies=False, + cipher=block_cipher) +pyz = PYZ(a.pure, a.zipped_data, + cipher=block_cipher) +exe = EXE(pyz, + a.scripts, + exclude_binaries=True, + name='sqlite_dissect', + debug=False, + strip=False, + upx=True, + console=True ) +coll = COLLECT(exe, + a.binaries, + a.zipfiles, + a.datas, + strip=False, + upx=True, + name='sqlite_dissect') diff --git a/pyinstaller/sqlite_dissect_linux-x64_onefile.spec b/pyinstaller/sqlite_dissect_linux-x64_onefile.spec new file mode 100644 index 0000000..82dd684 --- /dev/null +++ b/pyinstaller/sqlite_dissect_linux-x64_onefile.spec @@ -0,0 +1,37 @@ +# Initially generated with the "pyinstaller main.py --onefile" command. Altered after for minor changes. +# Consecutively run after modifications from the project root directory as: +# pyinstaller pyinstaller\sqlite_dissect_linux-x64_onefile.spec +# Please see https://github.com/pyinstaller/pyinstaller/issues/5540 if errors with the ldconfig are encountered. 
+# -*- mode: python -*- + +import PyInstaller.config + +PyInstaller.config.CONF['distpath'] = "./dist/linux-x64/bin" + +block_cipher = None + + +a = Analysis(['../main.py'], + pathex=[], + binaries=[], + datas=[], + hiddenimports=[], + hookspath=[], + runtime_hooks=[], + excludes=[], + win_no_prefer_redirects=False, + win_private_assemblies=False, + cipher=block_cipher) +pyz = PYZ(a.pure, a.zipped_data, + cipher=block_cipher) +exe = EXE(pyz, + a.scripts, + a.binaries, + a.zipfiles, + a.datas, + name='sqlite_dissect', + debug=False, + strip=False, + upx=True, + runtime_tmpdir=None, + console=True ) diff --git a/pyinstaller/sqlite_dissect_win-x86_64_onedir.spec b/pyinstaller/sqlite_dissect_win-x86_64_onedir.spec new file mode 100644 index 0000000..9daa043 --- /dev/null +++ b/pyinstaller/sqlite_dissect_win-x86_64_onedir.spec @@ -0,0 +1,40 @@ +# Initially generated with the "pyinstaller main.py" command. Altered after for minor changes. +# Consecutively run after modifications from the project root directory as: +# pyinstaller pyinstaller\sqlite_dissect_win-x86_64_onedir.spec +# -*- mode: python -*- + +import PyInstaller.config + +PyInstaller.config.CONF['distpath'] = "./dist/win-x86_64" + +block_cipher = None + + +a = Analysis(['../main.py'], + pathex=[], + binaries=[], + datas=[], + hiddenimports=[], + hookspath=[], + runtime_hooks=[], + excludes=[], + win_no_prefer_redirects=False, + win_private_assemblies=False, + cipher=block_cipher) +pyz = PYZ(a.pure, a.zipped_data, + cipher=block_cipher) +exe = EXE(pyz, + a.scripts, + exclude_binaries=True, + name='sqlite_dissect', + debug=False, + strip=False, + upx=True, + console=True ) +coll = COLLECT(exe, + a.binaries, + a.zipfiles, + a.datas, + strip=False, + upx=True, + name='sqlite_dissect') diff --git a/pyinstaller/sqlite_dissect_win-x86_64_onefile.spec b/pyinstaller/sqlite_dissect_win-x86_64_onefile.spec new file mode 100644 index 0000000..1ca52aa --- /dev/null +++ b/pyinstaller/sqlite_dissect_win-x86_64_onefile.spec @@ -0,0 +1,36 @@ +# Initially generated with the "pyinstaller main.py --onefile" command. Altered after for minor changes. +# Consecutively run after modifications from the project root directory as: +# pyinstaller pyinstaller\sqlite_dissect_win-x86_64_onefile.spec +# -*- mode: python -*- + +import PyInstaller.config + +PyInstaller.config.CONF['distpath'] = "./dist/win-x86_64/bin" + +block_cipher = None + + +a = Analysis(['../main.py'], + pathex=[], + binaries=[], + datas=[], + hiddenimports=[], + hookspath=[], + runtime_hooks=[], + excludes=[], + win_no_prefer_redirects=False, + win_private_assemblies=False, + cipher=block_cipher) +pyz = PYZ(a.pure, a.zipped_data, + cipher=block_cipher) +exe = EXE(pyz, + a.scripts, + a.binaries, + a.zipfiles, + a.datas, + name='sqlite_dissect', + debug=False, + strip=False, + upx=True, + runtime_tmpdir=None, + console=True ) diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..c265a6f --- /dev/null +++ b/setup.py @@ -0,0 +1,41 @@ +from setuptools import setup +from _version import __version__ + +""" + +setup.py + +This script will be used to setup the sqlite_dissect package for use in python environments. + +Note: To compile a distribution for the project run "python setup.py sdist" in the directory this file is located in. 
+ +Note: openpyxl is needed for the xlsx export and will install jdcal and et-xmlfile ["openpyxl>=2.4.0b1"] + +Note: PyInstaller is used for generation of executables but not included in this setup.py script and will + install altgraph, dis3, macholib, pefile, pypiwin32, pywin32 as dependencies. [pyinstaller==3.6 needs to be used + for Python 2.7 since the newer versions of PyInstaller of 4.0+ require Python 3.6] Information on how to run + PyInstaller is included in the spec files under the pyinstaller directory. Four files are here, two for windows + and two for linux, both for x64 platforms. The two different files for each allow you to build it as one single + file or a directory of decompressed files. Since the one file extracts to a temp directory in order to run, on + some systems this may be blocked and therefore the directory of files is preferred. + +""" + +setup(name="sqlite_dissect", + version=__version__, + url="https://github.com/Defense-Cyber-Crime-Center/sqlite-dissect", + description="This package allows parsing and carving of sqlite files", + author="Defense Cyber Crime Center (DC3)", + author_email="TSD@dc3.mil", + packages=["sqlite_dissect", + "sqlite_dissect.file", + "sqlite_dissect.file.database", + "sqlite_dissect.file.journal", + "sqlite_dissect.file.schema", + "sqlite_dissect.file.wal", + "sqlite_dissect.file.wal_index", + "sqlite_dissect.carving", + "sqlite_dissect.export"], + install_requires=["openpyxl>=2.4.0b1"], + zip_safe=False + ) diff --git a/sqlite_dissect/README.md b/sqlite_dissect/README.md new file mode 100644 index 0000000..a02d3b5 --- /dev/null +++ b/sqlite_dissect/README.md @@ -0,0 +1,205 @@ + +# sqlite_dissect + +This package will have scripts for overall usage throughout the SQLite Dissect library allowing the functionality +to parse through the data and access to underlying functions through an interface. + +The init script will initialize the logger for this library with a NullHandler to prevent unexpected output +from applications that may not be implementing logging. It will also ignore warnings reported by the python +warning by default. (Warnings are also thrown to the logger when they occur in addition to the warnings +framework.) + +>Note: This library will use warnings for things that may not be fully implemented or handled yet. (In other cases, +> NotImplementedErrors may be raised.) To turn off warnings use the "-W ignore" option. See the Python +> documentation for further options. + +- constants.py +- exception.py +- interface.py +- output.py +- utilities.py +- version_history.py + +TODO items for the "sqlite_dissect" package: + +- [ ] Finish UML class diagrams. +- [ ] \_\_init\_\_.py: Create a raise exception function to call to reduce lines of code that will log inside of it. +- [ ] \_\_init\_\_.py: Create global static variables to be used for store_in_memory, strict_format_checking, etc. +- [ ] \_\_init\_\_.py: Implement strict_format_checking into journal, other types besides database, wal +- [ ] \_\_init\_\_.py: Investigate differences in use of logging.warn vs. warning.warn. +- [ ] \_\_init\_\_.py: Create custom warnings for the library. + +
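+
+As a usage illustration of the logging and warning behaviour described at the top of this package overview, the
+following minimal sketch shows how a consuming script might attach its own handler to the library logger and restore
+runtime warnings. The handler type, format string, and log level are illustrative choices and not part of the
+library; only the LOGGER_NAME constant comes from sqlite_dissect.constants.
+
+    import logging
+    import warnings
+
+    from sqlite_dissect.constants import LOGGER_NAME
+
+    # The library only installs a NullHandler, so attach a real handler to see its log output.
+    handler = logging.StreamHandler()
+    handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
+
+    logger = logging.getLogger(LOGGER_NAME)
+    logger.addHandler(handler)
+    logger.setLevel(logging.DEBUG)
+
+    # Warnings are ignored by default; reset the filters if they are wanted at runtime.
+    warnings.resetwarnings()
+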
+ +### constants.py + +This script holds constants defined for reference by the sqlite carving library. Additionally, a class has been +added to this script for constant enumerations. + +This script holds the following object(s): +- Enum(MutableMapping) +

+ +TODO: +- [ ] Documentation improvements. +- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators. + +
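+
+As a small illustration of how these constants are referenced throughout the library, the sketch below mirrors the
+journal file detection performed in main.py; the database path is hypothetical.
+
+    from os.path import exists
+
+    from sqlite_dissect.constants import ROLLBACK_JOURNAL_POSTFIX
+    from sqlite_dissect.constants import WAL_FILE_POSTFIX
+
+    # Look for journal files that may accompany a database by appending the postfix constants.
+    sqlite_file = "/path/to/example.sqlite"  # hypothetical path
+    if exists(sqlite_file + WAL_FILE_POSTFIX):
+        print("Found a wal file: {}".format(sqlite_file + WAL_FILE_POSTFIX))
+    if exists(sqlite_file + ROLLBACK_JOURNAL_POSTFIX):
+        print("Found a rollback journal file: {}".format(sqlite_file + ROLLBACK_JOURNAL_POSTFIX))
+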
+ +### exception.py + +This script holds the custom exceptions used in this library. + +This script holds the following object(s): +- SqliteError(Exception) +- ParsingError(SqliteError) +- HeaderParsingError(ParsingError) +- MasterSchemaParsingError(ParsingError) +- MasterSchemaRowParsingError(MasterSchemaParsingError) +- PageParsingError(ParsingError) +- BTreePageParsingError(PageParsingError) +- CellParsingError(BTreePageParsingError) +- RecordParsingError(CellParsingError) +- VersionParsingError(ParsingError) +- DatabaseParsingError(VersionParsingError) +- WalParsingError(VersionParsingError) +- WalFrameParsingError(WalParsingError) +- WalCommitRecordParsingError(WalParsingError) +- SignatureError(SqliteError) +- CarvingError(SqliteError) +- CellCarvingError(CarvingError) +- InvalidVarIntError(CarvingError) +- OutputError(SqliteError) +- ExportError(SqliteError) +

+ +TODO: +- [ ] Documentation improvements. + +
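+
+Since every exception above ultimately derives from SqliteError, callers can catch the specific subclasses they care
+about and fall back to the base class for everything else. The sketch below is illustrative only; the file path is
+hypothetical and whether an exception is actually raised depends on the file being parsed.
+
+    from sqlite_dissect.exception import ParsingError
+    from sqlite_dissect.exception import SqliteError
+    from sqlite_dissect.interface import create_database
+
+    try:
+        database = create_database("/path/to/example.sqlite")  # hypothetical path
+    except ParsingError as parsing_error:
+        # ParsingError and its subclasses indicate a structural problem with the file itself.
+        print("Parsing failed: {}".format(parsing_error))
+    except SqliteError as sqlite_error:
+        # SqliteError is the base class for every exception raised by this library.
+        print("SQLite Dissect error: {}".format(sqlite_error))
+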
+ +### interface.py + +This script acts as a simplified interface for common operations for the sqlite carving library. + +This script holds the following object(s): +- create_database(file_identifier, store_in_memory=False, strict_format_checking=True) +- create_write_ahead_log(file_name, file_object=None) +- create_version_history(database, write_ahead_log=None) +- get_table_names(database) +- get_index_names(database) +- select_all_from_table(table_name, version) +- select_all_from_index(index_name, version) +- create_table_signature(table_name, version, version_history=None) +- carve_table(table_name, signature, version) +- get_version_history_iterator(table_or_index_name, version_history, signature=None) +- export_table_or_index_version_history_to_csv(export_directory, version_history, table_or_index_name, signature=None, carve_freelist_pages=False) +- export_version_history_to_csv(export_directory, version_history, signatures=None, carve_freelist_pages=False) +- export_table_or_index_version_history_to_sqlite(export_directory, sqlite_file_name, version_history, table_or_index_name, signature=None, carve_freelist_pages=False): +- export_version_history_to_sqlite(export_directory, sqlite_file_name, version_history, signatures=None, carve_freelist_pages=False): +

+ +TODO: +- [ ] Documentation improvements. +- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators. +- [ ] Account for schema changes across the versions. +- [ ] Implement index signatures. +- [ ] Update documentation on the BASE_VERSION_NUMBER where it is used. +- [ ] create_table_signature: Note on how the version history is recommended if possible. + +
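+
+A minimal usage sketch for the interface functions listed above is shown below. The file and directory paths are
+hypothetical, and it assumes a wal file exists alongside the database; without one, the version history can be
+created from the database alone.
+
+    from sqlite_dissect.interface import create_database
+    from sqlite_dissect.interface import create_version_history
+    from sqlite_dissect.interface import create_write_ahead_log
+    from sqlite_dissect.interface import export_version_history_to_csv
+    from sqlite_dissect.interface import get_table_names
+
+    sqlite_file = "/path/to/example.sqlite"  # hypothetical paths
+    export_directory = "/path/to/output"
+
+    database = create_database(sqlite_file)
+    write_ahead_log = create_write_ahead_log(sqlite_file + "-wal")
+    version_history = create_version_history(database, write_ahead_log)
+
+    print("Tables: {}".format(get_table_names(database)))
+
+    # Export the history of every table and index to csv files in the export directory.
+    export_version_history_to_csv(export_directory, version_history)
+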
+ +### output.py + +This script holds general output functions used for debugging, logging, and general output for the +sqlite carving library. + +This script holds the following object(s): +- get_page_breakdown(pages) +- get_pointer_map_entries_breakdown(version) +- stringify_b_tree(version_interface, b_tree_root_page, padding="") +- stringify_cell_record(cell, database_text_encoding, page_type) +- stringify_cell_records(cells, database_text_encoding, page_type) +- stringify_master_schema_version(version) +- stringify_master_schema_versions(version_history) +- stringify_page_history(version_history, padding="") +- stringify_page_information(version, padding="") +- stringify_page_structure(version, padding="") +- stringify_unallocated_space(version, padding="", include_whitespace=True, whitespace_threshold=0) +- stringify_version_pages(version, padding="") +

+ +TODO: +- [ ] Documentation improvements. +- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators. +- [ ] Implement better exception handling when parsing objects. +- [ ] Make sure different encodings are handled in every function in this script where applicable. +- [ ] get_pointer_map_entries_breakdown: Handle the pointer map page breakdown tuple better. +- [ ] stringify_unallocated_space: Implement a whitespace threshold for trimming, etc. + +
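+
+The stringify functions return printable summaries rather than writing output themselves, so they can be combined
+with the interface functions. The sketch below mirrors the calls made in main.py for the --schema and
+--schema-history options; the database path is hypothetical.
+
+    from sqlite_dissect.interface import create_database
+    from sqlite_dissect.interface import create_version_history
+    from sqlite_dissect.output import stringify_master_schema_version
+    from sqlite_dissect.output import stringify_master_schema_versions
+
+    database = create_database("/path/to/example.sqlite")  # hypothetical path
+    version_history = create_version_history(database)
+
+    # Print the master schema of the base database file and then its history across versions.
+    print(stringify_master_schema_version(database))
+    print(stringify_master_schema_versions(version_history))
+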
+ +### utilities.py + +This script holds general utility functions for reference by the sqlite carving library. + +This script holds the following object(s): +- calculate_expected_overflow(overflow_byte_size, page_size) +- decode_varint(byte_array, offset) +- encode_varint(value) +- get_class_instance(class_name) +- get_md5_hash(string) +- get_record_content(serial_type, record_body, offset=0) +- get_serial_type_signature(serial_type) +- get_storage_class(serial_type) +- has_content(byte_array) +

+ +TODO: +- [ ] Documentation improvements. +- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators. +- [ ] Implement try/except exception handling for struct.error and ord. +- [ ] The varint related functions only work in big endian. Are there use cases for little endian? + +
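+
+The varint helpers above operate on the variable-length integer encoding used throughout the SQLite file format.
+The standalone sketch below restates that encoding for reference; it is not the library's decode_varint
+implementation, and the (value, next offset) return convention is only chosen for the illustration.
+
+    def illustrate_decode_varint(byte_array, offset=0):
+        # Big-endian base-128: the high bit of each of the first eight bytes signals that
+        # another byte follows; a ninth byte, if reached, contributes all eight of its bits.
+        value = 0
+        for index in range(9):
+            byte = ord(byte_array[offset + index:offset + index + 1])
+            if index == 8:
+                return (value << 8) | byte, offset + 9
+            value = (value << 7) | (byte & 0x7f)
+            if not byte & 0x80:
+                return value, offset + index + 1
+
+    # 0x81 0x00 encodes the value 128 in two bytes.
+    print(illustrate_decode_varint(b"\x81\x00"))  # -> (128, 2)
+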
+ +### version_history.py + +This script holds the superclass objects used for parsing the database and write ahead log in a sequence of versions +throughout all of the commit records in the write ahead log. + +This script holds the following object(s): +- VersionHistory(object) +- VersionHistoryParser(VersionParser) (with VersionHistoryParserIterator(object) as an inner class) +- Commit(object) +

+ +TODO: +- [ ] Documentation improvements. +- [ ] Incorporate journal files once they are implemented. +- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators. +- [ ] Handle exceptions that may be raised from creating and working with objects better. + ##### VersionHistory Class: + - [ ] Better exception handling when creating objects such as commit records, etc. + - [ ] Investigate what occurs if the last commit record is not committed (warning currently thrown). + ##### VersionHistoryParser Class: + - [ ] Support the same master schema entry being deleted and then re-added (Keep in mind row id). + - [ ] How to handle master schema entries not found in specified versions? + - [ ] Support for virtual table modules of master schema entry table type. + - [ ] Support for "without rowid" tables (index b-tree pages). + - [ ] Support for index b-trees that are internal schema objects with no SQL. + - [ ] Investigate issues with same rows in index b-tree leaf pages that might get deleted. + - [ ] Track pages being moved to the freelist to account for carving with other signatures? + - [ ] Handle master schema entries that have no entries (view, trigger, etc.) in the iterator. + - [ ] Handle master schema entries that are not supported yet (virtual, etc.) in the iterator. + - [ ] Use accounted for cell digests for deleted cells in the aggregate leaf cells function? + - [ ] How to detect index leaf page cell updates (file offset may not work and no row id). + - [ ] Is checking on the row id sufficient for detecting updates on table leaf pages for cells. + - [ ] Does this class belong here and should carving be incorporated or separate to this class? + - [ ] Have a better way to specify if carving was enabled or not (possibly in Commit?). + - [ ] VersionParserIterator: Investigate what to return for version with no modification. + - [ ] VersionParserIterator: Extend carving capabilities beyond tables once implemented. + - [ ] VersionParserIterator: Check carvings are correctly being detected as duplicates per md5. + - [ ] VersionParserIterator: Use dictionary comprehension for added and deleted cells for loops. + ##### Commit Class: + - [ ] Handle the updated property differently depending on differences in b-tree and freelist changes. diff --git a/sqlite_dissect/__init__.py b/sqlite_dissect/__init__.py new file mode 100644 index 0000000..2114bac --- /dev/null +++ b/sqlite_dissect/__init__.py @@ -0,0 +1,48 @@ +import logging +import warnings +from sqlite_dissect.constants import LOGGER_NAME + +""" + +__init__.py + +This package will have scripts for overall usage throughout the SQLite Dissect library allowing the functionality +to parse through the data and access to underlying functions through an interface. + +This init script will initialize the logger for this library with a NullHandler to prevent unexpected output +from applications that may not be implementing logging. It will also ignore warnings reported by the python +warning by default. (Warnings are also thrown to the logger when they occur in addition to the warnings +framework.) + +Note: This library will use warnings for things that may not be fully implemented or handled yet. (In other cases, + NotImplementedErrors may be raised.) To turn off warnings use the "-W ignore" option. See the Python + documentation for further options. 
+ +""" + + +# Import interface as api +from sqlite_dissect.interface import * + + +def null_logger(): + try: + + # Import the NullHandler from the logging package + from logging import NullHandler + + except ImportError: + + # Make our own if an error occurring while importing + class NullHandler(logging.Handler): + + def emit(self, record): + pass + + # Get the logger from the LOGGER_NAME constant and add the NullHandler to it + logging.getLogger(LOGGER_NAME).addHandler(NullHandler()) + + logging.getLogger(LOGGER_NAME).propagate = False + + # Ignore warnings by default + warnings.filterwarnings("ignore") diff --git a/sqlite_dissect/carving/README.md b/sqlite_dissect/carving/README.md new file mode 100644 index 0000000..ee7ac5f --- /dev/null +++ b/sqlite_dissect/carving/README.md @@ -0,0 +1,224 @@ + +# sqlite_dissect.carving + +This package will control signature generation and carving of SQLite files. + +- carved_cell.py +- carver.py +- rollback_journal_carver.py +- signature.py +- utilities.py + +TODO items for the "carving" package: + +- [ ] Finish UML class diagrams. + +
+ +### carved_cell.py + +This script holds the objects used for carving cells from the unallocated and freeblock space in SQLite +b-tree pages used in conjunction with other classes in the carving package. These objects subclass their +respective higher level SQLite database object type and add to them while parsing the data in a different way. + +This script holds the following object(s): +- CarvedBTreeCell(BTreeCell) +- CarvedRecord(Payload) +- CarvedRecordColumn(RecordColumn) +

+ +TODO: +- [ ] Documentation improvements. +- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators. +- [ ] Investigate a way to account for overflow. +- [ ] Investigate if fragments exist, have any affect on carving. +- [ ] Subclass CarvedBTreeCell for b-tree cell types. +- [ ] Subclass CarvedRecord for freeblock and unallocated space algorithms for carving. +- [ ] Handle multi-byte varints (blob and text serial types) better. +- [ ] How to account for use cases where carved data is all 0x00 bytes. +- [ ] Handle use cases where the primary key is an integer and negative resulting in negative (9-byte) varints. +- [ ] Fix the start and end offset and account for the freeblock, freeblock_size, and next_freeblock_offset. +- [ ] For the first serial types need to cross reference first column if integer primary key in table b-tree leaf table == null 00 +- [ ] For the first serial types need to cross reference with row signatures (if not schema) (prob + focued + schema + first removed on unalloc etc) +- [ ] Address the row_id as being set initially to "Unknown" which was temporarily added for consistence with other cells (b-tree) and need to check other use cases. +- [ ] Check that the payload size is less than the length or else partial entry. +- [ ] Add better logging. +- [ ] Calculate or analyze MD5s of headers. +- [ ] Figure out how MD5 hashes will work on carved record, carved record columns, and carved b-tree cells. +- [ ] Look into the calculated body content size assuming one (correct) entry in the signature. +- [ ] Address header/body/etc byte sizes. +- [ ] Check size of record columns to expected columns. + ##### CarvedBTreeCell Class + - [ ] Remove the first column serial types now that the signature is sent in? + - [ ] handle the version and page version number correctly in reference to journal file parsing. + ##### CarvedRecord Class + - [ ] See if basing the first_serial_type off of other carved cells if found before redoing unallocated/freeblocks if possible. + - [ ] When checking the signature, see if is there a better way to utilize it if there are no entries like switching to the schema signature (b-tree leaf?). + - [ ] Address the truncated record column index/column name. + - [ ] Handle cutoff_offset relation to truncated and indexing. + - [ ] Handle overflow. + - [ ] Fragment parsing. + - [ ] Subclass types of cells, freeblock. + - [ ] What if the assumed preceding serial type is not in the first serial types sign (use prob?). + - [ ] Address issues that can occur when first_serial_type_varint_length != -1. + - [ ] Need documentation on how the serial type is always obtainable for freeblocks at least only if the next two bytes != size (ie. sub freeblock) if the start offset >= 2 and it is a freeblock. + - [ ] Check the equals (>= and <) for start offset >= 2 and is a freeblock while iterating through the carved record columns. + - [ ] Update debugging messages (for example, after except like with InvalidVarIntError) + - [ ] If string or blob may be able to iterate backwards until proper offsets are found and look into other use cases. + - [ ] Document use cases for first_column_serial_types (4?). + - [ ] Report size of missing data/columns/etc if truncated for carved_record_column objects. + - [ ] Look into sending unallocated byte size in the constructor for carved_record_column objects. + - [ ] Specify if the unallocated information is included or overwritten in the header for carved_record_column objects. 
+ - [ ] Document after adjusting the serial type definition size off of the first serial type specified for carved_record_column objects. + - [ ] Need documentation on the "32" number [ (9 - 4) + 9 + 9 + 9 ] = up to 32 bytes preceding (derived header byte size). + - [ ] Using the simplified_probabilistic_signature can give bad data. + - [ ] Fix when the serial type is 12 or 13. If the signatures is -1 or -2 should be 0->57 (min/max). + - [ ] Try doing a reverse search for row id and payload length (assuming 1 varint length for row id). + - [ ] Derive differences between derived payload and actual payload if actual is not found (and other fields). + - [ ] Need to reverse search for row id and payload length (assuming 1 varint length for row id). + ##### CarvedRecordColumn Class + - [ ] Incorporate absolute offsets. + - [ ] Calculate and set the md5 hex digest. + - [ ] Handle the value and md5 hex digest (and probably others) so values are sent into \_\_init\_\_? + - [ ] Handle table interior, index interior, index leaf, and additional use cases. + - [ ] Make sure string values are in the correct text encoding for the database. + - [ ] Use \_\_slots\_\_ or some other way to reduce memory since many of these objects will be created. + - [ ] Update documentation around the no bytes preceding note. + +
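+
+The carving logic above leans on the SQLite record format, in which a header of serial types determines how many
+body bytes each column occupies. The standalone sketch below restates that mapping for a hypothetical row; it is not
+the library's get_content_size or calculate_body_content_size implementation from the carving utilities.
+
+    def illustrative_serial_type_content_size(serial_type):
+        # Content sizes per the SQLite record format; serial types of 12 and greater are
+        # blobs (even) or text (odd) whose length is encoded in the serial type itself.
+        fixed_sizes = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 6, 6: 8, 7: 8, 8: 0, 9: 0}
+        if serial_type in fixed_sizes:
+            return fixed_sizes[serial_type]
+        if serial_type >= 12:
+            return (serial_type - 12) // 2 if serial_type % 2 == 0 else (serial_type - 13) // 2
+        raise ValueError("Serial types 10 and 11 are reserved and should not appear in a record header.")
+
+    # A 1-byte integer, a NULL, and a 4-byte text value give a body content size of 5 bytes.
+    serial_types = [1, 0, 21]
+    print(sum(illustrative_serial_type_content_size(serial_type) for serial_type in serial_types))  # -> 5
+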
+ +### carver.py + +This script holds carver objects for identifying and parsing out cells from unallocated and +freeblock space in SQLite b-tree pages. + +This script holds the following object(s): +- SignatureCarver(Carver) +

+ +TODO: +- [ ] Documentation improvements. +- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators. +- [ ] On some files (ex. talk.sqlite), lots of "empty space" signatures were printed. Fix these use cases. +- [ ] Account for changing schemas (schema cookie, etc.). +- [ ] Investigate if there is a way to handle fragments (fragment "sizes" can be > 3). +- [ ] Better handling of errors thrown while generating carved cells. +- [ ] Handle use cases where the primary key is an integer and negative resulting in negative (9-byte) varints. +- [ ] Investigate if there is any need to account for different database encodings. + ##### SignatureCarver Class + - [ ] Incorporate altered tables within the signature in carving, not just the full signature. + - [ ] Address overflow. + - [ ] Specify which signatures to carve with (if important or schema vs simplified)? + - [ ] Currently matches are done in reverse for better findings. Should this also be done in order? + - [ ] Update the cutoff offset based on the earliest offset found in the carved b-tree cell. + - [ ] Remove the cutoff offset by sending in a truncated data array in to the CarvedBTreeCell? + - [ ] Change the first column serial types from an array to boolean since signature is now sent in. + - [ ] carve_freeblocks: Handle use cases where the first serial type in the record header exists. + - [ ] carve_freeblocks: Check why originally there was an exception if the first serial types > 1. + - [ ] carve_freeblocks: Handle multi-byte varints in the first serial types (warning currently raised). + - [ ] carve_freeblocks: Apply additional use cases to the use of the cutoff offset. + - [ ] carve_freeblocks: Check why search was used if len(signature) == 2 and -1/-2 in signature\[1\]. + - [ ] carve_unallocated_space: Address carving of the cell pointer array for deleted cells. + - [ ] carve_unallocated_space: Handle carving of freeblocks (see documentation in section of code). + - [ ] carve_unallocated_space: Handle varint first serial type (see documentation in section of code). + - [ ] carve_unallocated_space: Support for other cell types than b-tree table leaf cells. + - [ ] carve_unallocated_space: Address parsing of fields such as payload size, row id, etc. + - [ ] carve_unallocated_space: Update partial carving indices (see documentation in section of code). + - [ ] carve_unallocated_space: Have an option for partial/freeblock carving of unallocated space? + - [ ] carve_unallocated_space: Revise the partial carving algorithm. + +
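+
+SignatureCarver works by searching raw page bytes for serial type header patterns derived from table signatures.
+The standalone sketch below shows the general idea with a hypothetical, hand-written pattern and fabricated bytes;
+the real patterns are produced by generate_signature_regex in the carving utilities and are considerably more
+involved.
+
+    import re
+
+    # Hypothetical simplified pattern: serial type 1 (a 1-byte integer) followed by any
+    # single-byte serial type of 13 or greater (short text or blob values).
+    signature_pattern = re.compile(b"\x01[\x0d-\x7f]")
+
+    unallocated_space = b"\x00" * 16 + b"\x03\x04\x01\x19" + b"example data" + b"\x00" * 8
+
+    for match in signature_pattern.finditer(unallocated_space):
+        print("Possible record header at offset: {}".format(match.start()))  # offset 18 in this example
+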
+ +### rollback_journal_carver.py + +This script carves through a journal file with the specified master schema entry and signature and returns the entries. + +This script holds the following object(s): +- RollBackJournalCarver(Carver) +

+ +TODO: +- [ ] Documentation improvements. +- [ ] Investigate possible alternatives to computing or reading the database page size from the journal file. + +
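+
+The sketch below mirrors how main.py invokes the carver once the journal file object, version history, master schema
+entry, and signature have been built elsewhere; the wrapper function is purely illustrative, and the returned carved
+commits can then be written out, for example with the CommitCsvExporter as main.py does.
+
+    from sqlite_dissect.carving.rollback_journal_carver import RollBackJournalCarver
+    from sqlite_dissect.constants import BASE_VERSION_NUMBER
+
+    def carve_table_from_rollback_journal(rollback_journal_file, version_history, master_schema_entry, signature):
+        # The carver needs the journal file object, the base version (the database file itself),
+        # the table's master schema entry, and the signature generated for that table.
+        base_version = version_history.versions[BASE_VERSION_NUMBER]
+        return RollBackJournalCarver.carve(rollback_journal_file, base_version, master_schema_entry, signature)
+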
+ +### signature.py + +This script holds the objects for the signature generation of SQLite table and index b-trees for carving. + +This script holds the following object(s): +- Signature(VersionParser) +- SchemaColumnSignature(object) +- TableColumnSignature(object) +- TableRowSignature(object) +- ColumnSignature(object) +- ColumnFixedLengthSignature(ColumnSignature) +- ColumnVariableLengthSignature(ColumnSignature) +- ColumnReducedVariableLengthSignature(ColumnVariableLengthSignature) +- ColumnNonReducedVariableLengthSignature(ColumnVariableLengthSignature) +

+ +TODO: +- [ ] Documentation improvements. +- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators. +- [ ] Handle exceptions that may be raised from creating and working with objects such as signatures better. +- [ ] Incorporate any column and/or table constraint use cases that may affect the signature. +- [ ] Create superclass for schema, table row, and table column signatures? +- [ ] Create constants for serial type arrays in signatures? +- [ ] Updated signature classes to take in a column signature argument instead of sending in individual fields of it. +- [ ] Right now signatures are only derived from leaf pages. Interior pages should be have signatures as well. +- [ ] Have a way to send in a maximum amount of (unique) records to generate the signature from (reduces time)? +- [ ] Have an extension to the Epilog XSD that can be used for signature exportation. +- [ ] Have a way to merge like signatures from external files. +- [ ] Investigate if it is better to put the altered columns flag in a master schema associated class or leave here? + ##### Signature Class + - [ ] Create a field that has a max number of rows to look at to determine a signature to reduce time? + - [ ] Test and investigation on how to handle virtual tables with signatures. + - [ ] Note on how table interior pages cannot have (serial type header) signatures since no records exist. + - [ ] Change the signature to take in a master schema entry identifier instead of the entry itself? + - [ ] Signatures need to be made for the master schema pages. + - [ ] Check support for index b-tree pages and ensure it is working correctly (warning currently raised). + - [ ] The accounted_for_cell_digests may not work for index pages since there is no row id. + - [ ] There may not be a page type in reference to a virtual table since it is not required to have pages. + - [ ] Support for virtual table modules of master schema entry table type. + - [ ] Support for index b-trees that are internal schema objects with no SQL (warning currently raised). + - [ ] Check to make sure index b-tree internal schema objects can not have column definitions (SQL). + - [ ] How do 0 serial types (NULL) work with signatures (like epilog signatures)? + - [ ] Combines simple (or focused) and schema epilog signatures for a more complete epilog signature? + - [ ] Check 8 and 9 serial type on non-integer storage classes for simplified and focused epilog signatures. + - [ ] Is there a use case for only parsing the schema signature and nothing else? + - [ ] How to handle master schema entries not found in specified versions? + - [ ] Have a b-tree page type (either table or index). + - [ ] Investigate better ways for probability calculations between altered columns and column breakdown. + - [ ] How does defaulting fields work in reference to virtual tables. How does is the signature generated? + ##### SchemaColumnSignature Class + - [ ] Handle NULL serial types in the recommended signatures. + - [ ] Incorporate NOT NULL column constraints (and other uses - primary key?) as not having a 0. + +
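+
+A hedged sketch of signature generation is shown below; it follows the calls made in main.py, with a hypothetical
+database path, and filters only on the master schema row type and table flags (main.py additionally restricts
+itself to ordinary, non-virtual table entries before generating signatures).
+
+    from sqlite_dissect.carving.signature import Signature
+    from sqlite_dissect.constants import MASTER_SCHEMA_ROW_TYPE
+    from sqlite_dissect.interface import create_database
+    from sqlite_dissect.interface import create_version_history
+
+    database = create_database("/path/to/example.sqlite")  # hypothetical path
+    version_history = create_version_history(database)
+
+    # Generate and print a signature for each rowid table in the master schema.
+    for master_schema_entry in database.master_schema.master_schema_entries:
+        if master_schema_entry.row_type is not MASTER_SCHEMA_ROW_TYPE.TABLE:
+            continue
+        if master_schema_entry.without_row_id or master_schema_entry.internal_schema_object:
+            continue
+        signature = Signature(version_history, master_schema_entry)
+        print(signature.stringify("\t", False, False, False))
+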
+ +### utilities.py + +This script holds carving utility functions for reference by the SQLite carving module. + +This script holds the following object(s): +- decode_varint_in_reverse(byte_array, offset) +- calculate_body_content_size(serial_type_header) +- calculate_serial_type_definition_content_length_min_max(simplified_serial_types, allowed_varint_length=5) +- calculate_serial_type_varint_length_min_max(simplified_serial_types) +- generate_regex_for_simplified_serial_type(simplified_serial_type) +- generate_signature_regex(signature, skip_first_serial_type=False) +- get_content_size(serial_type) +

+ +TODO: +- [ ] Documentation improvements. +- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators. +- [ ] Handle use cases where the primary key is an integer and negative resulting in negative (9-byte) varints. +- [ ] decode_varint_in_reverse: Handle the 9 byte varints correctly. +- [ ] decode_varint_in_reverse: Should the InvalidVarIntError be logged as an error? +- [ ] decode_varint_in_reverse: Document on how conclusiveness/truncation can not be certain. +- [ ] generate_regex_for_simplified_serial_type: Fix to account for 9 byte varint serial types. +- [ ] generate_signature_regex: Account for small signatures. +- [ ] generate_signature_regex: Account for regular expressions that skip the first byte of a multi-byte serial type. diff --git a/sqlite_dissect/carving/__init__.py b/sqlite_dissect/carving/__init__.py new file mode 100644 index 0000000..a4a5cb7 --- /dev/null +++ b/sqlite_dissect/carving/__init__.py @@ -0,0 +1,10 @@ + +""" + +__init__.py + +This init script will initialize any needed logic for this package. + +This package will control signature generation and carving of SQLite files. + +""" diff --git a/sqlite_dissect/carving/carved_cell.py b/sqlite_dissect/carving/carved_cell.py new file mode 100644 index 0000000..3cbb263 --- /dev/null +++ b/sqlite_dissect/carving/carved_cell.py @@ -0,0 +1,898 @@ +from struct import unpack +from warnings import warn +from sqlite_dissect.carving.utilities import calculate_body_content_size +from sqlite_dissect.carving.utilities import calculate_serial_type_definition_content_length_min_max +from sqlite_dissect.carving.utilities import decode_varint_in_reverse +from sqlite_dissect.carving.utilities import get_content_size +from sqlite_dissect.constants import BLOB_SIGNATURE_IDENTIFIER +from sqlite_dissect.constants import CELL_LOCATION +from sqlite_dissect.constants import FILE_TYPE +from sqlite_dissect.constants import TEXT_SIGNATURE_IDENTIFIER +from sqlite_dissect.exception import CellCarvingError +from sqlite_dissect.exception import InvalidVarIntError +from sqlite_dissect.file.database.page import BTreeCell +from sqlite_dissect.file.database.payload import Payload +from sqlite_dissect.file.database.payload import RecordColumn +from sqlite_dissect.utilities import decode_varint +from sqlite_dissect.utilities import encode_varint +from sqlite_dissect.utilities import get_md5_hash +from sqlite_dissect.utilities import get_record_content +from sqlite_dissect.utilities import get_serial_type_signature + +""" + +carved_cell.py + +This script holds the objects used for carving cells from the unallocated and freeblock space in SQLite +b-tree pages used in conjunction with other classes in the carving package. These objects subclass their +respective higher level SQLite database object type and add to them while parsing the data in a different way. + +This script holds the following object(s): +CarvedBTreeCell(BTreeCell) +CarvedRecord(Payload) +CarvedRecordColumn(RecordColumn) + +""" + + +class CarvedBTreeCell(BTreeCell): + + """ + + This class will be responsible for carving a b-tree cell to the best it can out of a block of data either from + unallocated data or freeblocks. Since the header to freeblocks can be overwritten meaning at most the first + serial type identifier could be overwritten in the record, a list of first column serial types can be specified. + The header of the record is in the following form: + [ HEADER [ HEADER_BYTE_SIZE SERIAL_TYPE_1 ... 
SERIAL_TYPE_N] ][ BODY [ BODY_CONTENT_1 ... BODY_CONTENT_N ] ] + For table leaf cells which are mainly being focused on here the cell is in the following format. + + Since unallocated space can contain freeblocks, this class will be used for both use cases of carving from + unallocated space and freeblocks. + + If the carved b-tree cell has first column serial types set, a probabilistic flag will be set in both the carved + b-tree cell, record, and record column indicating that not all fields were completely deterministic. + + Table interior, index interior, index leaf pages, and additional use cases still need to be accounted for. + + + """ + + def __init__(self, version, file_offset, source, page_number, location, index, data, + serial_type_definition_start_offset, serial_type_definition_end_offset, cutoff_offset, + number_of_columns, signature, first_column_serial_types=None, freeblock_size=None): + + """ + + + + Note: The md5 hex digest is set to the md5 hash of the data between the start offset and end offset determined + after the carving of the payload. It is important to note that these offsets may not be correct and + therefore the md5 hex digest is a best guess at what it may be. + + :param version: + :param file_offset: + :param source: + :param page_number: + :param location: + :param index: + :param data: + :param serial_type_definition_start_offset: + :param serial_type_definition_end_offset: + :param cutoff_offset: + :param number_of_columns: + :param signature: + :param first_column_serial_types: + :param freeblock_size: + + :return: + + """ + + """ + + Below we initialize the super constructor by sending in the version number of the version sent in to be the + page version number. The location will specify where the cell was carved from, either freeblocks in b-tree + cells or unallocated space in b-tree pages or any other pages. The index will be 0..N for freeblock carvings + or just 0 for unallocated space. The offset for the serial type definition start will be sent in as the offset, + however this will be updated as needed when carving processes are run against the preceding data, if applicable, + to determine payload length, row id, payload header size, and the first serial type in the payload header + depending on the size of the varint between those fields and which fields are needed depending on the cells + being parsed: + 1.) Table Leaf B-Tree Cell: PAYLOAD_LENGTH_VARINT ROW_ID_VARINT PAYLOAD [OVERFLOW_PAGE_NUMBER] + 2.) Table Interior B-Tree Cell: LEFT_CHILD_POINTER INTEGER_KEY_VARINT (the integer key is a row id) (no payload) + 3.) Index Leaf B-Tree Cell: PAYLOAD_LENGTH_VARINT PAYLOAD [OVERFLOW_PAGE_NUMBER] + 4.) Index Interior B-Tree Cell: LEFT_CHILD_POINTER PAYLOAD_LENGTH_VARINT PAYLOAD [OVERFLOW_PAGE_NUMBER] + + Better support needs to be done for supporting other cell types than the table leaf cell which is focused on + here. + + """ + + super(CarvedBTreeCell, self).__init__(version, version.version_number, file_offset, page_number, + index, serial_type_definition_start_offset, source, location) + + """ + + Since versioning is not implemented for rollback journal files we are going to set the version number to -1 + here. This is done since rollback journals store previous data to what is in the SQLite database file as + opposed to the WAL file where the most recent data in the WAL file reflects the most current state. 
+ + """ + + if source is FILE_TYPE.ROLLBACK_JOURNAL: + self.version_number = -1 + self.page_version_number = -1 + + self.payload = CarvedRecord(location, data, serial_type_definition_start_offset, + serial_type_definition_end_offset, cutoff_offset, number_of_columns, signature, + first_column_serial_types, freeblock_size, version.page_size) + + """ + + After calling the above super constructor and setting the payload, we are left with a few more fields that + need to be accounted for in the BTreeCell class. These fields are as follows: + 1.) self.start_offset: This is originally set to the serial_type_definition_start_offset through the super + constructor but needs to be updated based on what is determined after carving the + payload. + 2.) self.end_offset: Updated after carving of the payload. + 3.) self.byte_size: Calculated from the start and end offset after carving of the payload. + 4.) self.md5_hex_digest: This is set to the md5 hash of the data between the start offset and end offset + determined after the carving of the payload. It is important to note that these + offsets may not be correct and therefore the md5 hex digest is a best guess at what + it may be. + + """ + + self.start_offset = self.payload.cell_start_offset + self.end_offset = self.payload.cell_end_offset + + self.byte_size = self.end_offset - self.start_offset + self.md5_hex_digest = get_md5_hash(data[self.start_offset:self.end_offset]) + + """ + + Additionally to the fields in the BTreeCell class, we add truncated fields to signify if the record was + truncated at either the beginning or ending. + + """ + + self.truncated_beginning = self.payload.truncated_beginning + self.truncated_ending = self.payload.truncated_ending + + self.row_id = "Unknown" + + def stringify(self, padding=""): + string = "\n"\ + + padding + "Truncated Beginning: {}\n" \ + + padding + "Truncated Ending: {}" + string = string.format(self.truncated_beginning, + self.truncated_ending) + return super(CarvedBTreeCell, self).stringify(padding) + string + + +class CarvedRecord(Payload): + + def __init__(self, location, data, serial_type_definition_start_offset, serial_type_definition_end_offset, + cutoff_offset, number_of_columns, signature, first_column_serial_types=None, + freeblock_size=None, page_size=None): + + super(CarvedRecord, self).__init__() + + """ + + Note: The overflow fields below will stay their default values of False and None initialized in the super + class: + + self.has_overflow = False + self.bytes_on_first_page = None + self.overflow_byte_size = None + + There is a TODO in reference to figuring out the best way to handle overflow. Keep in mind that a lot + of times the end portion of a cell may be overwritten, especially in a freeblock, since SQLite adds cells + from the ending of the unallocated or freeblock content which would in turn overwrite the four byte + overflow page number. However, it is possible to calculate if the entry had overflow if the payload + size is correctly determined. 
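+
+        As a sketch of one way such a check could eventually be approached (this is an assumption for
+        illustration and is not implemented here), a table b-tree leaf payload is only spilled to overflow
+        pages once it exceeds the maximum amount that may be stored locally, so a correctly recovered payload
+        size could be compared against that threshold:
+
+            # the usable page size is the page size minus the reserved space per page from the database header
+            maximum_local_payload = usable_page_size - 35
+            probably_had_overflow = recovered_payload_byte_size > maximum_local_payload
+
+        The threshold is calculated differently for index b-tree pages, so the comparison above would only
+        apply to table leaf cells.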
+ + """ + + self.start_offset = None + self.byte_size = None + self.end_offset = None + + self.header_byte_size = None + self.header_byte_size_varint_length = None + self.header_start_offset = None + self.header_end_offset = None + self.body_start_offset = None + self.body_end_offset = None + + self.md5_hex_digest = None + + self.location = location + self.serial_type_definition_start_offset = serial_type_definition_start_offset + self.serial_type_definition_end_offset = serial_type_definition_end_offset + self.number_of_columns = number_of_columns + self.first_column_serial_types = first_column_serial_types + self.freeblock_size = freeblock_size + self.serial_type_definition_size = \ + self.serial_type_definition_end_offset - self.serial_type_definition_start_offset + + self.cutoff_offset = cutoff_offset + self.truncated_beginning = False + self.truncated_ending = False + + record_column_md5_hash_strings = [""] * self.number_of_columns + + column_index = 0 + body_byte_size = 0 + + serial_type_definition_content_size = calculate_body_content_size( + data[self.serial_type_definition_start_offset:self.serial_type_definition_end_offset]) + + if self.serial_type_definition_start_offset == 0: + + if self.location == CELL_LOCATION.UNALLOCATED_SPACE: + warn("unsupported", RuntimeWarning) + + """ + + We do not know what the header amount could have been here. We could check in reference to the + header + byte array ( == 10 for table leaf cell) but we do not seem to gain a lot from this. + + We could also use probability on row and columns to figure out what the first column type is here + (using a row signatures) or apply probability on the record and record column. + + """ + + elif self.location == CELL_LOCATION.FREEBLOCK: + + # All 4 fields are 1 byte + header_byte_size_varint_length = 1 + header_byte_size = header_byte_size_varint_length + self.serial_type_definition_size + 1 + payload_byte_size = self.freeblock_size - 2 + body_content_size = payload_byte_size - header_byte_size + + first_serial_type_varint_length = 1 + first_serial_type_content_size = body_content_size - serial_type_definition_content_size + + if first_serial_type_content_size > int('1111111', 2): + warn("first serial type too big", RuntimeWarning) + + matching_serial_types = [] + for serial_type in self.first_column_serial_types: + if get_content_size(serial_type) == first_serial_type_content_size or serial_type in \ + [BLOB_SIGNATURE_IDENTIFIER, TEXT_SIGNATURE_IDENTIFIER]: + matching_serial_types.append(serial_type) + + if len(matching_serial_types) > 1: + warn("multiple matching, need to use probability") + + elif len(matching_serial_types) == 1: + + first_serial_type = matching_serial_types[0] + + self.serial_type_signature += str(get_serial_type_signature(first_serial_type)) + + record_column_md5_hash_strings[column_index] = "" + + self.serial_type_definition_size += first_serial_type_varint_length + + first_carved_record_column = CarvedRecordColumn(column_index, first_serial_type, + first_serial_type_varint_length, + first_serial_type_content_size) + first_carved_record_column.truncated_first_serial_type = True + self.truncated_beginning = True + self.record_columns.append(first_carved_record_column) + column_index += 1 + body_byte_size += first_serial_type_content_size + + else: + warn("could not find matching serial types", RuntimeWarning) + + else: + raise CellCarvingError() + + elif self.serial_type_definition_start_offset == 1: + + if self.location == CELL_LOCATION.UNALLOCATED_SPACE: + warn("unsupported", 
RuntimeWarning) + + """ + + A way to address this may be checking if the signature does not have a -1 or -2 (blob or string), then + check the single byte to get the serial type and then check this against the signatures. If it does + not, then use the probability but we will not know hte length of the type unless the cutoff is + correctly implemented. Freeblocks do not count since the size may not match (since they need two bytes) + but you may be able to check on one byte. + + """ + + elif self.location == CELL_LOCATION.FREEBLOCK: + + """ + + The row id was 2 varint length in bytes 128 <= x <= 16383 or payload >= 2 varint bytes (or both) + or header size. Use cases for this need to be investigated further. + + """ + + first_serial_type, first_serial_type_varint_length = \ + decode_varint(data, self.serial_type_definition_start_offset - 1) + + if first_serial_type_varint_length != 1: + raise CellCarvingError() + + if get_serial_type_signature(first_serial_type) in self.first_column_serial_types: + + self.serial_type_definition_size += first_serial_type_varint_length + + first_serial_type_content_size = get_content_size(first_serial_type) + + header_byte_size_varint_length = 1 + + if self.serial_type_definition_size >= int('1111111' * 1, 2): + header_byte_size_varint_length += 1 + elif self.serial_type_definition_size >= int('1111111' * 2, 2): + header_byte_size_varint_length += 2 + + header_byte_size = self.serial_type_definition_size + header_byte_size_varint_length + + body_content_size = serial_type_definition_content_size + first_serial_type_content_size + + payload_byte_size = header_byte_size + body_content_size + + self.serial_type_signature += str(get_serial_type_signature(first_serial_type)) + + record_column_md5_hash_strings[column_index] = data[self.serial_type_definition_start_offset - 1: + self.serial_type_definition_start_offset] + + first_carved_record_column = CarvedRecordColumn(column_index, first_serial_type, + first_serial_type_varint_length, + first_serial_type_content_size) + + self.record_columns.append(first_carved_record_column) + column_index += 1 + body_byte_size += first_serial_type_content_size + + else: + warn("unable to find serial type with 1 preceding", RuntimeWarning) + + else: + raise CellCarvingError() + + elif self.serial_type_definition_start_offset >= 2: + + if self.location == CELL_LOCATION.UNALLOCATED_SPACE: + warn("unsupported unallocated space with serial type definition start offset >= 2", RuntimeWarning) + + elif self.location == CELL_LOCATION.FREEBLOCK: + + """ + + There are three use cases that can occur here: + 1.) Everything was overwritten up to this point and there is nothing more to carve + 2.) Freeblock cutting off beginning with size up to the first serial type + 3.) 
Freeblock cutting off beginning but not the first serial type and the header size/row id may still + be in tact somewhat (payload must be overwritten partially in best case) + + """ + + # First check first byte against serial types but also parse freeblock size and check which is best + freeblock_size = unpack(b">H", data[self.serial_type_definition_start_offset - 2: + self.serial_type_definition_start_offset])[0] + freeblock_first_serial_type_min, freeblock_first_serial_type_max = \ + calculate_serial_type_definition_content_length_min_max(None, 1) + + header_byte_size_varint_length = 1 + header_byte_size = header_byte_size_varint_length + self.serial_type_definition_size + 1 + + body_content_size_min = serial_type_definition_content_size + freeblock_first_serial_type_min + body_content_size_max = serial_type_definition_content_size + freeblock_first_serial_type_max + + payload_size_min = header_byte_size + body_content_size_min + payload_size_max = header_byte_size + body_content_size_max + + freeblock_size_valid = False + if freeblock_size >= payload_size_min and freeblock_size <= payload_size_max: + freeblock_size_valid = True + + next_free_block_offset = None + if freeblock_size_valid and self.serial_type_definition_start_offset >= 4: + next_free_block_offset = unpack(b">H", data[self.serial_type_definition_start_offset - 4: + self.serial_type_definition_start_offset - 2])[0] + if next_free_block_offset >= page_size: + freeblock_size_valid = False + + """ + + Check first serial types not in freeblock size first byte. + + """ + + # Check freeblock size valid over first serial type + if freeblock_size_valid: + + # All 4 fields are 1 byte + header_byte_size_varint_length = 1 + header_byte_size = header_byte_size_varint_length + self.serial_type_definition_size + 1 + payload_byte_size = freeblock_size - 2 + body_content_size = payload_byte_size - header_byte_size + + first_serial_type_varint_length = 1 + first_serial_type_content_size = body_content_size - serial_type_definition_content_size + + if first_serial_type_content_size > int('1111111', 2): + warn("first serial type too big", RuntimeWarning) + + matching_serial_types = [] + for serial_type in self.first_column_serial_types: + if get_content_size(serial_type) == first_serial_type_content_size or serial_type in \ + [BLOB_SIGNATURE_IDENTIFIER, TEXT_SIGNATURE_IDENTIFIER]: + matching_serial_types.append(serial_type) + + if len(matching_serial_types) > 1: + warn("multiple matching, need to use probability") + + elif len(matching_serial_types) == 1: + + first_serial_type = matching_serial_types[0] + + self.serial_type_signature += str(get_serial_type_signature(first_serial_type)) + + record_column_md5_hash_strings[column_index] = "" + + self.serial_type_definition_size += first_serial_type_varint_length + + first_carved_record_column = CarvedRecordColumn(column_index, first_serial_type, + first_serial_type_varint_length, + first_serial_type_content_size) + first_carved_record_column.truncated_first_serial_type = True + self.truncated_beginning = True + self.record_columns.append(first_carved_record_column) + column_index += 1 + body_byte_size += first_serial_type_content_size + + else: + warn("could not find matching serial types", RuntimeWarning) + + else: + + """ + + There are two main use cases here: + 1.) single byte varint 00-09 + 2.) multi byte varint (if in signature) + + A possible third use case may be a inner freeblock. 
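+
+                    As a reminder of the on disk structure consulted throughout this branch (the names below
+                    are shorthand for illustration only), a freeblock begins with a two byte big endian offset
+                    to the next freeblock followed by a two byte big endian size of the freeblock itself:
+
+                        next_freeblock_offset, freeblock_size = unpack(b">HH", freeblock_header_bytes)
+                        plausible = next_freeblock_offset < page_size and freeblock_size >= 4
+
+                    The code below additionally compares the recovered size against the payload length
+                    calculated from the carved serial types before treating the preceding bytes as a freeblock
+                    header.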
+ + """ + + simplified_variable_length_serial_types = [BLOB_SIGNATURE_IDENTIFIER, TEXT_SIGNATURE_IDENTIFIER] + text_or_blob_serial_type = \ + any(i in first_column_serial_types for i in simplified_variable_length_serial_types) + + if not text_or_blob_serial_type: + + freeblock_size = None + + # Check the previous two bytes if they exist: + if self.serial_type_definition_start_offset >= 3: + freeblock_size = unpack(b">H", data[self.serial_type_definition_start_offset - 3: + self.serial_type_definition_start_offset - 1])[0] + + """ + + The row id was 2 varint length in bytes 128 <= x <= 16383 or payload >= 2 varint bytes (or both) + or header size. Use cases for this need to be investigated further. + + """ + + first_serial_type, first_serial_type_varint_length = \ + decode_varint(data, self.serial_type_definition_start_offset - 1) + + if first_serial_type_varint_length != 1: + + """ + + Note: Issues can occur here where the pattern matches something not in a serial type + header that is a serial type. For instance: 000000900302 will match a simple signature + (freeblock) of [[02], [03]] which will result in [03] will match the 03 and detect 90 + as the first serial type where it could be the size of the freeblock in the form of 0090. + + """ + + raise CellCarvingError("Invalid first serial type varint size determined. " + "Unable to carve due to probable false positive.") + + if get_serial_type_signature(first_serial_type) in self.first_column_serial_types: + + self.serial_type_definition_size += first_serial_type_varint_length + + first_serial_type_content_size = get_content_size(first_serial_type) + + header_byte_size_varint_length = 1 + + if self.serial_type_definition_size >= int('1111111' * 1, 2): + header_byte_size_varint_length += 1 + elif self.serial_type_definition_size >= int('1111111' * 2, 2): + header_byte_size_varint_length += 2 + + header_byte_size = self.serial_type_definition_size + header_byte_size_varint_length + + body_content_size = serial_type_definition_content_size + first_serial_type_content_size + + payload_byte_size = header_byte_size + body_content_size + + # Add one since row id, payload, or serial type header (not) >= 1 varint + calculated_freeblock_size = payload_byte_size + 2 + 1 + freeblock_size_valid = False + if freeblock_size == calculated_freeblock_size: + freeblock_size_valid = True + + next_free_block_offset = None + if freeblock_size_valid and self.serial_type_definition_start_offset >= 5: + next_free_block_offset = unpack(b">H", + data[self.serial_type_definition_start_offset - 5: + self.serial_type_definition_start_offset - 3])[0] + if next_free_block_offset >= page_size: + freeblock_size_valid = False + + self.serial_type_signature += str(get_serial_type_signature(first_serial_type)) + + record_column_md5_hash_strings[column_index] = \ + data[self.serial_type_definition_start_offset - 1: + self.serial_type_definition_start_offset] + + first_carved_record_column = CarvedRecordColumn(column_index, first_serial_type, + first_serial_type_varint_length, + first_serial_type_content_size) + self.record_columns.append(first_carved_record_column) + + column_index += 1 + body_byte_size += first_serial_type_content_size + + else: + warn("unable to find serial type with 1 preceding", RuntimeWarning) + + else: + + first_serial_type = None + first_serial_type_varint_length = None + try: + + first_serial_type, first_serial_type_varint_length = \ + decode_varint_in_reverse(data, self.serial_type_definition_start_offset, 5) + + except InvalidVarIntError: + pass + + 
if self.first_column_serial_types and not len(self.record_columns): + + first_serial_type = first_column_serial_types[0] + if signature.total_records == 0: + # Set as null for now + first_serial_type = 0 + if len(first_column_serial_types) != 1: + simplified_probabilistic_signature = signature.simplified_probabilistic_signature + if simplified_probabilistic_signature: + # Found probability otherwise it is a schema without probability + first_probabilistic_column_serial_types = simplified_probabilistic_signature[0] + first_serial_type = max(first_probabilistic_column_serial_types, + key=lambda first_probabilistic_column_serial_type: + first_probabilistic_column_serial_type[1])[0] + first_serial_type_varint_length = 1 + self.serial_type_signature += str(get_serial_type_signature(first_serial_type)) + self.serial_type_definition_size += first_serial_type_varint_length + if first_serial_type == TEXT_SIGNATURE_IDENTIFIER: + first_serial_type = 12 + if first_serial_type == BLOB_SIGNATURE_IDENTIFIER: + first_serial_type = 13 + first_serial_type_content_size = get_content_size(first_serial_type) + first_carved_record_column = CarvedRecordColumn(column_index, first_serial_type, + first_serial_type_varint_length, + first_serial_type_content_size) + first_carved_record_column.probabilistic_first_serial_type = True + first_carved_record_column.truncated_first_serial_type = True + self.truncated_beginning = True + self.record_columns.append(first_carved_record_column) + column_index += 1 + body_byte_size += first_serial_type_content_size + + """ + + We iterate through the header and generate all of the carved record columns off of the header. We know we have + at least enough information in the header to be able to determine the types and size of the body regardless of + if we have the body or not. This is due to the expression being sent in determined from regular expressions + which match the header, with the possible exception of the first serial type which if existing, has already + been handled above. + + """ + + current_header_offset = self.serial_type_definition_start_offset + while current_header_offset < self.serial_type_definition_end_offset: + + serial_type, serial_type_varint_length = decode_varint(data, current_header_offset) + + serial_type_varint_end_offset = current_header_offset + serial_type_varint_length + + if serial_type_varint_end_offset > self.serial_type_definition_end_offset: + raise CellCarvingError() + + self.serial_type_signature += str(get_serial_type_signature(serial_type)) + + record_column_md5_hash_strings[column_index] = data[current_header_offset:serial_type_varint_end_offset] + + content_size = get_content_size(serial_type) + + carved_record_column = CarvedRecordColumn(column_index, serial_type, serial_type_varint_length, + content_size) + self.record_columns.append(carved_record_column) + + current_header_offset += serial_type_varint_length + body_byte_size += content_size + column_index += 1 + + if len(self.record_columns) != number_of_columns: + raise CellCarvingError() + + self.body_start_offset = self.serial_type_definition_end_offset + self.body_end_offset = self.serial_type_definition_end_offset + body_byte_size + + if self.body_end_offset > len(data): + self.truncated_ending = True + + """ + + Note: This does not currently work for multiple options in the first or variable length serial types. 
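+
+        For reference while reading the loop below, the per column content sizes consumed from the body follow
+        the standard SQLite serial type rules, which is what get_content_size is expected to implement (listed
+        here only as an informal reminder):
+
+            serial types 0, 8, 9       -> 0 bytes (NULL, the integer 0, the integer 1)
+            serial types 1, 2, 3, 4    -> 1, 2, 3 and 4 byte big endian integers
+            serial types 5, 6          -> 6 and 8 byte big endian integers
+            serial type 7              -> 8 byte IEEE 754 float
+            serial type N >= 12, even  -> BLOB of (N - 12) / 2 bytes
+            serial type N >= 13, odd   -> TEXT of (N - 13) / 2 bytes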
+ + """ + + # First truncated column field + current_body_offset = self.body_start_offset + for carved_record_column in self.record_columns: + + if (current_body_offset + carved_record_column.content_size) > len(data): + carved_record_column.truncated_value = True + if current_body_offset < len(data): + carved_record_column.value = data[current_body_offset:] + record_column_md5_hash_strings[carved_record_column.index] += data[current_body_offset:] + carved_record_column.md5_hex_digest = \ + get_md5_hash(record_column_md5_hash_strings[carved_record_column.index]) + + else: + + """ + + This means that: offset + content_size <= len(data) + + """ + + value_data = data[current_body_offset:current_body_offset + carved_record_column.content_size] + content_size, value = get_record_content(carved_record_column.serial_type, value_data) + + if content_size != carved_record_column.content_size: + raise CellCarvingError() + carved_record_column.value = value + record_column_md5_hash_strings[carved_record_column.index] += value_data + carved_record_column.md5_hex_digest = \ + get_md5_hash(record_column_md5_hash_strings[carved_record_column.index]) + + current_body_offset += carved_record_column.content_size + + if self.body_end_offset != current_body_offset: + raise CellCarvingError() + + # This assumes the length of the header is 1 byte (most cases it will or would mean # of rows > 127 for table). + self.header_byte_size = self.serial_type_definition_size + 1 + + self.header_byte_size_varint = encode_varint(self.header_byte_size) + self.header_byte_size_varint_length = len(self.header_byte_size_varint) + + self.payload_byte_size = self.header_byte_size + body_byte_size + + self.payload_byte_size_varint = encode_varint(self.payload_byte_size) + self.payload_byte_size_varint_length = len(self.payload_byte_size_varint) + + # Below is relative to the unallocated space. The "-1" is to account for the row id. + self.cell_start_offset = self.serial_type_definition_start_offset - self.record_columns[0].\ + serial_type_varint_length - self.header_byte_size_varint_length - 1 - self.payload_byte_size_varint_length + self.cell_end_offset = self.body_end_offset + + +class CarvedRecordColumn(RecordColumn): + + def __init__(self, index, serial_type, serial_type_varint_length, content_size): + + """ + + Constructor. + + This method constructs the carved record column by calling it's super constructor and then setting a few + additional fields for itself in reference to carving traits. + + If this carved record column was truncated (ie. the rest of the record was overwritten at some point), then + the truncated value flag will be set to True. If this is the case, the value may or may not be set depending + if this column was the actually column that got cut off. Past the first column that gets truncated, all + following carved record columns will not have the value set. + + Keep in mind that the column value may be "None" if it was a NULL value in the database. However, this will + only be truly NULL if the field is not truncated. If the field is truncated, then if it has a value of "None" + it is due to the fact that it was unable to be obtained. + + The md5 hex digest will be the md5 of the found portions of the record column whether that just be the serial + type header, serial type header and value, or serial type header and truncated value. + + It is also important to keep in mind that parts of the record could be overwritten without being detected + resulting in some weird values. 
+ + Note: For reference the RecordColumn super class has the following attributes: + 1.) index + 2.) serial_type + 3.) serial_type_varint_length + 4.) content_size + 5.) value + 6.) md5_hex_digest + + :param index: + :param serial_type: + :param serial_type_varint_length: + :param content_size: + + :return: + + """ + + """ + + Call to the constructor of the super record column class but specify "None" for the value and + md5 hex digest since they aren't known at this time. + + """ + + super(CarvedRecordColumn, self).__init__(index, serial_type, serial_type_varint_length, content_size, + None, None) + + self.simplified_serial_type = self.serial_type + if self.serial_type >= 12 and self.serial_type % 2 == 0: + self.simplified_serial_type = -1 + elif self.serial_type >= 13 and self.serial_type % 2 == 1: + self.simplified_serial_type = -2 + + """ + + Note: The below values are set to defaults and expected to be updated by the calling class if intended for use. + + """ + + self.value = None + self.md5_hex_digest = None + + self.truncated_first_serial_type = False + self.truncated_value = False + self.probabilistic = False + + def stringify(self, padding=""): + string = "\n" \ + + padding + "Simplified Serial Type: {}\n" \ + + padding + "Truncated First Serial Type: {}\n" \ + + padding + "Truncated Value: {}\n" \ + + padding + "Probabilistic: {}" + string = string.format(self.simplified_serial_type, + self.truncated_first_serial_type, + self.truncated_value, + self.probabilistic) + return super(CarvedRecordColumn, self).stringify(padding) + string + + """ + + If we have a the first column serial types set, then the full serial type definition (referring to the + payload header excepting the header size) was not determined previously. However, since freeblocks + overwrite the first four bytes, assuming there is a payload size, row id, and serial type header size + followed by the serial types (ie. a b-tree table leaf cell), at most the first serial type can be + overwritten, or the first varint byte of a varint serial type if it is more than 1 byte in length. + Again, this only accounts for b-tree table leaf cells and there is a TODO in reference to supporting + other cell types. + + There are two use cases to address for the first column serial types: + 1.) Preceding bytes detected. + 2.) No Preceding bytes detected (or invalid varint from #1). + + 1.) Preceding bytes detected: + + If there are bytes preceding the serial type definition start offset in the data, then we may be able + to parse backwards in order to determine the first serial type and payload header size assuming the best + case scenario and a b-tree table leaf, index interior, or index leaf cell since b-tree table interiors do + not have payloads associated with their cells. We also have to assume that the preceding bytes were not + overwritten in some manner. + + The way we will check the first column serial type will be to see what serial types are possible for it. + Remember that the first column serial types is an array of the different serial types that can exist and + will be a subset of: [-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9] where -2 and -1 are varint serial types + representing the TEXT and BLOB storage classes respectfully. + + If a varint serial type exists that can be more than 1 byte (TEXT or BLOB), we will call the + decode_varint_in_reverse function in order to retrieve it. However, before we do this we will AND the byte + with 0x80 to see if the most significant bit is set (ie. 
msb_set = varint_serial_type_byte & 0x80). + If the most significant bit is set, then it is likely this is not the least significant byte of a serial + type since the least significant byte should never have the most significant bit set. Since all serial + types that are not multi-byte serial types will be within the range of 0x00 to 0x09, this will tell us a + few things. The first thing is if the first serial type is a single byte varint serial type + meaning it could be any of the serial types including TEXT and BLOB with the size of 57 or less with + regards to the first serial type least significant byte: + + 1.) TEXT: Min single byte size: 0x0D = (13 - 13)/2 = 0 + Max single byte size: 0x7F = (127 - 13)/2 = 57 + + Note: However, there may be additional, preceding bytes signifying a larger size. + + Note: The TEXT is "odd" and can be determined by checking if the byte > 13 and + if the byte % 2 == 1. Similarly, if it the byte > 13 then if byte & 0x01, + it is also TEXT. + + 2.) BLOB: Min single byte size: 0x0C = (12 - 12)/2 = 0 + Max single byte size: 0x7E = (126 - 12)/2 = 57 + + Note: However, there may be additional, preceding bytes signifying a larger size. + + Note: The BLOB is "even" and can be determined by checking if the byte > 12 and + if the byte % 2 == 0. Similarly, if it the byte > 12 then if NOT byte & 0x01, + it is also BLOB. + + 3.) All other serial types are single byte varints where 0x00 <= serial_type <= 0x09. + + Note: The bytes 0x0A and 0x0B are not used and are currently reserved for expansion. This in combination + of the above use cases and those where the most significant bit is set cover all use cases for + relating the preceding byte (the least significant byte of the possible multi-byte varint) to their + respective serial type. However, we still may not have the correct length of the serial types in + respect to the variable length multi-byte varints for TEXT and BLOB with a size greater than 57. + This will be determined by looking at preceding bytes, if existing, and accuracy will be depending + on how many bytes preceding this byte remain and if it has not been overwritten in any way. + + If either the 0x0A, 0x0B or msb_set (varint_serial_type_byte & 0x80), then we do not have a serial type + and we resort to the same use cases as #2 below since we have determined an invalid varint. + + If we do have a serial type where the byte is between 0x0C and 0x7F, then we have to look at the preceding + bytes, if existing to hopefully determine if it is a portion of a larger varint determining a larger size + for that data type. In order to get the correct size of the serial type we call the + decode_varint_in_reverse function to parse backwards until we either hit the 9 byte maximum for varints or + find a most significant byte where the most significant bit is not set. However, there is a chance we will + run out of data in the array going backwards. In order to facilitate this, the decode_varint_in_reverse + returns three fields in the form of a tuple: + (unsigned_integer_value, varint_relative_offset, truncated) + Keep in mind that even if it was not truncated and found all bytes for the varint, the varint still may be + incorrect due to use cases where it was overwritten with bytes that may be mistaken for valid varint bytes. + + If the variable length serial type turns out to be truncated, then we set that flag in the carved record + since we can not be certain if it is either partially carved or completely erroneous. 
We leave this in + order to be addressed as needed when parsing the first serial type data content from the body. + + However, the function can also throw an InvalidVarIntError in which case the varint will be assumed to be + overwritten in some way and we will default to the process explained further below where we do not have + preceding bytes. This is also true if we find a invalid serial type on the first preceding byte. + + Note: There is a chance of false positives being returned by this function and validation checks need to + be investigated in order to make this value more deterministic. A TODO has been placed at the top + of this script in reference to this issue. + + Note: Also, 9 byte varints are not currently handled. There are TODOs in references to 9 byte varint + parsing in both this script and their respective parsing function scripts. + + 2.) No Preceding bytes detected (or invalid varint from #1). + + If there are no bytes preceding the serial type definition start offset, we will assume the field is the + one with the most probability. + + """ + + """ + + 1.) Preceding bytes detected: + + In order to check if we have preceding bytes and then parse backwards through them we first check if + the serial type definition start offset is greater than one. If this is true, we know we have at least + one preceding byte that we can check to see the serial type of. + + Keep in mind that although this will give us a serial type, it may be a byte overwritten by something else + and is not completely deterministic. + + """ diff --git a/sqlite_dissect/carving/carver.py b/sqlite_dissect/carving/carver.py new file mode 100644 index 0000000..14ef80b --- /dev/null +++ b/sqlite_dissect/carving/carver.py @@ -0,0 +1,593 @@ +from logging import getLogger +from re import compile +from warnings import warn +from sqlite_dissect.carving.carved_cell import CarvedBTreeCell +from sqlite_dissect.carving.utilities import generate_signature_regex +from sqlite_dissect.constants import BLOB_SIGNATURE_IDENTIFIER +from sqlite_dissect.constants import CELL_LOCATION +from sqlite_dissect.constants import LOGGER_NAME +from sqlite_dissect.constants import TEXT_SIGNATURE_IDENTIFIER +from sqlite_dissect.exception import CarvingError +from sqlite_dissect.exception import CellCarvingError + +""" + +carver.py + +This script holds carver objects for identifying and parsing out cells from unallocated and +freeblock space in SQLite b-tree pages. + +This script holds the following object(s): +SignatureCarver(Carver) + +""" + + +class SignatureCarver(object): + + @staticmethod + def carve_freeblocks(version, source, freeblocks, signature): + + """ + + This function will carve the freeblocks list with the signature specified. + + Note: The signature that will be used from the signature object will be the simplified signature unless + one does not exist (in the case where one was generated with no row entries), in which case the + simplified schema signature will be used. + + Note: The serial type definition nomenclature does not include the serial type header size field in reference + to the offsets and may also not include the first (or first byte of a multi-byte varint) serial type and + therefor dubbed "definition" instead of header signifying only a portion of the header. 
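+
+        A call might look roughly like the following, where the FILE_TYPE member and the attribute used to
+        reach the page freeblocks are assumptions made purely for illustration:
+
+            carved_cells = SignatureCarver.carve_freeblocks(version, FILE_TYPE.DATABASE,
+                                                            b_tree_page.freeblocks, table_signature)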
+ + :param version: + :param source: + :param freeblocks: + :param signature: + + :return: + + """ + + logger = getLogger(LOGGER_NAME) + + number_of_columns = signature.number_of_columns + + simplified_signature = signature.simplified_signature + + if not simplified_signature: + simplified_signature = signature.recommended_schema_signature + logger.debug("Using recommended schema signature: {}.".format(simplified_signature)) + else: + logger.debug("Using simplified signature: {}.".format(simplified_signature)) + + if not simplified_signature: + log_message = "No signature was found." + logger.error(log_message) + raise CarvingError(log_message) + + """ + + Since we are carving freeblocks here, we will remove the first column serial type. This is due to the fact + that the freeblock header overwrites the first four bytes of the cell which usually overwrites the first + serial type in the header of the record since that is the fourth byte (assuming payload, row id, and header + length (where applicable) are all less than 1 varint). + + """ + + first_column_serial_types = simplified_signature[0] + + if BLOB_SIGNATURE_IDENTIFIER in first_column_serial_types or TEXT_SIGNATURE_IDENTIFIER in \ + first_column_serial_types: + log_message = "A variable length serial type was found in the first column serial types: {} while" \ + "carving freeblocks with signatures: {}. Signatures starting with variable length serial " \ + "types are not fully implemented and may result in carving false positives." + log_message = log_message.format(first_column_serial_types, simplified_signature) + logger.warn(log_message) + warn(log_message, RuntimeWarning) + + # Retrieve and compile the serial type definition signature pattern + serial_type_definition_signature_pattern = compile(generate_signature_regex(simplified_signature, True)) + + # Initialize the carved cells + carved_cells = [] + + # Iterate through the freeblocks + for freeblock in freeblocks: + + # Get the content for the current freeblock + freeblock_content = freeblock.content + + # Initialize the list for the serial type definition match objects + serial_type_definition_match_objects = [] + + # Find all matches for the serial type definition signature pattern + for serial_type_definition_match in serial_type_definition_signature_pattern.finditer(freeblock_content): + serial_type_definition_match_objects.append(serial_type_definition_match) + + """ + + In order to carve the freeblocks we have to start from the ending match and move backwards through the + matches in the freeblock. This is due to the fact that when a freeblock is made, it can be reallocated, + and then have the entry deleted again in it expanding it back to the original size it previously was. When + a freeblock is reallocated it counts the space it needs from the end of the freeblock rather than from + the beginning. This means that the ending portion (usually the data) of the previous freeblock that was + in the spot will be overwritten. Therefore, there is a good chance we should be able to parse out the last + match successfully, but will end up have truncated carvings "beneath" the last one. + + As an example freeblocks are overwritten in the following pattern: + [Third Freeblock Entry .............] + [Second Freeblock Entry ................] + [First Freeblock Entry .........................] + + This can also be in the following pattern though: + [Allocated Cell Entry ..............] + [Second Freeblock Entry ................] + [First Freeblock Entry .........................] 
+ + In the above example we have the possibility of losing all of the data and being unable to parse anything + but the header of the previous freeblocks. + + """ + + """ + + The cutoff offset will be initialized to the length of the freeblock content and then be updated for + "beneath" freeblock entries to be the starting offset of the previous entry. There is some variation on + if this is the actual cutoff or not but will always be after the actual cutoff when done this way. + It is just important to keep in mind that the previous freeblocks may actually be cutoff before this offset + and the "above" freeblocks may go back that length for things like payload size, row id, serial type header + length and the first serial type depending on the use case. + + """ + + cutoff_offset = len(freeblock_content) + + page_offset = version.get_page_offset(freeblock.page_number) + + # Iterate through the serial type definition matches in reverse + for serial_type_definition_match in reversed(serial_type_definition_match_objects): + + """ + + For the serial type definition match objects returned from the iterator above, the match object has a + start and a end function to get the beginning offset and ending offset. This is done by calling + start(0) or end (0) with 0 being the group number. The ending offset is exclusive + ie. [start(0):end(0)). + + """ + + serial_type_definition_start_offset = serial_type_definition_match.start(0) + serial_type_definition_end_offset = serial_type_definition_match.end(0) + file_offset = page_offset + freeblock.start_offset + serial_type_definition_start_offset + + try: + + # Create and append the carved b-tree cell to the carved cells list + carved_cells.append(CarvedBTreeCell(version, file_offset, source, freeblock.page_number, + CELL_LOCATION.FREEBLOCK, + freeblock.index, freeblock_content, + serial_type_definition_start_offset, + serial_type_definition_end_offset, cutoff_offset, + number_of_columns, signature, + first_column_serial_types, freeblock.byte_size)) + + # Update the cutoff offset + cutoff_offset = serial_type_definition_start_offset + + except (CellCarvingError, ValueError): + log_message = "Carved b-tree cell creation failed at file offset: {} page number: {} " \ + "cell source: {} in location: {} with partial serial type definition " \ + "start offset: {} and partial serial type definition end offset: {} with " \ + "cutoff offset of: {} number of columns: {} for master schema " \ + "entry with name: {} and table name: {}." + log_message = log_message.format(file_offset, freeblock.page_number, source, + CELL_LOCATION.UNALLOCATED_SPACE, + serial_type_definition_start_offset, + serial_type_definition_end_offset, cutoff_offset, + number_of_columns, signature.name, signature.table_name) + logger.warn(log_message) + warn(log_message, RuntimeWarning) + + # Return the cells carved from the freeblocks + return carved_cells + + @staticmethod + def carve_unallocated_space(version, source, page_number, unallocated_space_start_offset, + unallocated_space, signature, page_offset=None): + + """ + + This function will carve the unallocated space with the signature specified. + + Note: The signature that will be used from the signature object will be the simplified signature unless + one does not exist (in the case where one was generated with no row entries), in which case the + simplified schema signature will be used. 
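+
+        For context, the simplified signature is expected to be a per column list of the serial types observed
+        for that column, for example (hypothetical values):
+
+            [[1, 2], [TEXT_SIGNATURE_IDENTIFIER], [0, 6]]
+
+        which would describe a table whose first column was stored as a 1 or 2 byte integer, whose second
+        column was stored as variable length TEXT, and whose third column was stored as either NULL or an
+        8 byte big endian integer.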
+ + Note: The serial type definition nomenclature does not include the serial type header size field in reference + to the offsets and may also not include the first (or first byte of a multi-byte varint) serial type and + therefor dubbed "definition" instead of header signifying only a portion of the header. + + :param version: + :param source: + :param page_number: + :param unallocated_space_start_offset: + :param unallocated_space: + :param signature: + :param page_offset: Page offset if needed to be specified. Currently only used for proof of concept + journal page parsing. + + :return: + + """ + + logger = getLogger(LOGGER_NAME) + + number_of_columns = signature.number_of_columns + + simplified_signature = signature.simplified_signature + + if not simplified_signature: + simplified_signature = signature.recommended_schema_signature + logger.debug("Using recommended schema signature: {}.".format(simplified_signature)) + else: + logger.debug("Using simplified signature: {}.".format(simplified_signature)) + + if not simplified_signature: + log_message = "No signature was found." + logger.error(log_message) + raise CarvingError(log_message) + + # Retrieve and compile the serial type definition signature pattern + serial_type_definition_signature_pattern = compile(generate_signature_regex(simplified_signature)) + + """ + + In reference for supporting freeblocks and additional use cases in unallocated space: + + Currently, unallocated space is carved using a full signature (not removing the first serial type) in order + to detect deleted entries. This can result in the following two use cases in reference to deleted entries + in the unallocated space: + 1.) Cell entries that were deleted or left over from a previous page being reused that ended up in the + unallocated space where the serial type header (excepting possibly the header size) of the payload + is in tact. Due to the way cells are inserted from the back of the page moving forward it is very + likely to have the beginning of the cell as well (but not a certainty). + 2.) Freeblocks that had either a payload, row id, or serial type header size that one or more of which were + either 2 byte or greater varints. This would push the serial type header (excepting possibly the header + size) into the main body of the freeblock. This is due to the fact that the freeblock overwrites the first + 4 bytes of the entry with the next freeblock offset and freeblock size. A freeblock needs at least 4 bytes + to exist, and if not, it is a fragment. Keep in mind this is also assuming a b-tree table leaf page and + may not be the case for b-tree index pages or b-tree table interiors. + + In comparison to the not detected use case below, it is important to note that the first serial type may + also be a varint of length greater than 2 bytes and therefore still detected where The #1 use case below + is true but would incorrectly determine the size of the varint causing issues parsing the body of the cell. + Additional research and handling of this use case is needed. + + The use of a "full" signature will not detect: + 1.) Freeblocks that have a payload, row id, and serial type header size of 1 varint will end up having the first + serial type overwritten (excepting the use case defined in #2 above) which will result in the entries + not being carved unless checking for the signature without the first serial type, like freeblocks are done. 
+ + There are a few ways to do this (very similar to the freeblock carving code above) and needs to + be implemented. + + Discussion: There are a few ways to determine freeblocks. One way is to calculate the size of the serial type + definition plus 1 byte for the header (depending on size) and compare that to the previous byte to + see if it matches. If it does, the full header should be in tact and the body content can be + calculated from the serial type definition. (The body content may still be able to be calculated + from the serial type definition without finding the serial type header length assuming the rest of + the serial types are all existent (the first serial type or portion of first multi-byte varint + serial type is not missing). Once the body content and header content are calculated, moving + backwards the bytes can be checked for the size of the freeblock + 4 adding on one byte for each + byte gone back that does not match the size (this is to account for larger than 1 byte varints for + payload or row id). If this is within the acceptable range of the varint sizes and matches the + size, there is a good chance this is a freeblock. + + Pseudocode: + + serial_type_header_size = + ord(unallocated_space[serial_type_definition_start_offset - 1: + serial_type_definition_start_offset]) + + if serial_type_header_size == + serial_type_definition_end_offset - serial_type_definition_start_offset + 1 + This is the serial type header size (1 is added for the one byte serial type header byte size). + else: + This is not the serial type header size or the first serial type may be a multi-byte varint + use case which would then cause this process to move back one byte and repeat or the serial + type header size may be a multi-byte varint. + + However the third use case below should be predetermined in the above serial_type_header_size + setting statement based on the size between the serial_type_definition_end_offset and + serial_type_definition_start_offset. + + After the above: + + Given additional_serial_type_header_bytes is the amount of extra bytes for the header calculated + above and header_start_offset refers to the location the full header starts at: + calculated_payload_length = additional_serial_type_header_bytes + + serial_type_definition_end_offset - + serial_type_definition_start_offset + body_content_size + 4 + if calculated_payload_length == + unpack(b">H", unallocated_space[header_start_offset - 2:header_start_offset])[0]: + There is a freeblock possibility but may also be a payload to a b-tree index cell. + else: + This may be a table leaf cell where this first number would be the row id, in which we should + reverse parse out the varint and then check the next index for the size (excepting adding in the + size of the row id since the payload size is only the actual payload following the row id). + + A similar process could be used for parsing out cells that are not freeblocks in order to determine + things such as payload size, row id, serial type header length, or missing (or partially missing + portion of a multi-byte varint) first serial type in actual cells. This will be left up to the + CarvedBTreeCell class to do and the above documentation may end up applying more to that class + then here. + + Note: Overflow still needs to be addressed. + + Note: The above use cases have been determined from investigation into how SQLite stores data and may not be + a complete list. 
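+
+        As a small sketch of the reverse carving flow implemented below (the offsets are made up and carve is
+        shorthand for the CarvedBTreeCell construction), later matches are carved first and each carve lowers
+        the cutoff used by the next, older match:
+
+            matches = [(40, 52), (300, 312)]    # (start, end) offsets of signature matches
+            cutoff_offset = len(unallocated_space)
+            for start, end in reversed(matches):
+                carve(start, end, cutoff_offset)
+                cutoff_offset = start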
+ + """ + + # Initialize the list for the serial type definition match objects + serial_type_definition_match_objects = [] + + # Find all matches for the serial type definition signature pattern + for serial_type_definition_match in serial_type_definition_signature_pattern.finditer(unallocated_space): + serial_type_definition_match_objects.append(serial_type_definition_match) + + # Initialize the carved cells + carved_cells = [] + + """ + + Like above, in the freeblock carving code, we find all of the matches for the signature and then work in reverse + through the unallocated space. The idea here is very similar to the freeblock carving (see the documentation + above) since cells are added from the unallocated space at the end of the page moving back towards the front + of the page much like how cells are added back into freeblocks from the end if there is enough space. + + """ + + """ + + The cutoff offset will be initialized to the length of the unallocated space and then be updated for + entries that may have been overwritten previously by the entries at the end of the unallocated space. + There is some variation on if this is the actual cutoff or not but will always be after the actual cutoff + when done this way. It is just important to keep in mind that the previous entries (including possibly + freeblocks) may actually be cutoff before this offset and the entries overwritten on top of previous entries + may go back that length for things like payload size, row id, serial type header length and the first serial + type depending on the use case. + + """ + + cutoff_offset = len(unallocated_space) + + # Retrieve the page offset if it was not set through the constructor (should only be set for + # proof of concept journal file parsing). + if page_offset is None: + page_offset = version.get_page_offset(page_number) + + # Iterate through the serial type definition matches in reverse + for serial_type_definition_match in reversed(serial_type_definition_match_objects): + + """ + + For the serial type definition match objects returned from the iterator above, the match object has a + start and a end function to get the beginning offset and ending offset. This is done by calling + start(0) or end (0) with 0 being the group number. The ending offset is exclusive ie. [start(0):end(0)). + + """ + + serial_type_definition_start_offset = serial_type_definition_match.start(0) + serial_type_definition_end_offset = serial_type_definition_match.end(0) + file_offset = page_offset + unallocated_space_start_offset + serial_type_definition_start_offset + + try: + + # Create and append the carved b-tree cell to the carved cells list + carved_cells.append(CarvedBTreeCell(version, file_offset, source, page_number, + CELL_LOCATION.UNALLOCATED_SPACE, 0, unallocated_space, + serial_type_definition_start_offset, + serial_type_definition_end_offset, cutoff_offset, + number_of_columns, signature)) + + # Update the cutoff offset + cutoff_offset = serial_type_definition_start_offset + + except (CellCarvingError, ValueError): + log_message = "Carved b-tree cell creation failed at file offset: {} page number: {} " \ + "cell source: {} in location: {} with partial serial type definition " \ + "start offset: {} and partial serial type definition end offset: {} with " \ + "cutoff offset of: {} number of columns: {} for master schema " \ + "entry with name: {} and table name: {}." 
+ log_message = log_message.format(file_offset, page_number, source, + CELL_LOCATION.UNALLOCATED_SPACE, + serial_type_definition_start_offset, + serial_type_definition_end_offset, cutoff_offset, + number_of_columns, signature.name, signature.table_name) + logger.warn(log_message) + warn(log_message, RuntimeWarning) + + """ + + At this point we have carved all the "full signatures" in reference to the full serial type definition in + the cell headers that we found. However, although the above may be freeblocks in the unallocated space (in + the use case where the combination of the payload, row id, and/or payload header varint equate out to 4 or + more bytes), the use case still remains where all 3 are 1 byte as well as the first serial type. In this case + we would only have the 2nd through Nth serial types like the above code in carve freeblocks. Therefore, we + recompute the signature removing the first serial type, recheck for patterns and if they do not match the + patterns above, add them as well. + + + Note: If this matches, it does not mean this is necessarily a freeblock since it could have just have been a + cell removed and then overwritten partially by another cell. Use cases like these should be addressed + in the carved cell classes. + + """ + + # Reset the signature pattern removing the first serial type and compile + serial_type_definition_signature_pattern = compile(generate_signature_regex(simplified_signature, True)) + + # Initialize the list for the partial serial type definition match objects + partial_serial_type_definition_match_objects = [] + + # Find all matches for the partial serial type definition signature pattern + for serial_type_definition_match in serial_type_definition_signature_pattern.finditer(unallocated_space): + partial_serial_type_definition_match_objects.append(serial_type_definition_match) + + """ + + The partial serial type definition match objects should now be a superset of the serial type definition match + objects above. We now go through these match objects and remove any of the data segments found above by + comparing the indices. + + Note: This is done after instead of before the full serial type signature matching since it is more conclusive + to carve the whole cells rather than the ones without the full serial type header. + + Note: The indices should be updated with the correct cutoff offset and beginning offset where found in the + carved cells from the match objects. Currently, these indices only reflect the serial type definition + header. This will further improve the validity of the result set. This will be done once the carved + cell class and use cases are fully handled. 
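+
+        As a concrete illustration of the bookkeeping below (the offsets are hypothetical), if the full
+        signature matched at (100, 110) and (240, 252) within the unallocated space, the regions handed to the
+        partial signature pass are the gaps around those matches:
+
+            serial_type_definition_match_objects_indices = [(100, 110), (240, 252)]
+            uncarved_unallocated_space_indices = [(0, 100), (110, 240), (252, len(unallocated_space))]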
+ + """ + + # Create a list of all ending indices for the serial type definition match objects and sort by beginning index + serial_type_definition_match_objects_indices = sorted([(match_object.start(0), match_object.end(0)) + for match_object in serial_type_definition_match_objects], + key=lambda x: x[0]) + + unallocated_space_length = len(unallocated_space) + serial_type_definition_match_objects_indices_length = len(serial_type_definition_match_objects_indices) + uncarved_unallocated_space_indices = [] + + # If there were no serial type definition matches, we set the whole unallocated space to be checked + if not serial_type_definition_match_objects_indices: + uncarved_unallocated_space_indices.append((0, unallocated_space_length)) + + else: + last_offset = None + for index, match_object_index in enumerate(serial_type_definition_match_objects_indices): + + if index == 0 and index != len(serial_type_definition_match_objects_indices) - 1: + + """ + + Check if we are at the first index and if there are additional indexes in the match object. If + this is the case, add the section of data from the beginning of the unallocated data to the + beginning of this index. This is only done if data is found. If there is no data (ie. the first + index of the first match object is the first index of the unallocated data), then we do not set + the new index on this first iteration. + + """ + + if match_object_index[0] != 0: + uncarved_unallocated_space_indices.append((0, match_object_index[0])) + last_offset = match_object_index[1] + + elif index == 0 and index == serial_type_definition_match_objects_indices_length - 1: + + """ + + Check if we are at the first index and if there are no additional indexes in the match object. If + this is the case, we add an index from the beginning of the unallocated data to the first index of + the first (and only) match index. If there is data between the ending index of the match we are + currently looking at and the end of the unallocated space, we add an index from the ending match + index to the ending of the unallocated data. + + """ + + uncarved_unallocated_space_indices.append((0, match_object_index[0])) + if match_object_index[1] != len(unallocated_space): + uncarved_unallocated_space_indices.append((match_object_index[1], unallocated_space_length)) + last_offset = match_object_index[1] + + elif index != 0 and index != serial_type_definition_match_objects_indices_length - 1: + + """ + + If we are not on the first index and there are more indexes to come, we just add the data portion + between the ending offset of the last match offset and the beginning index of this first match + offset. + + """ + + uncarved_unallocated_space_indices.append((last_offset, match_object_index[0])) + last_offset = match_object_index[1] + + elif index != 0 and index == serial_type_definition_match_objects_indices_length - 1: + + """ + + If we are not on the first index and this is the last index of the previous match objects, we then + add the index of the last entry and the first index of this match object. Then, if there is data + left in the unallocated space between the ending index of this match object and the end of the + unallocated space, we add the last entry between these indices. 
+ + """ + + uncarved_unallocated_space_indices.append((last_offset, match_object_index[0])) + if match_object_index[1] != len(unallocated_space): + uncarved_unallocated_space_indices.append((match_object_index[1], unallocated_space_length)) + else: + + log_message = "Found invalid use case while carving unallocated space for page number: {} " \ + "starting from the unallocated space start offset: {} with signature: {}." + log_message = log_message.format(page_number, unallocated_space_start_offset, signature.name) + logger.error(log_message) + raise CarvingError(log_message) + + """ + + Iterate through the uncarved portions of the unallocated space and update the cutoff offset to the be the + min index of the previous partial cutoff offset and the current uncarved allocated space index ending offset. + + """ + + partial_cutoff_offset = len(unallocated_space) + for partial_serial_type_definition_match in reversed(partial_serial_type_definition_match_objects): + for uncarved_allocated_space_index in reversed(uncarved_unallocated_space_indices): + + cutoff_offset = min(uncarved_allocated_space_index[1], partial_cutoff_offset) + + partial_serial_type_definition_start_offset = partial_serial_type_definition_match.start(0) + partial_serial_type_definition_end_offset = partial_serial_type_definition_match.end(0) + + if partial_serial_type_definition_start_offset >= uncarved_allocated_space_index[0] and \ + partial_serial_type_definition_end_offset <= uncarved_allocated_space_index[1]: + + relative_offset = unallocated_space_start_offset + partial_serial_type_definition_start_offset + file_offset = page_offset + relative_offset + first_column_serial_types = simplified_signature[0] + + try: + + # Create and append the carved b-tree cell to the carved cells list + carved_cells.append(CarvedBTreeCell(version, file_offset, source, page_number, + CELL_LOCATION.UNALLOCATED_SPACE, + 0, unallocated_space, + partial_serial_type_definition_start_offset, + partial_serial_type_definition_end_offset, + cutoff_offset, number_of_columns, signature, + first_column_serial_types)) + + # Update the partial cutoff offset + partial_cutoff_offset = partial_serial_type_definition_start_offset + + except (CellCarvingError, ValueError): + log_message = "Carved b-tree cell creation failed at file offset: {} page number: {} " \ + "cell source: {} in location: {} with partial serial type definition " \ + "start offset: {} and partial serial type definition end offset: {} with " \ + "partial cutoff offset of: {} number of columns: {} for master schema " \ + "entry with name: {} and table name: {}." 
+ log_message = log_message.format(file_offset, page_number, source, + CELL_LOCATION.UNALLOCATED_SPACE, + partial_serial_type_definition_start_offset, + partial_serial_type_definition_end_offset, + partial_cutoff_offset, number_of_columns, signature.name, + signature.table_name) + logger.warn(log_message) + warn(log_message, RuntimeWarning) + + # Return the cells carved from the freeblocks + return carved_cells diff --git a/sqlite_dissect/carving/rollback_journal_carver.py b/sqlite_dissect/carving/rollback_journal_carver.py new file mode 100644 index 0000000..7003455 --- /dev/null +++ b/sqlite_dissect/carving/rollback_journal_carver.py @@ -0,0 +1,124 @@ +from binascii import hexlify +from logging import getLogger +from struct import unpack +from sqlite_dissect.constants import FILE_TYPE +from sqlite_dissect.constants import LOGGER_NAME +from sqlite_dissect.constants import PAGE_TYPE +from sqlite_dissect.carving.carver import SignatureCarver +from sqlite_dissect.version_history import Commit + +""" + +rollback_journal_carver.py + +This script carves through a rollback journal file with the specified master schema entry and +signature and returns the entries. + +This script holds the following object(s): +RollBackJournalCarver(Carver) + +""" + + +class RollBackJournalCarver(object): + + @staticmethod + def carve(rollback_journal, version, master_schema_entry, signature): + + logger = getLogger(LOGGER_NAME) + + """ + + Read the page size in from the version class (the base SQLite database). This will be used instead of checking + the journal header since that is overwritten with zeros in most cases. If there is no database file, then + other means to determine the page size can be used by analyzing the journal file. This is something outside + the current scope of this project and could be something followed up on in the future for stand alone rollback + journal carving. + + """ + + page_size = version.page_size + + """ + + This is currently a hard coded value as to what is currently seen (sector size). + Some research was done and this value appeared to be hard coded in the SQLite c library. + Newer version so the library should be checked as to this was the 3090200 version. + + """ + + sector_size = 512 + + # The page record header and checksum sizes are fixed + page_record_header_size = 4 + page_record_checksum_size = 4 + + page_record_size = page_record_header_size + page_size + page_record_checksum_size + + # Initialize the carve commits + carved_commits = [] + + logger.debug("Starting carving table: %s... 
" % master_schema_entry.name) + + has_data = True + offset = sector_size + while has_data: + + page_number = unpack(b">I", rollback_journal.file_handle.read_data(offset, page_record_header_size))[0] + page_content = rollback_journal.file_handle.read_data(offset + page_record_header_size, page_size) + page_type = hexlify(page_content[:1]) + page_checksum = hexlify(rollback_journal.file_handle.read_data(offset + page_record_header_size + + page_size, page_record_checksum_size)) + + logger.debug("At offset: %s page Number: %s of type: %s has content with checksum of: %s" + % (offset, page_number, page_type, page_checksum)) + + if page_type in ["0d", "05"]: + + page_type_string = PAGE_TYPE.B_TREE_TABLE_LEAF if page_type == "0d" else PAGE_TYPE.B_TREE_TABLE_INTERIOR + carved_cells = SignatureCarver.carve_unallocated_space(version, FILE_TYPE.ROLLBACK_JOURNAL, page_number, + 0, page_content, signature, + offset + page_record_header_size) + + commit = Commit(master_schema_entry.name, FILE_TYPE.ROLLBACK_JOURNAL, -1, + version.database_text_encoding, page_type_string, -1, None) + commit.carved_cells.update({cell.md5_hex_digest: cell for cell in carved_cells}) + carved_commits.append(commit) + + offset += page_record_size + + # Check if the next page record is a full page record size or not + if (offset + page_record_size) >= rollback_journal.file_handle.file_size: + + # The page record is cut off since it is goes beyond the end of the file + has_data = False + + """ + + This accounts for the last incomplete block/frame of the journal file for carving. + + Since this isn't a full page record, we do not care about the checksum since it should be cut off. + + """ + + page_number = unpack(b">I", rollback_journal.file_handle.read_data(offset, 4))[0] + page_content = rollback_journal.file_handle.read_data(offset + page_record_header_size, + rollback_journal.file_handle.file_size - + page_record_header_size - offset) + page_type = hexlify(page_content[:1]) + + if page_type in ["0d", "05"]: + + page_type_string = PAGE_TYPE.B_TREE_TABLE_LEAF if page_type == "0d" \ + else PAGE_TYPE.B_TREE_TABLE_INTERIOR + carved_cells = SignatureCarver.carve_unallocated_space(version, FILE_TYPE.ROLLBACK_JOURNAL, + page_number, 0, page_content, signature, + offset + page_record_header_size) + + commit = Commit(master_schema_entry.name, FILE_TYPE.ROLLBACK_JOURNAL, -1, + version.database_text_encoding, page_type_string, -1, None) + commit.carved_cells.update({cell.md5_hex_digest: cell for cell in carved_cells}) + carved_commits.append(commit) + + logger.debug("Finished carving table: %s... 
" % master_schema_entry.name) + return carved_commits diff --git a/sqlite_dissect/carving/signature.py b/sqlite_dissect/carving/signature.py new file mode 100644 index 0000000..812b449 --- /dev/null +++ b/sqlite_dissect/carving/signature.py @@ -0,0 +1,1628 @@ +from abc import ABCMeta +from abc import abstractmethod +from copy import copy +from logging import getLogger +from re import sub +from warnings import warn +from sqlite_dissect.carving.utilities import get_content_size +from sqlite_dissect.constants import LOGGER_NAME +from sqlite_dissect.constants import MASTER_SCHEMA_ROW_TYPE +from sqlite_dissect.constants import STORAGE_CLASS +from sqlite_dissect.constants import TYPE_AFFINITY +from sqlite_dissect.file.database.utilities import aggregate_leaf_cells +from sqlite_dissect.file.database.utilities import get_pages_from_b_tree_page +from sqlite_dissect.file.schema.master import OrdinaryTableRow +from sqlite_dissect.file.schema.master import VirtualTableRow +from sqlite_dissect.file.version_parser import VersionParser +from sqlite_dissect.exception import SignatureError + +""" + +signature.py + +This script holds the objects for the signature generation of SQLite table and index b-trees for carving. + +This script holds the following object(s): +Signature(VersionParser) +SchemaColumnSignature(object) +TableColumnSignature(object) +TableRowSignature(object) +ColumnSignature(object) +ColumnFixedLengthSignature(ColumnSignature) +ColumnVariableLengthSignature(ColumnSignature) +ColumnReducedVariableLengthSignature(ColumnVariableLengthSignature) +ColumnNonReducedVariableLengthSignature(ColumnVariableLengthSignature) + +""" + + +class Signature(VersionParser): + + def __init__(self, version_history, master_schema_entry, version_number=None, ending_version_number=None): + + """ + + + + Note: The schema and table column signatures will be lists ordered in relation to the index of the column + referred to in the table. The table row signatures will be a dictionary indexed by the serial type + signature from the record representing the unique combination of serial types for that row pointing + to the related table row signature. + + Note: The above note is not true for "without rowid" tables. A warning will be raised if this + case is encountered. + + Note: It is important to pay attention to the column breakdown in the usage of this class in the case of an + altered table. This class leaves it up to the user to check for these fields and make use of them + accordingly. + + :param version_history: + :param master_schema_entry: + :param version_number: + :param ending_version_number: + + :return: + + :raise: + + """ + + # Call to the super class + super(Signature, self).__init__(version_history, master_schema_entry, version_number, ending_version_number) + + logger = getLogger(LOGGER_NAME) + + """ + + Since the index signatures have not been fully investigated, a warning is printed here to alert of this. + + """ + + if master_schema_entry.row_type == MASTER_SCHEMA_ROW_TYPE.INDEX: + log_message = "An index row type was found for signature which is not fully supported for master " \ + "schema entry root page number: {} row type: {} name: {} table name: {} and sql: {}." 
+ log_message = log_message.format(master_schema_entry.root_page_number, + master_schema_entry.row_type, master_schema_entry.name, + master_schema_entry.table_name, master_schema_entry.sql) + logger.warn(log_message) + warn(log_message, RuntimeWarning) + + if master_schema_entry.internal_schema_object: + log_message = "An internal schema object index row type was found for the version parser which is " \ + "not fully supported for master schema entry root page number: {} type: {} name: {} " \ + "table name: {} and sql: {} and may result in erroneous cells." + log_message = log_message.format(master_schema_entry.root_page_number, + master_schema_entry.row_type, master_schema_entry.name, + master_schema_entry.table_name, master_schema_entry.sql) + logger.warn(log_message) + warn(log_message, RuntimeWarning) + + log_message = "Creating signature for master schema entry with name: {} table name: {} row type: {} and " \ + "sql: {} for version number: {} and ending version number: {}." + log_message = log_message.format(self.name, self.table_name, self.row_type, self.sql, + self.parser_starting_version_number, self.parser_ending_version_number) + logger.debug(log_message) + + """ + + Create and initialize the variables for the signature + + The schema column signatures and table column signatures will be in order that the fields are in the table. The + table row signatures will be in a dictionary keyed off of the record serial type signature. + + """ + + self.schema_column_signatures = [] + self.table_row_signatures = {} + self.table_column_signatures = [] + + """ + + Below variables are declared for total records and unique records. These are counters to determine the number + of total rows reviewed across all versions (including duplicates) and the unique rows (non-duplicated) between + all versions. This is due to the face that we can have multiple pages with the same data and only minor + additions/subtractions to that data. Therefore, total records will record the running total of all records + regardless of uniqueness and unique records will be the total number of records with no duplicates included. + + Note: We include the row id into the uniqueness. This way similar signatures between different rows will + build up a more accurate probability. + + """ + + self.total_records = 0 + self.unique_records = 0 + + """ + + Derived the schema column signatures from the SQL statements in the master schema from the + table and index types. + + Note: The order of column definitions will match the columns as defined in the schema SQL statement. + + Note: IndexRow master schema entries do not have column definitions at this time so we need to make sure + the object is a OrdinaryTableRow object. (VirtualTableRow objects or OrdinaryTableRow that are + "without rowid" tables do not have column definitions at this time either.) This results in only + normal tables currently having signatures. Warnings have already been thrown in regards to these + use cases above. + + """ + + if isinstance(master_schema_entry, OrdinaryTableRow) and not master_schema_entry.without_row_id: + for column_definition in master_schema_entry.column_definitions: + self.schema_column_signatures.append(SchemaColumnSignature(column_definition)) + + if isinstance(master_schema_entry, VirtualTableRow): + + """ + + Below we initialize variables for the signature to prevent issues with the stringify method. 
After that, + a warning message is printed and the application continues on since the virtual tables in SQLite are not + currently supported. All fields are set to the defaults (False and/or None/Empty values). + + """ + self.altered_columns = False + self.column_breakdown = {} + + log_message = "Virtual table found in signature for master schema entry with name: {} table name: {} " \ + "row type: {} and sql: {} for version number: {} and ending version number: {}. A " \ + "signature will not be generated since virtual tables are not fully supported yet." + log_message = log_message.format(self.name, self.table_name, self.row_type, self.sql, + self.parser_starting_version_number, self.parser_ending_version_number) + log_message = log_message.format() + getLogger(LOGGER_NAME).warn(log_message) + warn(log_message, RuntimeWarning) + + elif self.parser_starting_version_number is not None and self.parser_ending_version_number is not None: + + # Get the versions + versions = version_history.versions + + """ + + Below the column definitions are pulled from the initial, base version, master schema. Since these columns + will stay the same across all updates to the master schema entry, it is safe to set it here. The only field + that can be updated in the master schema entry without causing a new master schema entry is the root page + number. + + """ + + # Set the column definitions + column_definitions = master_schema_entry.column_definitions + + # Create a set for account cells so we don't account for the same record twice across versions + accounted_for_cell_digests = set() + + # Initialize the b-tree page numbers + root_b_tree_page_numbers = [] + + # Iterate through the versions in reference to this master schema entry + for version_number in range(self.parser_starting_version_number, + self.parser_ending_version_number + 1): + + version = versions[version_number] + root_page_number = self.root_page_number_version_index[version_number] + + b_tree_updated = False + + # Check if this is the first version to be investigated + if version_number == self.parser_starting_version_number: + b_tree_updated = True + + # Check if the root page number changed + elif root_page_number != self.root_page_number_version_index[version_number - 1]: + b_tree_updated = True + + # Check if any of the non-root pages changed + elif [page_number for page_number in root_b_tree_page_numbers + if page_number in version.updated_b_tree_page_numbers]: + b_tree_updated = True + + # Parse the b-tree page structure if it was updated + if b_tree_updated: + + # Get the root page and root page numbers from the first version + root_page = version.get_b_tree_root_page(root_page_number) + root_b_tree_page_numbers = [b_tree_page.number for b_tree_page + in get_pages_from_b_tree_page(root_page)] + + """ + + Below we aggregate the records together. This function returns the total of records and then + a dictionary of records indexed by their cell md5 hex digest to record. This dictionary may + hold less records than the total since records may have already been accounted for in previous + versions and are ignored since their cell md5 hex digests are in the accounted for cell digests + already. + + Note: The number of unique records reflects the total of all records in terms of uniqueness + regardless of the number of columns that are reflected in each row. 
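+
+                    Note: A minimal sketch of this counting scheme (a simplified stand-in for
+                          aggregate_leaf_cells; the cell list and md5_hex_digest attribute below are assumed
+                          here for illustration only):
+
+                              def count_cells(cells, accounted_for_cell_digests):
+                                  total = len(cells)
+                                  new_records = {}
+                                  for cell in cells:
+                                      if cell.md5_hex_digest not in accounted_for_cell_digests:
+                                          accounted_for_cell_digests.add(cell.md5_hex_digest)
+                                          new_records[cell.md5_hex_digest] = cell
+                                  return total, new_records
+
+                          Every version contributes its full cell count to the total records, while only cells
+                          whose digest has not been seen in an earlier version contribute to the unique records.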
+ + """ + + total, records = aggregate_leaf_cells(root_page, accounted_for_cell_digests, True) + + # Add the totals to the counts + self.total_records += total + self.unique_records += len(records) + + """ + + The column definitions in the master schema entry are parsed in order. Therefore, the order of the + column definitions should be in the same order as the columns in the record. These orders are + assumed to be equivalent to each other. + + Note: In SQLite, it is not possible to rename or remove columns, but columns can be added. + Therefore, some records may have less entries in then than the number of column definitions + and the table row signatures may have a different number of columns (lesser or equal to + the number of column definitions) in them. + + """ + + # Iterate through each of the records + for cell_md5_hex_digest, record in records.iteritems(): + + """ + + Note: The serial type signature is a series of serial types in a string to determine the + structure of that record. For variable length columns, -2 is used for strings and + -1 is used for blobs. The variable length signatures are similar to Epilog. + + """ + + # Check if the serial type signature of the record is not already in the row signatures + if record.serial_type_signature not in self.table_row_signatures: + + # Create and add a new table row signature + table_row_signature = TableRowSignature(column_definitions, record) + self.table_row_signatures[record.serial_type_signature] = table_row_signature + + # The signature already exists + else: + + # Update the table row signature + self.table_row_signatures[record.serial_type_signature].update(record) + + """ + + Iterate through each of the table row signatures and update the total number of records that were parsed + in order to create probability statistics for that row. + + We also track the count of each row and then match that against the accounted for records for additional + validation. + + """ + + total_table_row_signature_count = 0 + + # Iterate through the table row signatures and set the total rows and increment the count + for serial_type_signature, table_row_signature in self.table_row_signatures.iteritems(): + table_row_signature.number_of_rows = self.unique_records + total_table_row_signature_count += table_row_signature.count + + # Make sure the count of records match + if total_table_row_signature_count != self.unique_records: + log_message = "The total table row signature count: {} does not match the number of unique " \ + "records: {} for master schema entry row type: {} with root page number: {} name: {} " \ + "table name: {} and sql: {}." + log_message = log_message.format(total_table_row_signature_count, self.unique_records, + master_schema_entry.row_type, master_schema_entry.root_page_number, + master_schema_entry.name, master_schema_entry.table_name, + master_schema_entry.sql) + logger.error(log_message) + raise SignatureError(log_message) + + """ + + Below we have to account for the use case of altered tables. + + In order to do this we have a altered columns boolean that is set to true if this is detected. We also + create a dictionary to represent the breakdown of the columns: + + column_breakdown[NUMBER_OF_COLUMNS] = (NUMBER_OF_ROWS, PROBABILITY) + + where NUMBER_OF_ROWS is the number of rows that has exactly the NUMBER_OF_COLUMNS in it, and + where PROBABILITY is the NUMBER_OF_ROWS divided by the number of unique records. + + Additionally, there may be no entries in for the last modification to the table. 
For example, there may be
+            5 rows with 10 columns, but the latest SQL/schema for the table shows that it has 11 columns. This can
+            occur if no rows are inserted after the last alter statement. In order to account for this, the number of
+            columns found for the schema is checked against the column breakdown dictionary and, if that number of
+            columns is not found, it is added to the dictionary with 0 NUMBER_OF_ROWS and 0 PROBABILITY. It is
+            important to note that it is only added if the number of columns in the SQL/schema is greater than the
+            number of columns in the rows. If the number of columns in the SQL/schema is less, an exception
+            will be raised.
+
+            In the case that there are no entries in the table itself, the NUMBER_OF_ROWS and PROBABILITY will both
+            be set to 0 for the SQL/schema number of columns in the column breakdown.
+
+            It is up to the user of the signature class to check against the column breakdown in order to determine
+            the best way to carve the data they are looking at. This class merely supplies the information and
+            leaves it up to the user on how to make use of it.
+
+            Also, in regards to probability, each column signature's probability is based on the number of rows in
+            which that column appeared. Therefore, columns added later through alter table statements will have
+            probability calculated based only on the rows that contained those columns. In order to calculate the
+            probability of a column signature across all rows, the probability of that signature should be
+            multiplied by the probability that the column appears at all, which can be derived from the column
+            breakdown based on its column index. A better way to do this may be found moving forward.
+
+            Note: The altered columns flag is not 100% deterministic. It can only be determined when:
+                  1.) The number of columns differs across rows
+                  2.) The number of columns in the SQL/schema is greater than the number of columns in the rows
+
+            Note: It may be better to find a way to correlate the altered columns flag to a master schema associated
+                  class.
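+
+            Note: As an illustrative sketch (hypothetical counts, not tied to any particular database), the
+                  breakdown and probability computation described above behaves as follows:
+
+                      row_signature_column_counts = {10: 5}   # 5 unique rows were found, each with 10 columns
+                      schema_column_length = 11               # the latest schema defines 11 columns
+                      unique_records = 5
+
+                      column_breakdown = dict(row_signature_column_counts)
+                      column_breakdown.setdefault(schema_column_length, 0)
+                      for column_count, row_count in list(column_breakdown.items()):
+                          probability = float(row_count) / unique_records if unique_records else 0
+                          column_breakdown[column_count] = (row_count, probability)
+
+                      # column_breakdown == {10: (5, 1.0), 11: (0, 0.0)}; since there is more than one entry,
+                      # the altered columns flag would be set to True.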
+ + """ + + # Instantiate the altered columns flag and the column breakdown + self.altered_columns = False + self.column_breakdown = {} + + # Iterate through all of the table row signatures and add up the counts of each one based on column count + for table_row_signature in self.table_row_signatures.values(): + column_signature_length = len(table_row_signature.column_signatures) + if column_signature_length in self.column_breakdown: + self.column_breakdown[column_signature_length] += table_row_signature.count + else: + self.column_breakdown[column_signature_length] = table_row_signature.count + + # Get the number of columns in the schema and add it to the column breakdown if not already added + schema_column_length = len(self.schema_column_signatures) + if schema_column_length not in self.column_breakdown: + self.column_breakdown[schema_column_length] = 0 + + # Iterate through the column breakdown and compute probabilities + for column_count in self.column_breakdown: + row_count = self.column_breakdown[column_count] + probability = float(row_count) / self.unique_records if self.unique_records else 0 + self.column_breakdown[column_count] = (row_count, probability) + + # The columns have been altered if there is more than one entry in the column breakdown + if len(self.column_breakdown) > 1: + self.altered_columns = True + + """ + + At this point we have iterated through all the versions and found all of the table row signatures to each + unique row structure that we found. If there was no root page or no rows found in any of the pages, then + the table row signatures will be empty. Below we parse through each of the table row signatures and create + column signatures across them inverting the data so we can see the signatures in two ways. First, across + the rows, and second, across the columns. + + """ + + # Check if there were table row signatures found + if self.table_row_signatures: + + """ + + Next, we create a table row column dictionary with the column index as the key and the value an array + of serial types aggregated across all of the table row signatures of that column index. Once we get + the table row column serial type arrays, we create the table column signatures. + + This process basically inverts the table row signatures in order to generate the table + column signatures. + + Note: The column definitions in the master schema entry are parsed in order. Therefore, the order of + the column definitions should be in the same order as the columns in the record. Also, since the + table row signatures are created off of the record columns and definitions the columns in the + table row signature will also be in the same order. Previously, the column definition size was + used to iterate through each row with to get the columns pertaining to the column index of the + column definition. However, every row may not have every column and therefore the length of the + column signatures for each row being iterated through is used. This will occur if multiple + variations of columns occur in the row indicating a table that has been altered at some point. + + Note: The indices of the column signatures should match the indices of the record columns and the + columns in the table row signatures since they are all derived originally from the master schema. + Below, the index in the range of column definitions size is used for the table row columns + creation and the column signatures in the table row signatures. 
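+
+                Note: A small illustration of this inversion (hypothetical serial types, shown in place of the
+                      column signature objects actually stored): two table row signatures whose columns reduce
+                      to [1, -2, 0] and [1, -2] invert as follows:
+
+                          rows = [[1, -2, 0], [1, -2]]
+                          table_row_columns = {}
+                          for row in rows:
+                              for column_index, serial_type in enumerate(row):
+                                  table_row_columns.setdefault(column_index, []).append(serial_type)
+
+                          # table_row_columns == {0: [1, 1], 1: [-2, -2], 2: [0]}; the shorter (older) row
+                          # simply contributes nothing to the column that was added later.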
+ + """ + + table_row_columns = {} + + # Iterate through the table row signatures and create the table row columns dictionary + for table_row_md5_hex_digest, table_row_signature in self.table_row_signatures.iteritems(): + + # Iterate through all of the column signatures in the current table row signature + for column_index in range(len(table_row_signature.column_signatures)): + + # Add or append the column signature in the table row columns dictionary + if column_index in table_row_columns: + table_row_columns[column_index].append(table_row_signature.column_signatures[column_index]) + else: + table_row_columns[column_index] = [table_row_signature.column_signatures[column_index]] + + # Iterate through the table row columns and create the table column signatures + for table_row_column_index, table_row_column_serial_type_array in table_row_columns.iteritems(): + column_name = column_definitions[table_row_column_index].column_name + self.table_column_signatures.append(TableColumnSignature(table_row_column_index, column_name, + table_row_column_serial_type_array)) + + # No table row signatures were found + else: + + """ + + Note: Both of these should be 0 if no table row signatures were found. Checking the total records + should actually be enough for this check but both are checked for additional validity. + + """ + + # Make sure no records were found + if self.total_records or self.unique_records: + log_message = "The total records: {} and unique records: {} are both not 0 as expected for " \ + "master schema entry row type: {} with root page number: {} name: {} table " \ + "name: {} and sql: {}." + log_message = log_message.format(self.total_records, self.unique_records, + master_schema_entry.row_type, master_schema_entry.root_page_number, + master_schema_entry.name, master_schema_entry.table_name, + master_schema_entry.sql) + logger.error(log_message) + raise SignatureError(log_message) + + """ + + At this point we now have two sets of signatures depending on the way you want to view the table signatures. + 1.) self._table_row_signatures: Each unique row of the table in relation to serial types with probability of + each row and column serial type if it is a string or blob. + 2.) self._table_column_signatures: Each column of the table with the serial types realized across all the + rows along with probability of each serial type in respect to that + column. + + """ + + """ + + Since we may not have records, and may possibly not have a schema to parse schema column signatures from + (depending if it is a virtual table, internal schema object, etc.), we check the lengths of the schema + column signatures and table column signatures so that if both signatures exist, the column lengths must + be equal. We take the max of the two lengths as the number of columns. + + """ + + schema_column_signatures_length = len(self.schema_column_signatures) + table_column_signatures_length = len(self.table_column_signatures) + + if schema_column_signatures_length and table_column_signatures_length: + if schema_column_signatures_length != table_column_signatures_length: + log_message = "The schema column signatures length: {} is not equal to the table column signatures " \ + "length: {} for master schema entry row type: {} with root page number: {} name: {} " \ + "table name: {} and sql: {}." 
+ log_message = log_message.format(schema_column_signatures_length, table_column_signatures_length, + master_schema_entry.row_type, master_schema_entry.root_page_number, + master_schema_entry.name, master_schema_entry.table_name, + master_schema_entry.sql) + logger.error(log_message) + raise SignatureError(log_message) + + self.number_of_columns = max(schema_column_signatures_length, table_column_signatures_length) + + def stringify(self, padding="", print_table_row_signatures=True, print_schema_column_signatures=True, + print_table_column_signatures=True, print_column_signatures=True): + string = "\n" \ + + padding + "Number of Columns: {}\n" \ + + padding + "Total Records: {}\n" \ + + padding + "Unique Records: {}\n" \ + + padding + "Altered Columns: {}\n" \ + + padding + "Column Breakdown: {}\n" \ + + padding + "Schema Column Signatures Length: {}\n" \ + + padding + "Table Row Signatures Length: {}\n" \ + + padding + "Table Column Signatures Length: {}\n" \ + + padding + "Recommended Schema Column Signature: {}\n" \ + + padding + "Complete Schema Column Signature: {}\n" \ + + padding + "Focused Signature: {}\n" \ + + padding + "Simplified Signature: {}\n" \ + + padding + "Focused Probability Signature: {}\n" \ + + padding + "Simplified Probability Signature: {}\n" \ + + padding + "Epilog Schema Signature: {}\n" \ + + padding + "Epilog Focused Signature: {}\n" \ + + padding + "Epilog Simplified Signature: {}" + string = string.format(self.number_of_columns, + self.total_records, + self.unique_records, + self.altered_columns, + self.column_breakdown, + len(self.schema_column_signatures), + len(self.table_row_signatures), + len(self.table_column_signatures), + self.recommended_schema_signature, + self.complete_schema_signature, + self.focused_signature, + self.simplified_signature, + self.focused_probabilistic_signature, + self.simplified_probabilistic_signature, + self.epilog_schema_signature, + self.epilog_focused_signature, + self.epilog_simplified_signature) + if print_schema_column_signatures: + for schema_column_signature in self.schema_column_signatures: + signature_string = "\n" + padding + "Schema Column Signature: {}" + signature_string = signature_string.format(schema_column_signature.stringify("\t")) + string += signature_string + if print_table_row_signatures: + for table_row_md5_hex_digest, table_row_signature in self.table_row_signatures.iteritems(): + signature_string = "\n" + padding + "Table Row Signature:\n{}" + signature_string = signature_string.format(table_row_signature.stringify("\t", print_column_signatures)) + string += signature_string + if print_table_column_signatures: + for table_column_signature in self.table_column_signatures: + signature_string = "\n" + padding + "Table Column Signature: {}" + signature_string = signature_string.format(table_column_signature.stringify("\t", + print_column_signatures)) + string += signature_string + return super(Signature, self).stringify(padding) + string + + @property + def epilog_focused_signature(self): + + epilog_focused_signature = [] + + for column_signature in self.focused_signature: + + # Copy the column signature signature as a base + epilog_column_signature = copy(column_signature) + + """ + + Epilog does not log the 8 and 9 serial types in the focused schema. Instead it uses serial type 1 for + 8 and 9. + + In order to represent 8 and 9 serial types in epilog column signatures, after epilog replaces the 8 or 9 + with a 1, it sets the min and max files appropriately for that field. For example setting max = 1. 
+ + More investigation needs to go into the use of epilog signatures with 8 and 9. + + """ + + insert_single_byte_integer = False + + if 8 in epilog_column_signature: + epilog_column_signature.remove(8) + insert_single_byte_integer = True + + if 9 in epilog_column_signature: + epilog_column_signature.remove(9) + insert_single_byte_integer = True + + if insert_single_byte_integer and 1 not in epilog_column_signature: + epilog_column_signature.append(1) + + epilog_focused_signature.append(sorted(epilog_column_signature, key=int)) + + return epilog_focused_signature + + @property + def epilog_schema_signature(self): + + epilog_schema_signature = [] + + for schema_column_signature in self.schema_column_signatures: + + """ + + Note: The recommended signature is used here instead of the complete since this seems more in line + to the epilog signatures themselves, along with reducing a lot of serial types in the complete + signature that may not apply. + + """ + + # Copy the recommended signature from this particular schema column signature as a base + epilog_column_signature = copy(schema_column_signature.recommended_signature) + + # Append a null value as epilog does if it is not in the column signature already + if 0 not in epilog_column_signature: + epilog_column_signature.append(0) + + epilog_schema_signature.append(sorted(epilog_column_signature, key=int)) + + return epilog_schema_signature + + @property + def epilog_simplified_signature(self): + + epilog_simplified_signature = [] + + for column_signature in self.simplified_signature: + + # Copy over the like serial types between this column signature and the epilog column signature + epilog_column_signature = [x for x in column_signature if x in [-2, -1, 0, 7]] + + """ + + Check if any of the integer serial types are in the column signature and add all integer serial + types if any of them exist since this is how epilog seems to do it. However, there may be use + cases in regards to 8 and 9 being used for non-integer storage classes. 
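+
+            As an illustrative sketch (hypothetical column signature), a simplified column signature of
+            [-2, 0, 1] becomes:
+
+                epilog_column_signature = [x for x in [-2, 0, 1] if x in [-2, -1, 0, 7]]   # [-2, 0]
+                # at least one integer serial type (1) is present, so all integer serial types are added
+                epilog_column_signature.extend([1, 2, 3, 4, 5, 6, 8, 9])
+                sorted(epilog_column_signature, key=int)   # [-2, 0, 1, 2, 3, 4, 5, 6, 8, 9]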
+ + """ + + integer_serial_types = [1, 2, 3, 4, 5, 6, 8, 9] + if len(set(integer_serial_types).intersection(set(column_signature))): + epilog_column_signature.extend(integer_serial_types) + + epilog_simplified_signature.append(sorted(epilog_column_signature, key=int)) + + return epilog_simplified_signature + + @property + def complete_schema_signature(self): + simplified_signatures = [] + for schema_column_signature in self.schema_column_signatures: + simplified_signatures.append(schema_column_signature.complete_signature) + return simplified_signatures + + @property + def focused_probabilistic_signature(self): + focused_signatures = [] + for table_column_signature in self.table_column_signatures: + focused_signatures.append(table_column_signature.focused_probabilistic_signature) + return focused_signatures + + @property + def focused_signature(self): + focused_signatures = [] + for table_column_signature in self.table_column_signatures: + focused_signatures.append(table_column_signature.focused_signature) + return focused_signatures + + @property + def recommended_schema_signature(self): + simplified_signatures = [] + for schema_column_signature in self.schema_column_signatures: + simplified_signatures.append(schema_column_signature.recommended_signature) + return simplified_signatures + + @property + def simplified_probabilistic_signature(self): + simplified_signatures = [] + for table_column_signature in self.table_column_signatures: + simplified_signatures.append(table_column_signature.simplified_probabilistic_signature) + return simplified_signatures + + @property + def simplified_signature(self): + simplified_signatures = [] + for table_column_signature in self.table_column_signatures: + simplified_signatures.append(table_column_signature.simplified_signature) + return simplified_signatures + + +class SchemaColumnSignature(object): + + """ + + SchemaColumnSignature + + This class will take a column definition and create a schema column definition from it. This is mostly useful + in the case where there are not row entries in the table and a signature has to be built directly off the data + types in the column definition. Otherwise, the table column signature or table row signature would be recommended. + This is due to the fact that this signature cannot validate the fields will be the types derived from the data types + of the column due to the way SQLite works with storage classes and type affinities. This class will retrieve the + type affinity derived from the column data type (if specified) and base the signatures off of those affinities. + Due to this, there will be two signatures in this class that can be retrieved: + + 1.) Recommended Signature: The recommended signature for what is most likely to be seen in the columns based on the + type affinity. + + The recommended signature will be based off the data type and recommended storage class used for that data type, + if specified. The following serial types are used for the following type affinities: + + Type Affinity Serial Type Signature + INTEGER [1, 2, 3, 4, 5, 6, 8, 9] + REAL [1, 2, 3, 4, 5, 6, 7, 8, 9] + NUMERIC [-2] + TEXT [-1] + BLOB (or if not specified) [1, 2, 3, 4, 5, 6, 7, 8, 9] + + 2.) Complete Signature: The full possibility of what can be seen in the columns based on the type affinity. + + Unfortunately, almost every type affinity can be stored as any storage class with the exception of the TEXT + type affinity. The storage class is derived from the combination of the type affinity and the actual value. 
+ Therefore the complete signature will include all storage classes for every type affinity except TEXT will + will only include the TEXT, BLOB, and NULL storage classes. (The TEXT, BLOB and NULL storage classes can be + used for all type affinities.) + + Type Affinity Storage Class + INTEGER INTEGER, REAL, TEXT, BLOB, NULL + REAL INTEGER, REAL, TEXT, BLOB, NULL + NUMERIC INTEGER, REAL, TEXT, BLOB, NULL + TEXT TEXT, BLOB, NULL + BLOB (or if not specified) INTEGER, REAL, TEXT, BLOB, NULL + + Due to this, similar to above, there is also recommended storage class and possible storage class array for + what the storage classes of the particular column may be. + + However, the REAL type affinity only uses the storage class INTEGER to store it's values into the file but + reads it back out as REAL even though it is not in the file. This conversion is done behind the scenes in + SQLite and therefore the possible storage classes for REAL can be updated as: + + REAL REAL, TEXT, BLOB, NULL + + This is a very important (hidden) use case to keep in mind. + + This results in all type affinities having a signature of: [-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9], instead + of the TEXT type affinity which has a signature of: [-2, -1, 0]. + + Since many storage classes are possible for each data type, the possible storage classes are set in an array and + are as specified above. + + Note: Serial types 8 and 9 are used in all recommended signatures (except TEXT) since these two types are for 0 and + 1 constants which are used a lot in order to reserve space in the SQLite file. + + Note: In the column definition, the derived data type name may be None if no data type was specified in the + SQL. If this is the case, the data type will be invalid and the type affinity will be BLOB per the + way affinities and storage classes are related depending on data type to the SQLite documentation. + + Note: Since TEXT and BLOB are variable length data types, -1 will be used to represent a BLOB and -2 will be used + to represent a string. This is similar to Epilog's handling of variable length data types in signatures. + + Note: There may be the possibility that columns were added causing inconsistencies between previous versions of the + row data that may not be picked up if solely going off of a schema based signature. However, if there is no + data to derive a signature from, we have no other recourse but to use the schema signature. In the future + signature files may be able to be imported in and out for this purpose based on os, application, and version. 
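+
+    Note: For reference, the type affinity itself is derived from the declared column type using SQLite's
+          documented rules. The sketch below (hypothetical helper; the actual derivation is performed by the
+          column definition parsing elsewhere in this package) summarizes those rules:
+
+              def derive_type_affinity(declared_type):
+                  declared_type = (declared_type or "").upper()
+                  if "INT" in declared_type:
+                      return "INTEGER"
+                  if "CHAR" in declared_type or "CLOB" in declared_type or "TEXT" in declared_type:
+                      return "TEXT"
+                  if not declared_type or "BLOB" in declared_type:
+                      return "BLOB"
+                  if "REAL" in declared_type or "FLOA" in declared_type or "DOUB" in declared_type:
+                      return "REAL"
+                  return "NUMERIC"
+
+              # derive_type_affinity("VARCHAR(255)") -> "TEXT"    derive_type_affinity(None) -> "BLOB"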
+ + """ + + def __init__(self, column_definition): + + self.derived_data_type_name = column_definition.derived_data_type_name + self.data_type = column_definition.data_type + self.type_affinity = column_definition.type_affinity + + if self.type_affinity == TYPE_AFFINITY.INTEGER: + + self.recommended_storage_class = STORAGE_CLASS.INTEGER + self.possible_storage_classes = [STORAGE_CLASS.INTEGER, STORAGE_CLASS.REAL, STORAGE_CLASS.TEXT, + STORAGE_CLASS.BLOB, STORAGE_CLASS.NULL] + + self.recommended_signature = [1, 2, 3, 4, 5, 6, 8, 9] + self.complete_signature = [-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + + elif self.type_affinity == TYPE_AFFINITY.REAL: + + self.recommended_storage_class = STORAGE_CLASS.REAL + self.possible_storage_classes = [STORAGE_CLASS.REAL, STORAGE_CLASS.TEXT, + STORAGE_CLASS.BLOB, STORAGE_CLASS.NULL] + + self.recommended_signature = [1, 2, 3, 4, 5, 6, 7, 8, 9] + self.complete_signature = [-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + + elif self.type_affinity == TYPE_AFFINITY.TEXT: + + self.recommended_storage_class = TYPE_AFFINITY.TEXT + self.possible_storage_classes = [STORAGE_CLASS.TEXT, STORAGE_CLASS.BLOB, STORAGE_CLASS.NULL] + + self.recommended_signature = [-2] + self.complete_signature = [-2, -1, 0] + + elif self.type_affinity == TYPE_AFFINITY.BLOB: + + self.recommended_storage_class = TYPE_AFFINITY.BLOB + self.possible_storage_classes = [STORAGE_CLASS.INTEGER, STORAGE_CLASS.REAL, STORAGE_CLASS.TEXT, + STORAGE_CLASS.BLOB, STORAGE_CLASS.NULL] + + self.recommended_signature = [-1] + self.complete_signature = [-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + + elif self.type_affinity == TYPE_AFFINITY.NUMERIC: + + self.recommended_storage_class = TYPE_AFFINITY.NUMERIC + self.possible_storage_classes = [STORAGE_CLASS.INTEGER, STORAGE_CLASS.REAL, STORAGE_CLASS.TEXT, + STORAGE_CLASS.BLOB, STORAGE_CLASS.NULL] + + self.recommended_signature = [1, 2, 3, 4, 5, 6, 7, 8, 9] + self.complete_signature = [-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + + else: + + log_message = "Invalid type affinity found: {}.".format(self.type_affinity) + getLogger(LOGGER_NAME).error(log_message) + raise SignatureError(log_message) + + def __repr__(self): + return self.__str__().encode("hex") + + def __str__(self): + return sub("\t", "", sub("\n", " ", self.stringify())) + + def stringify(self, padding=""): + string = padding + "Derived Data Type Name: {}\n" \ + + padding + "Data Type: {}\n" \ + + padding + "Type Affinity: {}\n" \ + + padding + "Recommended Storage Class: {}\n" \ + + padding + "Possible Storage Classes: {}\n" \ + + padding + "Recommended Signature: {}\n" \ + + padding + "Complete Signature: {}" + string = string.format(self.derived_data_type_name, + self.data_type, + self.type_affinity, + self.recommended_storage_class, + self.possible_storage_classes, + self.recommended_signature, + self.complete_signature) + return string + + +class TableColumnSignature(object): + + def __init__(self, index, name, column_signatures): + + self._logger = getLogger(LOGGER_NAME) + + self.count = 0 + self.index = index + self.name = name + self.column_signatures = {} + + for column_signature in column_signatures: + + if column_signature.index != self.index: + log_message = "Invalid column signature index: {} found for table column signature with index: {} " \ + "and name: {}." 
+ log_message = log_message.format(column_signature.index, self.index, self.name) + self._logger.error(log_message) + raise SignatureError(log_message) + + if column_signature.name != self.name: + log_message = "Invalid column signature name: {} found for table column signature with name: {} " \ + "and name: {}." + log_message = log_message.format(column_signature.name, self.index, self.name) + self._logger.error(log_message) + raise SignatureError(log_message) + + self.count += column_signature.count + + if column_signature.serial_type in self.column_signatures: + + if isinstance(column_signature, ColumnFixedLengthSignature): + updated_column_signature = self.column_signatures[column_signature.serial_type] + updated_column_signature.update(column_signature.serial_type, column_signature.count) + + elif isinstance(column_signature, ColumnVariableLengthSignature): + updated_column_signature = self.column_signatures[column_signature.serial_type] + updated_column_signature.update(column_signature.serial_type, column_signature.count, + column_signature.variable_length_serial_types) + + else: + log_message = "Invalid column signature type: {} found for table column signature with index: {} " \ + "and name: {}." + log_message = log_message.format(type(column_signature), self.index, self.name) + self._logger.error(log_message) + raise SignatureError(log_message) + + else: + + if isinstance(column_signature, ColumnFixedLengthSignature): + new_column_signature = ColumnFixedLengthSignature(index, column_signature.name, + column_signature.serial_type, + column_signature.count) + self.column_signatures[column_signature.serial_type] = new_column_signature + + elif isinstance(column_signature, ColumnVariableLengthSignature): + new_column_signature = ColumnReducedVariableLengthSignature(index, column_signature.name, + column_signature.serial_type, + column_signature.count, + column_signature. + variable_length_serial_types) + self.column_signatures[column_signature.serial_type] = new_column_signature + + else: + log_message = "Invalid column signature type: {} found for table column signature with index: {} " \ + "and name: {}." 
+ log_message = log_message.format(type(column_signature), self.index, self.name) + self._logger.error(log_message) + raise SignatureError(log_message) + + for column_signature_index, column_signature in self.column_signatures.iteritems(): + column_signature.number_of_rows = self.count + + def __repr__(self): + return self.__str__().encode("hex") + + def __str__(self): + return sub("\t", "", sub("\n", " ", self.stringify())) + + def stringify(self, padding="", print_column_signatures=True): + string = padding + "Index: {}\n" \ + + padding + "Name: {}\n" \ + + padding + "Count: {}\n" \ + + padding + "Focused Signature: {}\n" \ + + padding + "Simple Signature: {}\n" \ + + padding + "Column Signature Length: {}" + string = string.format(self.index, + self.name, + self.count, + self.focused_signature, + self.simplified_signature, + len(self.column_signatures)) + if print_column_signatures: + for column_signature_index, column_signature in self.column_signatures.iteritems(): + string += "\n" + padding + "Column Signature:\n{}".format(column_signature.stringify(padding + "\t")) + return string + + @property + def focused_probabilistic_signature(self): + focused_signatures = [] + for column_signature_index, column_signature in self.column_signatures.iteritems(): + if isinstance(column_signature, ColumnVariableLengthSignature): + for serial_type in column_signature.variable_length_serial_types: + serial_type_probability = column_signature.get_variable_length_serial_type_probability(serial_type) + focused_signatures.append((serial_type, serial_type_probability)) + elif isinstance(column_signature, ColumnFixedLengthSignature): + focused_signatures.append((column_signature.serial_type, column_signature.probability)) + else: + log_message = "Invalid column signature type: {} found for table column signature with index: {} " \ + "and name: {}." + log_message = log_message.format(type(column_signature), self.index, self.name) + self._logger.error(log_message) + raise ValueError(log_message) + return sorted(focused_signatures, key=lambda x: x[0]) + + @property + def focused_signature(self): + focused_signatures = [] + for column_signature_index, column_signature in self.column_signatures.iteritems(): + if isinstance(column_signature, ColumnVariableLengthSignature): + focused_signatures.extend(column_signature.variable_length_serial_types.keys()) + elif isinstance(column_signature, ColumnFixedLengthSignature): + focused_signatures.append(column_signature.serial_type) + else: + log_message = "Invalid column signature type: {} found for table column signature with index: {} " \ + "and name: {}." 
+ log_message = log_message.format(type(column_signature), self.index, self.name) + self._logger.error(log_message) + raise ValueError(log_message) + return sorted(focused_signatures, key=int) + + @property + def simplified_probabilistic_signature(self): + simplified_signatures = [] + for column_signature_index, column_signature in self.column_signatures.iteritems(): + simplified_signatures.append((column_signature.serial_type, column_signature.probability)) + return sorted(simplified_signatures, key=lambda x: x[0]) + + @property + def simplified_signature(self): + simplified_signatures = [] + for column_signature_index, column_signature in self.column_signatures.iteritems(): + simplified_signatures.append(column_signature.serial_type) + return sorted(simplified_signatures, key=int) + + +class TableRowSignature(object): + + """ + + TableRowSignature + + This class represents a signature of a particular row in a table. The idea is that each table has similar rows + in respect to their serial type ordering (storage classes and type affinities). A array is made of these + representing all signatures in a table and then can be inverted to represent the column signatures of a table. + + Note: The number of columns in a table row signature may be equal to or less than the number of column definitions + since columns can be added over time. However, columns cannot be removed or renamed in SQLite. + + Note: ColumnFixedLengthSignature column signatures will always have a probability of 1 in table row signatures, + since this is identifying a unique combination of column signatures (serial types). The + ColumnVariableLengthSignature column signatures will have a similar probability of 1 in reference to TEXT + and BLOB storage classes but may differ in the variable lengths themselves. Due to this, there is no + probabilistic signatures for table row signatures as there are in table column signatures. + + """ + + def __init__(self, column_definitions, record): + + """ + + Constructor. + + Note: Table row signatures are determined from the record serial type signature. Rows with the same serial + type signature for records will be grouped into individual table row signatures and "counted". + + Note: The column definitions array and the record columns in the record are relative to each other in terms + of order since the column definitions are pulled from the master schema. + + :param column_definitions: + :param record: + + :return: + + """ + + self._logger = getLogger(LOGGER_NAME) + + # Get the record columns + record_columns = record.record_columns + + self.count = 1 + self.column_signatures = {} + self.record_serial_type_signature = record.serial_type_signature + + """ + + Below we check to make sure the number of record column for this table row signature are less than or equal to + the number of column definitions. Since columns can be added, but not removed or renamed, the number of record + columns can be less than the number of column definitions. However, added columns are always appended to the + table and therefore the column definitions will align up to the number of record columns that are found. + + We raise an exception if we find that the number of record columns is greater than the number of column + definitions. If we find that the record columns is less than the number of column definitions, we print + a debug message. 
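+
+        Note: For reference, the -1/-2 placeholders used in the serial type signatures reduce the variable
+              length serial types as follows (a minimal sketch; the helper name is hypothetical and serial
+              types 10 and 11 are reserved and never expected here):
+
+                  def reduce_serial_type(serial_type):
+                      if 0 <= serial_type <= 9:
+                          return serial_type    # fixed-length storage classes are kept as-is
+                      if serial_type >= 12 and serial_type % 2 == 0:
+                          return -1             # BLOB of length (serial_type - 12) / 2 bytes
+                      if serial_type >= 13 and serial_type % 2 == 1:
+                          return -2             # TEXT of length (serial_type - 13) / 2 bytes
+                      raise ValueError("invalid serial type: %s" % serial_type)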
+ + """ + + # Check the length of the column definitions to the record columns + if len(column_definitions) != len(record_columns): + + # Check if the column definitions is less than the number of record columns + if len(column_definitions) < len(record_columns): + log_message = "The length of column definitions: {} is less than the record column length: {} " \ + "for table row signature with record serial type signature: {}." + log_message = log_message.format(len(column_definitions), len(record_columns), + self.record_serial_type_signature) + self._logger.error(log_message) + raise ValueError(log_message) + + # The number of column definitions is greater than the number of record columns + else: + log_message = "The length of column definitions: {} is greater than the record column length: {} " \ + "for table row signature with record serial type signature: {}." + log_message = log_message.format(len(column_definitions), len(record_columns), + self.record_serial_type_signature) + self._logger.debug(log_message) + + """ + + Note: The count is the number of specific rows that were found with this serial type whereas the number of + rows is the total of the rows in the table this column signature is being derived from. Therefore, + the probability of this column signature with this serial type occurring in the particular column of + the table is the count/total. + + """ + + self._number_of_rows = None + + for index in range(len(record_columns)): + + column_name = column_definitions[index].column_name + serial_type = record_columns[index].serial_type + + if 0 <= serial_type <= 9: + self.column_signatures[index] = ColumnFixedLengthSignature(index, column_name, serial_type) + elif serial_type >= 12: + self.column_signatures[index] = ColumnNonReducedVariableLengthSignature(index, column_name, serial_type) + else: + log_message = "Invalid serial type: {} for table row signature with record serial type signature: {}." 
+ log_message = log_message.format(serial_type, self.record_serial_type_signature) + self._logger.error(log_message) + raise SignatureError(log_message) + + def __repr__(self): + return self.__str__().encode("hex") + + def __str__(self): + return sub("\t", "", sub("\n", " ", self.stringify())) + + def stringify(self, padding="", print_column_signatures=True): + string = padding + "Record Serial Type Signature: {}\n" \ + + padding + "Count: {}\n" \ + + padding + "Number of Rows: {}\n" \ + + padding + "Probability: {}\n" \ + + padding + "Focused Signature: {}\n" \ + + padding + "Simple Signature: {}\n" \ + + padding + "Column Signature Length: {}" + string = string.format(self.record_serial_type_signature, + self.count, + self.number_of_rows, + self.probability, + self.focused_signature, + self.simplified_signature, + len(self.column_signatures)) + if print_column_signatures: + for column_signature_index, column_signature in self.column_signatures.iteritems(): + string += "\n" + padding + "Column Signature:\n{}".format(column_signature.stringify(padding + "\t")) + return string + + @property + def focused_signature(self): + focused_signatures = [] + for column_signature_index, column_signature in self.column_signatures.iteritems(): + if isinstance(column_signature, ColumnVariableLengthSignature): + focused_signatures.append(sorted(column_signature.variable_length_serial_types.keys(), key=int)) + elif isinstance(column_signature, ColumnFixedLengthSignature): + focused_signatures.append([column_signature.serial_type]) + else: + log_message = "Invalid column signature type: {} found for table row signature with record serial " \ + "type signature: {}." + log_message = log_message.format(type(column_signature), self.record_serial_type_signature) + self._logger.error(log_message) + raise ValueError(log_message) + return focused_signatures + + @property + def number_of_rows(self): + + """ + + + + Note: A value of None will be returned if the number of rows is not set. + + :return: + + """ + + return self._number_of_rows + + @number_of_rows.setter + def number_of_rows(self, number_of_rows): + + if number_of_rows <= 0 or number_of_rows < self.count: + log_message = "Invalid number of rows: {} for table row signature with record serial type signature: {}." + log_message = log_message.format(number_of_rows, self.record_serial_type_signature) + self._logger.error(log_message) + raise ValueError(log_message) + + self._number_of_rows = number_of_rows + + for column_signature_index, column_signature in self.column_signatures.iteritems(): + column_signature.number_of_rows = number_of_rows + + @property + def probability(self): + + """ + + + + Note: A value of None will be returned if the number of rows is not set. + + :return: + + """ + + if self._number_of_rows: + return float(self.count) / self._number_of_rows + return None + + @property + def simplified_signature(self): + simplified_signatures = [] + for column_signature_index, column_signature in self.column_signatures.iteritems(): + simplified_signatures.append([column_signature.serial_type]) + return simplified_signatures + + def update(self, record): + + self.count += 1 + + record_columns = record.record_columns + + # Check the length of each (we assume the order in relative to each other is the same) + if len(self.column_signatures) != len(record_columns): + log_message = "The length of column signatures: {} does not match record column length from record: {} " \ + "for table row signature with record serial type signature: {}." 
+ log_message = log_message.format(len(self.column_signatures), len(record_columns), + self.record_serial_type_signature) + self._logger.error(log_message) + raise ValueError(log_message) + + for index in self.column_signatures: + + serial_type = record_columns[index].serial_type + column_signature = self.column_signatures[index] + + if isinstance(column_signature, ColumnFixedLengthSignature): + + if column_signature.serial_type != serial_type: + log_message = "Column signature serial type: {} does not match record serial type: {} " \ + "for table row signature with record serial type signature: {}." + log_message = log_message.format(column_signature.serial_type, serial_type, + self.record_serial_type_signature) + self._logger.error(log_message) + raise SignatureError(log_message) + + column_signature.update(serial_type) + + elif isinstance(column_signature, ColumnVariableLengthSignature): + + if serial_type >= 12 and serial_type % 2 == 0: + if column_signature.serial_type != -1: + log_message = "Column signature serial type: {} does not equate to record column variable " \ + "length serial type: {} for table row signature with record serial " \ + "type signature: {}." + log_message = log_message.format(column_signature.serial_type, serial_type, + self.record_serial_type_signature) + self._logger.error(log_message) + raise SignatureError(log_message) + + elif serial_type >= 13 and serial_type % 2 == 1: + if column_signature.serial_type != -2: + log_message = "Column signature serial type: {} does not equate to record column variable " \ + "length serial type: {} for table row signature with record serial " \ + "type signature: {}." + log_message = log_message.format(column_signature.serial_type, serial_type, + self.record_serial_type_signature) + self._logger.error(log_message) + raise SignatureError(log_message) + + else: + log_message = "Invalid serial type: {} for column variable length signature " \ + "for table row signature with record serial type signature: {}." + log_message = log_message.format(serial_type, self.record_serial_type_signature) + self._logger.error(log_message) + raise SignatureError(log_message) + + column_signature.update(serial_type) + + else: + + log_message = "Invalid column signature type: {} found for table row signature with record serial " \ + "type signature: {}." + log_message = log_message.format(type(column_signature), self.record_serial_type_signature) + self._logger.error(log_message) + raise SignatureError(log_message) + + +class ColumnSignature(object): + + __metaclass__ = ABCMeta + + def __init__(self, index, name, serial_type, count=1): + + """ + + Constructor. + + Note: All columns within a signature may have different counts. This is due to the fact that columns can + be added in SQLite. If this occurs then columns towards the end of the rows may have less entries + (if any) than previous column counts. + + :param index: + :param name: + :param serial_type: + :param count: + + """ + + self._logger = getLogger(LOGGER_NAME) + + self.index = index + self.name = name + self.serial_type = serial_type + self.count = count + + """ + + Note: The count is the number of specific rows that were found with this serial type whereas the number of + rows is the total of the rows in the table this column signature is being derived from. Therefore, + the probability of this column signature with this serial type occurring in the particular column of + the table is the count/total. 
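+
+        As a quick illustration (hypothetical numbers): a serial type seen in 3 of the 12 unique rows of a
+        table yields count = 3 and, once number_of_rows is set to 12 by the owning signature, a probability
+        of float(3) / 12 == 0.25.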
+ + """ + + self._number_of_rows = None + + # These values are reserved and should not be found in SQLite files + if self.serial_type == 10 or self.serial_type == 11: + log_message = "Invalid serial type: {} found for column signature index: {} and name: {}." + log_message = log_message.format(self.serial_type, self.index, self.name) + self._logger.error(log_message) + raise ValueError(log_message) + + def __repr__(self): + return self.__str__().encode("hex") + + def __str__(self): + return sub("\t", "", sub("\n", " ", self.stringify())) + + def stringify(self, padding=""): + string = padding + "Index: {}\n" \ + + padding + "Name: {}\n" \ + + padding + "Serial Type: {}\n" \ + + padding + "Count: {}\n" \ + + padding + "Number of Rows: {}\n" \ + + padding + "Probability: {}" + return string.format(self.index, + self.name, + self.serial_type, + self.count, + self.number_of_rows, + self.probability) + + @property + def number_of_rows(self): + + """ + + + + Note: A value of None will be returned if the number of rows is not set. + + :return: + + """ + + return self._number_of_rows + + @number_of_rows.setter + def number_of_rows(self, number_of_rows): + if number_of_rows <= 0 or number_of_rows < self.count: + log_message = "Invalid number of rows: {} for column signature index: {} and name: {}" + log_message = log_message.format(number_of_rows, self.index, self.name) + self._logger.error(log_message) + raise ValueError(log_message) + self._number_of_rows = number_of_rows + + @property + def probability(self): + + """ + + + + Note: A value of None will be returned if the number of rows is not set. + + :return: + + """ + + if self._number_of_rows: + return float(self.count) / self._number_of_rows + return None + + @abstractmethod + def update(self, serial_type, count=None, variable_length_serial_types=None): + raise NotImplementedError("The abstract method update was called directly and is not implemented.") + + +class ColumnFixedLengthSignature(ColumnSignature): + + def __init__(self, index, name, serial_type, count=1): + + super(ColumnFixedLengthSignature, self).__init__(index, name, serial_type, count) + + if serial_type not in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]: + log_message = "Invalid serial type for column fixed-length signature index: {} and name: {}" + log_message = log_message.format(serial_type, self.index, self.name) + self._logger.error(log_message) + raise ValueError(log_message) + + self.content_size = get_content_size(self.serial_type) + + def stringify(self, padding=""): + string = "\n" + padding + "Content Size: {}" + string = string.format(self.content_size) + return super(ColumnFixedLengthSignature, self).stringify(padding) + string + + def update(self, serial_type, count=1, variable_length_serial_types=None): + + if serial_type != self.serial_type: + log_message = "Specified serial type: {} does not match column fixed-length signature serial type: {} " \ + "index: {} and name: {}" + log_message = log_message.format(serial_type, self.serial_type, self.index, self.name) + self._logger.error(log_message) + raise ValueError(log_message) + + if variable_length_serial_types: + log_message = "Variable length serial types: {} specified for column fixed-length signature " \ + "index: {} and name: {}" + log_message = log_message.format(variable_length_serial_types, self.index, self.name) + self._logger.error(log_message) + raise ValueError(log_message) + + self.count += count + + +class ColumnVariableLengthSignature(ColumnSignature): + + __metaclass__ = ABCMeta + + def __init__(self, index, 
name, serial_type, count=1): + + super(ColumnVariableLengthSignature, self).__init__(index, name, serial_type, count) + + """ + + Note: The variable length serial types is a dictionary where: + variable_length_serial_types[variable length serial type] = count of variable length serial type in column + + """ + + self.variable_length_serial_types = None + + def stringify(self, padding=""): + string = "\n" + padding + "Variable Length Serial Types: {}" + string = string.format(self.variable_length_serial_types) + return super(ColumnVariableLengthSignature, self).stringify(padding) + string + + def get_variable_length_serial_type_probability(self, variable_length_serial_type): + + """ + + + + Note: A value of None will be returned if the number of rows is not set. + + :param variable_length_serial_type: + + :return: + + """ + + if self._number_of_rows: + return float(self.variable_length_serial_types[variable_length_serial_type]) / self._number_of_rows + return None + + +class ColumnReducedVariableLengthSignature(ColumnVariableLengthSignature): + + """ + + ColumnReducedVariableLengthSignature + + + + Note: This class is used where the serial types for variable length signatures are reduced and therefore + are either -1 (for BLOB) or -2 (for TEXT). + + """ + + def __init__(self, index, name, serial_type, count, variable_length_serial_types): + + if serial_type not in [-2, -1]: + log_message = "Invalid serial type: {} for column reduced variable length signature index: {} and name: {}" + log_message = log_message.format(serial_type, self.index, self.name) + self._logger.error(log_message) + raise ValueError(log_message) + + if not count: + log_message = "Count not specified for column reduced variable length signature index: {} and name: {} " \ + "for serial type: {} and variable length serial types: {}." + log_message = log_message.format(index, name, serial_type, variable_length_serial_types) + self._logger.error(log_message) + raise ValueError(log_message) + + if not variable_length_serial_types: + log_message = "Variable length serial types not specified for column reduced variable length signature " \ + "index: {} and name: {} for serial type: {} and count: {}." + log_message = log_message.format(index, name, serial_type, count) + self._logger.error(log_message) + raise ValueError(log_message) + + super(ColumnReducedVariableLengthSignature, self).__init__(index, name, serial_type, count) + + self.variable_length_serial_types = variable_length_serial_types + + def update(self, serial_type, count=None, variable_length_serial_types=None): + + if serial_type != self.serial_type: + log_message = "Specified serial type: {} does not match column reduced variable length signature serial " \ + "type: {} index: {} and name: {}" + log_message = log_message.format(serial_type, self.serial_type, self.index, self.name) + self._logger.error(log_message) + raise ValueError(log_message) + + if not count: + log_message = "Count not specified for column reduced variable length signature index: {} and name: {} " \ + "for serial type: {} and variable length serial types: {}." + log_message = log_message.format(self.index, self.name, serial_type, variable_length_serial_types) + self._logger.error(log_message) + raise ValueError(log_message) + + if not variable_length_serial_types: + log_message = "Variable length serial types not specified for column reduced variable length signature " \ + "index: {} and name: {} for serial type: {} and count: {}." 
+ log_message = log_message.format(self.index, self.name, serial_type, count) + self._logger.error(log_message) + raise ValueError(log_message) + + self.count += count + + for variable_length_serial_type, variable_length_serial_type_count in variable_length_serial_types.iteritems(): + if variable_length_serial_type in self.variable_length_serial_types: + self.variable_length_serial_types[variable_length_serial_type] += variable_length_serial_type_count + else: + self.variable_length_serial_types[variable_length_serial_type] = variable_length_serial_type_count + + +class ColumnNonReducedVariableLengthSignature(ColumnVariableLengthSignature): + + """ + + ColumnNonReducedVariableLengthSignature + + + + Note: This class is used where the serial types for variable length signatures are not reduced and therefore + are greater or equal to 12. + + """ + + def __init__(self, index, name, serial_type): + + if serial_type < 12: + log_message = "Invalid serial type: {} for column non-reduced variable length signature index: {} " \ + "and name: {}" + log_message = log_message.format(serial_type, self.index, self.name) + self._logger.error(log_message) + raise ValueError(log_message) + + super(ColumnNonReducedVariableLengthSignature, self).__init__(index, name, serial_type) + + self.variable_length_serial_types = {} + + # A BLOB that is (N-12)/2 bytes in length + if self.serial_type >= 12 and self.serial_type % 2 == 0: + self.variable_length_serial_types[self.serial_type] = 1 + self.serial_type = -1 + + # A string in the database encoding and is (N-13)/2 bytes in length (The nul terminator is omitted) + elif self.serial_type >= 13 and self.serial_type % 2 == 1: + self.variable_length_serial_types[self.serial_type] = 1 + self.serial_type = -2 + + else: + log_message = "Invalid serial type: {} for column non-reduced variable length signature index: {} and " \ + "name: {}" + log_message = log_message.format(serial_type, self.index, self.name) + self._logger.error(log_message) + raise ValueError(log_message) + + def update(self, serial_type, count=None, variable_length_serial_types=None): + + if serial_type < 12: + log_message = "Invalid serial type: {} for column non-reduced variable length signature index: {} " \ + "and name: {}" + log_message = log_message.format(serial_type, self.index, self.name) + self._logger.error(log_message) + raise ValueError(log_message) + + if count: + log_message = "Count specified for column non-reduced variable length signature index: {} and name: {} " \ + "for serial type: {} and variable length serial types: {}." + log_message = log_message.format(self.index, self.name, serial_type, variable_length_serial_types) + self._logger.error(log_message) + raise ValueError(log_message) + + if variable_length_serial_types: + log_message = "Variable length serial types specified for column non-reduced variable length signature " \ + "index: {} and name: {} for serial type: {} and count: {}." 
+ log_message = log_message.format(self.index, self.name, serial_type, count) + self._logger.error(log_message) + raise ValueError(log_message) + + self.count += 1 + + # A BLOB that is (N-12)/2 bytes in length + if serial_type >= 12 and serial_type % 2 == 0: + + if self.serial_type != -1: + log_message = "Specified serial type: {} does not equate to column non-reduced variable length " \ + "signature serial type: {} index: {} and name: {}" + log_message = log_message.format(serial_type, self.serial_type, self.index, self.name) + self._logger.error(log_message) + raise ValueError(log_message) + + # A string in the database encoding and is (N-13)/2 bytes in length (The nul terminator is omitted) + elif serial_type >= 13 and serial_type % 2 == 1: + + if self.serial_type != -2: + log_message = "Specified serial type: {} does not equate to column non-reduced variable length " \ + "signature serial type: {} index: {} and name: {}" + log_message = log_message.format(serial_type, self.serial_type, self.index, self.name) + self._logger.error(log_message) + raise ValueError(log_message) + + else: + + log_message = "Invalid serial type: {} for column non-reduced variable length signature index: {} and " \ + "name: {}" + log_message = log_message.format(serial_type, self.index, self.name) + self._logger.error(log_message) + raise ValueError(log_message) + + if serial_type in self.variable_length_serial_types: + self.variable_length_serial_types[serial_type] += 1 + else: + self.variable_length_serial_types[serial_type] = 1 diff --git a/sqlite_dissect/carving/utilities.py b/sqlite_dissect/carving/utilities.py new file mode 100644 index 0000000..78a3481 --- /dev/null +++ b/sqlite_dissect/carving/utilities.py @@ -0,0 +1,387 @@ +from binascii import hexlify +from binascii import unhexlify +from logging import getLogger +from sqlite_dissect.constants import BLOB_SIGNATURE_IDENTIFIER +from sqlite_dissect.constants import LOGGER_NAME +from sqlite_dissect.constants import TEXT_SIGNATURE_IDENTIFIER +from sqlite_dissect.exception import CarvingError +from sqlite_dissect.exception import InvalidVarIntError +from sqlite_dissect.utilities import decode_varint + +""" + +utilities.py + +This script holds carving utility functions for reference by the SQLite carving module. + +This script holds the following function(s): +decode_varint_in_reverse(byte_array, offset) +calculate_body_content_size(serial_type_header) +calculate_serial_type_definition_content_length_min_max(simplified_serial_types, allowed_varint_length=5) +calculate_serial_type_varint_length_min_max(simplified_serial_types) +generate_regex_for_simplified_serial_type(simplified_serial_type) +generate_signature_regex(signature, skip_first_serial_type=False) +get_content_size(serial_type) + +""" + + +def decode_varint_in_reverse(byte_array, offset, max_varint_length=9): + + """ + + This function will move backwards through a byte array trying to decode a varint in reverse. A InvalidVarIntError + will be raised if a varint is not found by this algorithm used in this function. The calling logic should check + for this case in case it is encountered which is likely in the context of carving. + + Note: This cannot determine if the field being parsed was originally a varint or not and may give false positives. + Please keep this in mind when calling this function. + + Note: If the array runs out of bytes while parsing in reverse, the currently determined varint will be returned. 
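+
+    Note: As a purely hypothetical example, calling this function on the byte array b'\x03\x87\x68' with an
+          offset of 3 would consume the bytes \x87 and \x68 in reverse (stopping at \x03, whose most
+          significant bit is not set) and return the tuple (1000, 1): the decoded varint value 1000 and the
+          relative offset 1 at which that varint begins.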
+ + Note: Since the parsing starts from the left of the offset specified, the resulting byte string that represents + this varint can be determined by byte_array[varint_relative_offset:offset]. The length of the varint + in bytes can be determined likewise either from the len() of the above or offset - varint_relative_offset. + + :param byte_array: bytearray The byte array to parse for the varint in reverse. + :param offset: int The offset to move backwards from. The offset specified is not included in the parsing and the + algorithm starts with the last byte of the varint at offset - 1. If you want to start at the + end of the byte array then the offset should be the length of the byte array (where the offset + would refer to a non-existing index in the array). + :param max_varint_length: int The maximum number of varint bytes to go back in reverse. The default is 9 since + this is the maximum number of bytes a varint can be. + + :return: + + :raise: InvalidVarIntError: If a varint is not determined while parsing the byte array in reverse using the + algorithm in this function. This error is not logged as an error but rather a + debug statement since it is very likely to occur during carving and should be handled + appropriately. + + """ + + if offset > len(byte_array): + log_message = "The offset: {} is greater than the size of the byte array: {} for the bytes: {}." + log_message = log_message.format(offset, len(byte_array), hexlify(byte_array)) + getLogger(LOGGER_NAME).error(log_message) + raise ValueError(log_message) + + unsigned_integer_value = 0 + varint_inverted_relative_offset = 0 + + varint_byte = ord(byte_array[offset - 1 - varint_inverted_relative_offset:offset - varint_inverted_relative_offset]) + varint_byte &= 0x7f + unsigned_integer_value |= varint_byte + varint_inverted_relative_offset += 1 + + while offset - varint_inverted_relative_offset - 1 >= 0: + + if varint_inverted_relative_offset > max_varint_length: + + """ + + Since this exception is not considered a important exception to log as an error, it will be logged + as a debug statement. There is a good chance of this use case occurring and is even expected during + carving. + + """ + + log_message = "A varint was not determined from byte array: {} starting at offset: {} in reverse." + log_message = log_message.format(byte_array, offset) + getLogger(LOGGER_NAME).debug(log_message) + return InvalidVarIntError(log_message) + + varint_byte = ord(byte_array[offset - 1 - varint_inverted_relative_offset: + offset - varint_inverted_relative_offset]) + msb_set = varint_byte & 0x80 + if msb_set: + varint_byte &= 0x7f + varint_byte <<= (7 * varint_inverted_relative_offset) + unsigned_integer_value |= varint_byte + varint_inverted_relative_offset += 1 + else: + break + + varint_relative_offset = offset - varint_inverted_relative_offset + + return unsigned_integer_value, varint_relative_offset + + +def calculate_body_content_size(serial_type_header): + body_content_size = 0 + start_offset = 0 + while start_offset < len(serial_type_header): + serial_type, serial_type_varint_length = decode_varint(serial_type_header, start_offset) + body_content_size += get_content_size(serial_type) + start_offset += serial_type_varint_length + if start_offset > len(serial_type_header): + log_message = "Invalid start offset: {} retrieved from serial type header of length: {}: {}." 
+ log_message = log_message.format(start_offset, len(serial_type_header), hexlify(serial_type_header)) + getLogger(LOGGER_NAME).error(log_message) + raise CarvingError(log_message) + return body_content_size + + +def calculate_serial_type_definition_content_length_min_max(simplified_serial_types=None, allowed_varint_length=5): + + content_max_length = int('1111111' * allowed_varint_length, 2) + + if not simplified_serial_types: + return 0, content_max_length + + serial_type_definition_content_length_min = content_max_length + serial_type_definition_content_length_max = 0 + + for simplified_serial_type in simplified_serial_types: + if simplified_serial_type in [BLOB_SIGNATURE_IDENTIFIER, TEXT_SIGNATURE_IDENTIFIER]: + serial_type_definition_content_length_min = min(serial_type_definition_content_length_min, 1) + serial_type_definition_content_length_max = max(serial_type_definition_content_length_max, + content_max_length) + else: + serial_type_content_length = get_content_size(simplified_serial_type) + serial_type_definition_content_length_min = min(serial_type_definition_content_length_min, + serial_type_content_length) + serial_type_definition_content_length_max = max(serial_type_definition_content_length_max, + serial_type_content_length) + + return serial_type_definition_content_length_min, serial_type_definition_content_length_max + + +def calculate_serial_type_varint_length_min_max(simplified_serial_types): + + serial_type_varint_length_min = 5 + serial_type_varint_length_max = 1 + + for simplified_serial_type in simplified_serial_types: + + if simplified_serial_type in [BLOB_SIGNATURE_IDENTIFIER, TEXT_SIGNATURE_IDENTIFIER]: + serial_type_varint_length_min = min(serial_type_varint_length_min, 1) + serial_type_varint_length_max = min(serial_type_varint_length_max, 5) + else: + serial_type_varint_length_min = min(serial_type_varint_length_min, 1) + serial_type_varint_length_max = min(serial_type_varint_length_max, 1) + + return serial_type_varint_length_min, serial_type_varint_length_max + + +def generate_regex_for_simplified_serial_type(simplified_serial_type): + + """ + + + + Note: Right now 9 byte varints are not supported in the regular expressions generated for blob and text storage + classes. + + :param simplified_serial_type: + + :return: + + """ + + if simplified_serial_type == -2: + return "(?:[\x0C-\x7F]|[\x80-\xFF]{1,7}[\x00-\x7F])" + elif simplified_serial_type == -1: + return "(?:[\x0D-\x7F]|[\x80-\xFF]{1,7}[\x00-\x7F])" + elif 0 <= simplified_serial_type <= 9: + return unhexlify("0{}".format(simplified_serial_type)) + else: + log_message = "Unable to generate regular expression for simplified serial type: {}." + log_message = log_message.format(simplified_serial_type) + getLogger(LOGGER_NAME).error(log_message) + raise CarvingError(log_message) + + +def generate_signature_regex(signature, skip_first_serial_type=False): + + """ + + This function will generate the regular expression for a particular signature sent in derived from a Signature + class. For instance, the signature should be in list form as the simplified signature, simplified schema + signature, etc. + + The skip first serial type field will omit the first serial type from the regular expression. This is to better + support carving of freeblocks since the first 4 bytes are overwritten of the entry and this could contain the first + serial type byte in the header as the fourth byte. Leaving this out will provide better accuracy for determining + deleted entries in freeblocks. 
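+
+    As a purely hypothetical illustration, a simplified signature of [[1], [0, 1, 2]] (a first column that is
+    always an 8-bit integer followed by a second column that may be NULL, an 8-bit integer, or a big-endian
+    16-bit integer) would produce the regular expression "\x01" followed by "[\x00\x01\x02]".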
+ + Note: There may be issues if there is only one field either in the signature or left in the signature after the + first serial type is skipped, if specified. + + Note: There is also the case of the first serial type being a varint which needs to be addressed. + + :param signature: + :param skip_first_serial_type: + + :return: + + """ + + regex = "" + + if skip_first_serial_type: + signature = signature[1:] + + for column_serial_type_array in signature: + + number_of_possible_serial_types = len(column_serial_type_array) + + if number_of_possible_serial_types == 1: + + serial_type = column_serial_type_array[0] + serial_type_regex = generate_regex_for_simplified_serial_type(serial_type) + regex += serial_type_regex + + elif 1 < number_of_possible_serial_types < 13: + + """ + + The maximum number of possible serial types are in the range of 1 to 12. Since the case of just + a single serial type is handled above, this portion accounts for possible serial types of more than + 1 field up to 12. These can be the following 12 serial type fields: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -2. + + """ + + basic_serial_type_regex = "" + blob_regex = "" + text_regex = "" + + for column_serial_type in column_serial_type_array: + if column_serial_type == -1: + blob_regex = generate_regex_for_simplified_serial_type(column_serial_type) + elif column_serial_type == -2: + text_regex = generate_regex_for_simplified_serial_type(column_serial_type) + else: + basic_serial_type_regex += generate_regex_for_simplified_serial_type(column_serial_type) + + if blob_regex or text_regex: + + if basic_serial_type_regex: + basic_serial_type_regex = "[{}]".format(basic_serial_type_regex) + + if blob_regex and not text_regex: + + if not basic_serial_type_regex: + log_message = "No basic serial type regular expression found when multiple column serial " \ + "types were defined with a blob regular expression of: {} and no text regular " \ + "expression in the signature: {} where the skip first serial type was set to: {}." + log_message = log_message.format(blob_regex, signature, skip_first_serial_type) + getLogger(LOGGER_NAME).error(log_message) + raise CarvingError(log_message) + + regex += "(?:{}|{})".format(basic_serial_type_regex, blob_regex) + + elif not blob_regex and text_regex: + + if not basic_serial_type_regex: + log_message = "No basic serial type regular expression found when multiple column serial " \ + "types were defined with no blob regular expression and a text regular " \ + "expression of: {} in the signature: {} where the skip first serial type " \ + "was set to: {}." + log_message = log_message.format(text_regex, signature, skip_first_serial_type) + getLogger(LOGGER_NAME).error(log_message) + raise CarvingError(log_message) + + regex += "(?:{}|{})".format(basic_serial_type_regex, text_regex) + + elif blob_regex and text_regex: + + var_length_regex = blob_regex + "|" + text_regex + if basic_serial_type_regex: + regex += "(?:{}|{})".format(basic_serial_type_regex, var_length_regex) + else: + regex += "(?:{})".format(var_length_regex) + + else: + log_message = "No appropriate regular expressions were found for basic serial type, blob, or " \ + "text column signature types in the signature: {} where the skip first serial type " \ + "was set to: {}." 
+ log_message = log_message.format(text_regex, signature, skip_first_serial_type) + getLogger(LOGGER_NAME).error(log_message) + raise CarvingError(log_message) + + else: + + """ + + Since a blob or text regex was not found, the signatures must only be basic serial types (which are + considered non-variable length serial types). + + """ + + if not basic_serial_type_regex: + log_message = "No basic serial type regular expression found when no variable length serial " \ + "types were determined in the signature: {} where the skip first serial type was " \ + "set to: {}." + log_message = log_message.format(signature, skip_first_serial_type) + getLogger(LOGGER_NAME).error(log_message) + raise CarvingError(log_message) + + regex += "[{}]".format(basic_serial_type_regex) + + else: + + log_message = "Invalid number of columns in the signature: {} to generate a regular expression from " \ + "where the skip first serial type was set to: {}." + log_message = log_message.format(signature, skip_first_serial_type) + getLogger(LOGGER_NAME).error(log_message) + raise CarvingError(log_message) + + return regex + + +def get_content_size(serial_type): + + # NULL + if serial_type == 0: + return 0 + + # 8-bit twos-complement integer + elif serial_type == 1: + return 1 + + # Big-endian 16-bit twos-complement integer + elif serial_type == 2: + return 2 + + # Big-endian 24-bit twos-complement integer + elif serial_type == 3: + return 3 + + # Big-endian 32-bit twos-complement integer + elif serial_type == 4: + return 4 + + # Big-endian 48-bit twos-complement integer + elif serial_type == 5: + return 6 + + # Big-endian 64-bit twos-complement integer + elif serial_type == 6: + return 8 + + # Big-endian IEEE 754-2008 64-bit floating point number + elif serial_type == 7: + return 8 + + # Integer constant 0 (schema format == 4) + elif serial_type == 8: + return 0 + + # Integer constant 1 (schema format == 4) + elif serial_type == 9: + return 0 + + # A BLOB that is (N-12)/2 bytes in length + elif serial_type >= 12 and serial_type % 2 == 0: + return (serial_type - 12) / 2 + + # A string in the database encoding and is (N-13)/2 bytes in length. The nul terminator is omitted + elif serial_type >= 13 and serial_type % 2 == 1: + return (serial_type - 13) / 2 + + else: + log_message = "Invalid serial type: {}." + log_message = log_message.format(serial_type) + getLogger(LOGGER_NAME).error(log_message) + raise ValueError(log_message) diff --git a/sqlite_dissect/constants.py b/sqlite_dissect/constants.py new file mode 100644 index 0000000..e6f4a57 --- /dev/null +++ b/sqlite_dissect/constants.py @@ -0,0 +1,288 @@ +from collections import MutableMapping +from logging import getLogger +from re import compile +from sys import maxunicode + +""" + +constants.py + +This script holds constants defined for reference by the sqlite carving library. Additionally, a class has been +added to this script for constant enumerations. 
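+
+As a purely illustrative sketch of that enumeration class (the names below are hypothetical), an Enum can be
+built from either a list or a dictionary and its members read back by attribute or by key:
+
+    EXAMPLE = Enum(["FIRST", "SECOND"])
+    EXAMPLE.FIRST        # "FIRST"
+    EXAMPLE["SECOND"]    # "SECOND"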
+ +This script holds the following object(s): +Enum(MutableMapping) + +""" + + +LOGGER_NAME = "sqlite_dissect" + + +class Enum(MutableMapping): + + def __init__(self, data): + if isinstance(data, list): + self._store = {value: value for value in data} + elif isinstance(data, dict): + self._store = data + else: + log_message = "Unable to initialize enumeration for: {} with type: {}.".format(data, type(data)) + getLogger(LOGGER_NAME).error(log_message) + raise ValueError(log_message) + + def __getattr__(self, key): + return self._store[key] + + def __getitem__(self, key): + return self._store[key] + + def __setitem__(self, key, value): + self._store[key] = value + + def __delitem__(self, key): + del self._store[key] + + def __contains__(self, key): + return True if key in self._store else False + + def __iter__(self): + return iter(self._store) + + def __len__(self): + return len(self._store) + + +UTF_8 = "utf-8" +UTF_16BE = "utf-16-be" +UTF_16LE = "utf-16-le" + +ENDIANNESS = Enum(["BIG_ENDIAN", "LITTLE_ENDIAN"]) + +# Supported file types +FILE_TYPE = Enum(["DATABASE", "WAL", "WAL_INDEX", "ROLLBACK_JOURNAL"]) + +SQLITE_3_7_0_VERSION_NUMBER = 3007000 + +PAGE_TYPE_LENGTH = 1 + +MASTER_PAGE_HEX_ID = b'\x53' +TABLE_LEAF_PAGE_HEX_ID = b'\x0d' +TABLE_INTERIOR_PAGE_HEX_ID = b'\x05' +INDEX_LEAF_PAGE_HEX_ID = b'\x0a' +INDEX_INTERIOR_PAGE_HEX_ID = b'\x02' + +PAGE_TYPE = Enum(["LOCK_BYTE", "FREELIST_TRUNK", "FREELIST_LEAF", "B_TREE_TABLE_INTERIOR", "B_TREE_TABLE_LEAF", + "B_TREE_INDEX_INTERIOR", "B_TREE_INDEX_LEAF", "OVERFLOW", "POINTER_MAP"]) + +LOCK_BYTE_PAGE_START_OFFSET = 1073741824 +LOCK_BYTE_PAGE_END_OFFSET = 1073742336 + +SQLITE_DATABASE_HEADER_LENGTH = 100 +MAGIC_HEADER_STRING = "SQLite format 3\000" +MAGIC_HEADER_STRING_ENCODING = UTF_8 +MAXIMUM_PAGE_SIZE_INDICATOR = 1 +MINIMUM_PAGE_SIZE_LIMIT = 512 +MAXIMUM_PAGE_SIZE_LIMIT = 32768 +MAXIMUM_PAGE_SIZE = 65536 +ROLLBACK_JOURNALING_MODE = 1 +WAL_JOURNALING_MODE = 2 +MAXIMUM_EMBEDDED_PAYLOAD_FRACTION = 64 +MINIMUM_EMBEDDED_PAYLOAD_FRACTION = 32 +LEAF_PAYLOAD_FRACTION = 32 +VALID_SCHEMA_FORMATS = [1, 2, 3, 4] +UTF_8_DATABASE_TEXT_ENCODING = 1 +UTF_16LE_DATABASE_TEXT_ENCODING = 2 +UTF_16BE_DATABASE_TEXT_ENCODING = 3 +DATABASE_TEXT_ENCODINGS = [UTF_8_DATABASE_TEXT_ENCODING, + UTF_16LE_DATABASE_TEXT_ENCODING, + UTF_16BE_DATABASE_TEXT_ENCODING] +RESERVED_FOR_EXPANSION_REGEX = "^0{40}$" + +FREELIST_NEXT_TRUNK_PAGE_LENGTH = 4 +FREELIST_LEAF_PAGE_POINTERS_LENGTH = 4 +FREELIST_LEAF_PAGE_NUMBER_LENGTH = 4 +FREELIST_HEADER_LENGTH = FREELIST_NEXT_TRUNK_PAGE_LENGTH + FREELIST_LEAF_PAGE_POINTERS_LENGTH # ptr+num size +LEAF_PAGE_HEADER_LENGTH = 8 +INTERIOR_PAGE_HEADER_LENGTH = 12 +RIGHT_MOST_POINTER_OFFSET = 8 +RIGHT_MOST_POINTER_LENGTH = 4 +CELL_POINTER_BYTE_LENGTH = 2 +LEFT_CHILD_POINTER_BYTE_LENGTH = 4 +FREEBLOCK_HEADER_LENGTH = 4 +NEXT_FREEBLOCK_OFFSET_LENGTH = 2 +FREEBLOCK_BYTE_LENGTH = 2 +PAGE_FRAGMENT_LIMIT = 60 +FIRST_OVERFLOW_PAGE_NUMBER_LENGTH = 4 +OVERFLOW_HEADER_LENGTH = 4 # This is the next overflow page number but we call it a header here +POINTER_MAP_ENTRY_LENGTH = 5 + +PAGE_HEADER_MODULE = "sqlite_dissect.file.database.header" +PAGE_MODULE = "sqlite_dissect.file.database.page" +CELL_MODULE = "sqlite_dissect.file.database.page" + +INTERIOR_PAGE_HEADER_CLASS = "InteriorPageHeader" +LEAF_PAGE_HEADER_CLASS = "LeafPageHeader" + +INDEX_INTERIOR_PAGE_CLASS = "IndexInteriorPage" +INDEX_LEAF_PAGE_CLASS = "IndexLeafPage" +TABLE_INTERIOR_PAGE_CLASS = "TableInteriorPage" +TABLE_LEAF_PAGE_CLASS = "TableLeafPage" +INDEX_INTERIOR_CELL_CLASS = 
"IndexInteriorCell" +INDEX_LEAF_CELL_CLASS = "IndexLeafCell" +TABLE_INTERIOR_CELL_CLASS = "TableInteriorCell" +TABLE_LEAF_CELL_CLASS = "TableLeafCell" + +FIRST_OVERFLOW_PARENT_PAGE_NUMBER = 0 +FIRST_OVERFLOW_PAGE_INDEX = 0 +FIRST_FREELIST_TRUNK_PARENT_PAGE_NUMBER = 0 +FIRST_FREELIST_TRUNK_PAGE_INDEX = 0 + +CELL_LOCATION = Enum({"ALLOCATED_SPACE": "Allocated Space", + "UNALLOCATED_SPACE": "Unallocated Space", + "FREEBLOCK": "Freeblock"}) + +CELL_SOURCE = Enum({"B_TREE": "B-Tree", + "DISPARATE_B_TREE": "Disparate B-Tree", + "FREELIST": "Freelist"}) + +BLOB_SIGNATURE_IDENTIFIER = -1 +TEXT_SIGNATURE_IDENTIFIER = -2 + +ZERO_BYTE = b'\x00' +ALL_ZEROS_REGEX = "^0*$" + +SQLITE_MASTER_SCHEMA_ROOT_PAGE = 1 +MASTER_SCHEMA_COLUMN = Enum({"TYPE": 0, "NAME": 1, "TABLE_NAME": 2, "ROOT_PAGE": 3, "SQL": 4}) +MASTER_SCHEMA_ROW_TYPE = Enum({"TABLE": "table", "INDEX": "index", "VIEW": "view", "TRIGGER": "trigger"}) +MASTER_SCHEMA_NUMBER_OF_COLUMNS = 5 + +COLUMN_DEFINITION = Enum(["COLUMN_NAME", "DATA_TYPE_NAME", "COLUMN_CONSTRAINT"]) +STORAGE_CLASS = Enum(["NULL", "INTEGER", "REAL", "TEXT", "BLOB"]) +TYPE_AFFINITY = Enum(["TEXT", "NUMERIC", "INTEGER", "REAL", "BLOB"]) +DATA_TYPE = Enum(["INT", "INTEGER", "TINYINT", "SMALLINT", "MEDIUMINT", "BIGINT", + "UNSIGNED_BIG_INT", "INT2", "INT8", + "CHARACTER_20", "VARCHAR_255", "VARYING_CHARACTER_255", "NCHAR_55", + "NATIVE_CHARACTER_70", "NVARCHAR_100", "TEXT", "CLOB", + "BLOB", "NOT_SPECIFIED", + "REAL", "DOUBLE", "DOUBLE_PRECISION", "FLOAT", + "NUMERIC", "DECIMAL_10_5", "BOOLEAN", "DATE", "DATETIME", + "INVALID"]) + +CREATE_TABLE_CLAUSE = "CREATE TABLE" +ORDINARY_TABLE_AS_CLAUSE = "AS" +CREATE_VIRTUAL_TABLE_CLAUSE = "CREATE VIRTUAL TABLE" +VIRTUAL_TABLE_USING_CLAUSE = "USING" + +CREATE_INDEX_CLAUSE = "CREATE INDEX" +CREATE_UNIQUE_INDEX_CLAUSE = "CREATE UNIQUE INDEX" +INDEX_ON_COMMAND = "ON" +INDEX_WHERE_CLAUSE = "WHERE" + +INTERNAL_SCHEMA_OBJECT_PREFIX = "sqlite_" +INTERNAL_SCHEMA_OBJECT_INDEX_PREFIX = "sqlite_autoindex_" + +COLUMN_CONSTRAINT_TYPES = Enum(["PRIMARY_KEY", "NOT NULL", "UNIQUE", "CHECK", "DEFAULT", + "COLLATE", "FOREIGN_KEY"]) + +COLUMN_CONSTRAINT_PREFACES = ["CONSTRAINT", "PRIMARY", "NOT", "UNIQUE", "CHECK", "DEFAULT", "COLLATE", "REFERENCES"] +TABLE_CONSTRAINT_PREFACES = ["CONSTRAINT", "PRIMARY", "UNIQUE", "CHECK", "FOREIGN"] + +""" + +Note: For TABLE_CONSTRAINT_TYPE, the PRIMARY_KEY and UNIQUE should be handled the same in respect to this library. 
+ +""" + +TABLE_CONSTRAINT_TYPES = Enum(["PRIMARY_KEY", "UNIQUE", "CHECK", "FOREIGN_KEY"]) + +POINTER_MAP_B_TREE_ROOT_PAGE_TYPE = b'\x01' +POINTER_MAP_FREELIST_PAGE_TYPE = b'\x02' +POINTER_MAP_OVERFLOW_FIRST_PAGE_TYPE = b'\x03' +POINTER_MAP_OVERFLOW_FOLLOWING_PAGE_TYPE = b'\x04' +POINTER_MAP_B_TREE_NON_ROOT_PAGE_TYPE = b'\x05' +POINTER_MAP_PAGE_TYPES = [POINTER_MAP_B_TREE_ROOT_PAGE_TYPE, + POINTER_MAP_FREELIST_PAGE_TYPE, + POINTER_MAP_OVERFLOW_FIRST_PAGE_TYPE, + POINTER_MAP_OVERFLOW_FOLLOWING_PAGE_TYPE, + POINTER_MAP_B_TREE_NON_ROOT_PAGE_TYPE] + +WAL_FILE_POSTFIX = "-wal" +WAL_HEADER_LENGTH = 32 +WAL_MAGIC_NUMBER_BIG_ENDIAN = 0x377F0683 +WAL_MAGIC_NUMBER_LITTLE_ENDIAN = 0x377F0682 +WAL_FILE_FORMAT_VERSION = 3007000 +WAL_FRAME_HEADER_LENGTH = 24 + +WAL_INDEX_POSTFIX = "-shm" +WAL_INDEX_FILE_FORMAT_VERSION = 3007000 +WAL_INDEX_NUMBER_OF_SUB_HEADERS = 2 +WAL_INDEX_SUB_HEADER_LENGTH = 48 +WAL_INDEX_CHECKPOINT_INFO_LENGTH = 24 +WAL_INDEX_LOCK_RESERVED_LENGTH = 16 +WAL_INDEX_HEADER_LENGTH = WAL_INDEX_NUMBER_OF_SUB_HEADERS * WAL_INDEX_SUB_HEADER_LENGTH + \ + WAL_INDEX_CHECKPOINT_INFO_LENGTH + WAL_INDEX_LOCK_RESERVED_LENGTH +WAL_INDEX_NUMBER_OF_FRAMES_BACKFILLED_IN_DATABASE_LENGTH = 4 + +""" + +Note: The reader mark size is referred to as the Maximum xShmLock index (SQLITE_SHM_NLOCK) - 3 in the sqlite code. + +""" +WAL_INDEX_READER_MARK_SIZE = 5 +WAL_INDEX_READER_MARK_LENGTH = 4 + +ROLLBACK_JOURNAL_ALL_CONTENT_UNTIL_END_OF_FILE = -1 +ROLLBACK_JOURNAL_POSTFIX = "-journal" +ROLLBACK_JOURNAL_HEADER_LENGTH = 28 +ROLLBACK_JOURNAL_HEADER_HEX_STRING = 'd9d505f920a163d7' +ROLLBACK_JOURNAL_HEADER_ALL_CONTENT = 'ffffffff' + +BASE_VERSION_NUMBER = 0 +COMMIT_RECORD_BASE_VERSION_NUMBER = BASE_VERSION_NUMBER + 1 + +""" + +The DATABASE_HEADER_VERSIONED_FIELDS covers all fields that may change from database header to database header +throughout the write ahead log. This may not be a definitive list of fields that can change. + +""" +DATABASE_HEADER_VERSIONED_FIELDS = Enum({"FILE_CHANGE_COUNTER": "file_change_counter", + "VERSION_VALID_FOR_NUMBER": "version_valid_for_number", + "DATABASE_SIZE_IN_PAGES": "database_size_in_pages", + "FIRST_FREELIST_TRUNK_PAGE_NUMBER": "first_freelist_trunk_page_number", + "NUMBER_OF_FREE_LIST_PAGES": "number_of_freelist_pages", + "LARGEST_ROOT_B_TREE_PAGE_NUMBER": "largest_root_b_tree_page_number", + "SCHEMA_COOKIE": "schema_cookie", + "SCHEMA_FORMAT_NUMBER": "schema_format_number", + "DATABASE_TEXT_ENCODING": "database_text_encoding", + "USER_VERSION": "user_version", + "MD5_HEX_DIGEST": "md5_hex_digest"}) + +""" + +The types of output that are supported by this package. + +""" +EXPORT_TYPES = Enum(["TEXT", "CSV", "SQLITE", "XLSX"]) + +""" + +Below we instantiate and compile a regular expression to check xml illegal characters: +ILLEGAL_XML_CHARACTER_PATTERN. 
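+
+These ranges cover the control characters, surrogate code points, and noncharacter code points that are either
+not allowed or are discouraged in XML 1.0 content; the exporters substitute such characters out of values
+before writing output such as XLSX.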
+ +""" + +_illegal_xml_characters = [(0x00, 0x08), (0x0B, 0x0C), (0x0E, 0x1F), (0x7F, 0x84), (0x86, 0x9F), + (0xD800, 0xDFFF), (0xFDD0, 0xFDDF), (0xFFFE, 0xFFFF)] + +if maxunicode >= 0x10000: + _illegal_xml_characters.extend([(0x1FFFE, 0x1FFFF), (0x2FFFE, 0x2FFFF), (0x3FFFE, 0x3FFFF), + (0x4FFFE, 0x4FFFF), (0x5FFFE, 0x5FFFF), (0x6FFFE, 0x6FFFF), + (0x7FFFE, 0x7FFFF), (0x8FFFE, 0x8FFFF), (0x9FFFE, 0x9FFFF), + (0xAFFFE, 0xAFFFF), (0xBFFFE, 0xBFFFF), (0xCFFFE, 0xCFFFF), + (0xDFFFE, 0xDFFFF), (0xEFFFE, 0xEFFFF), (0xFFFFE, 0xFFFFF), + (0x10FFFE, 0x10FFFF)]) + +_illegal_xml_ranges = ["%s-%s" % (unichr(low), unichr(high)) for (low, high) in _illegal_xml_characters] +ILLEGAL_XML_CHARACTER_PATTERN = compile(u'[%s]' % u''.join(_illegal_xml_ranges)) diff --git a/sqlite_dissect/exception.py b/sqlite_dissect/exception.py new file mode 100644 index 0000000..8cfbc39 --- /dev/null +++ b/sqlite_dissect/exception.py @@ -0,0 +1,110 @@ + +""" + +exception.py + +This script holds the custom exceptions used in this library. + +This script holds the following object(s): +SqliteError(Exception) +ParsingError(SqliteError) +HeaderParsingError(ParsingError) +MasterSchemaParsingError(ParsingError) +MasterSchemaRowParsingError(MasterSchemaParsingError) +PageParsingError(ParsingError) +BTreePageParsingError(PageParsingError) +CellParsingError(BTreePageParsingError) +RecordParsingError(CellParsingError) +VersionParsingError(ParsingError) +DatabaseParsingError(VersionParsingError) +WalParsingError(VersionParsingError) +WalFrameParsingError(WalParsingError) +WalCommitRecordParsingError(WalParsingError) +SignatureError(SqliteError) +CarvingError(SqliteError) +CellCarvingError(CarvingError) +InvalidVarIntError(CarvingError) +OutputError(SqliteError) +ExportError(SqliteError) + +""" + + +class SqliteError(Exception): + pass + + +class ParsingError(SqliteError): + pass + + +class HeaderParsingError(ParsingError): + pass + + +class MasterSchemaParsingError(ParsingError): + pass + + +class MasterSchemaRowParsingError(MasterSchemaParsingError): + pass + + +class PageParsingError(ParsingError): + pass + + +class BTreePageParsingError(PageParsingError): + pass + + +class CellParsingError(BTreePageParsingError): + pass + + +class RecordParsingError(CellParsingError): + pass + + +class VersionParsingError(ParsingError): + pass + + +class DatabaseParsingError(VersionParsingError): + pass + + +class WalParsingError(VersionParsingError): + pass + + +class WalFrameParsingError(WalParsingError): + pass + + +class WalCommitRecordParsingError(WalParsingError): + pass + + +class SignatureError(SqliteError): + pass + + +class CarvingError(SqliteError): + pass + + +class CellCarvingError(CarvingError): + pass + + +class InvalidVarIntError(CarvingError): + pass + + +class OutputError(SqliteError): + pass + + +class ExportError(SqliteError): + pass diff --git a/sqlite_dissect/export/README.md b/sqlite_dissect/export/README.md new file mode 100644 index 0000000..5fa84ce --- /dev/null +++ b/sqlite_dissect/export/README.md @@ -0,0 +1,165 @@ + +# sqlite_dissect.export + +This package will have scripts for writing results from the SQLite carving framework to files such +as csv, sqlite, and so on. + +- csv_export.py +- sqlite_export.py +- text_export.py +- xlsx_export.py + +TODO items for the "export" package: + +- [ ] Finish UML class diagrams. +- [ ] Create a interface/super class that is extended from for exporters in order to simplify interaction with them. 
+- [ ] Redo the exporters to allow multiple exports instead of having to re-parse the file each time. +- [ ] Incorporate a base export class that takes in a version history and set of exporters. +- [ ] Normalize the inputs of the exporters so that they address postfix and file names similarly (ex. .csv postfix). +- [ ] Check inconsistencies among exporters on overwriting or renaming files (also enter/exit methodology). +- [ ] Investigate pyexcel as a possible alternative to openpyxl for writing xlsx files and possibly csv files. + +
+ +### csv_export.py + +This script holds the objects used for exporting results of the SQLite carving framework to csv files. + +This script holds the following object(s): +- VersionCsvExporter(object) +- CommitCsvExporter(object) +

+ +TODO: +- [ ] Documentation improvements. +- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators. +- [ ] Better exception handling when working with python and SQLite carving objects. +- [ ] Address superclass/subclass structure. +- [ ] Augment classes to not have to continuously open and close the file (maybe by using the "with" syntax). +- [ ] Work on fixing up column headers and hard coded values in columns. +- [ ] Fix the "column definitions" for names once implemented in b-tree index pages. +- [ ] Use cases if empty tables and no carvable rows which result in no files? +- [ ] Use of "iso_" like in the sqlite_export for internal schema object indexes? +- [ ] Figure out naming conventions (or how to handle) the "Row ID" vs the integer primary key which is NULL. +- [ ] Do not overwrite files but instead move them to a different name as in the SQLite and text exporters? +- [ ] Investigate how other applications handle different database text encodings in reference to output. +- [ ] Investigate decoding and re-encoding affects on carved entries. +- [ ] Handle the "=" use case better than just replacing with a space. +- [ ] Investigate why blob objects show up as isinstance of str objects. + ##### VersionCsvExporter Class + - [ ] Check virtual table rows for any use cases that could cause errors when writing. + - [ ] Address use cases with files, directories, multiple files, etc. + - [ ] Check if file or directory exists, etc. + - [ ] Figure out a better way to handle the carved records. + - [ ] Check the carved records dictionary that all carved records are accounted for. + - [ ] Fix the carved records once the carving package has been fixed. + - [ ] Address the located/carved/status status of the entries. + - [ ] Figure out a better way to calculate absolute offsets in write functions better. + - [ ] Fix the "Unknown" status of freeblocks and unallocated space carved entries. + - [ ] Either note or let the user control overwrite/append mode functionality + - [ ] Handle issues with truncation of carved entries (partial records). + - [ ] Account for truncated carved entries (status?) and remove NULL for values if truncated. + - [ ] _write_b_tree_index_leaf_records: Check how index interior/leaf pages work with records. + ##### CommitCsvExporter Class + - [ ] _write_cells: Address the use of "NULL" vs None in SQLite for cells. + - [ ] write_commit: Remove the master schema entry argument? + - [ ] write_commit: Handle the b-tree table interior page better since it is only for journal files. + +
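+
+As a rough usage sketch (the file and directory names below are hypothetical, and `version` is assumed to be a
+version object already parsed by the rest of the framework):
+
+    from sqlite_dissect.export.csv_export import VersionCsvExporter
+
+    # Writes one CSV file per master schema entry that has a root page into the "output" directory.
+    VersionCsvExporter.write_version("example.sqlite.csv", "output", version)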
+ +### sqlite_export.py + +This script holds the objects used for exporting results of the SQLite carving framework to SQLite files. + +>Note: +>
+> During development this script was written testing and using SQLite version 3.9.2. The pysqlite version +> was 2.6.0. Keep in mind that sqlite3.version gives version information on the pysqlite sqlite interface code, +> whereas sqlite3.sqlite_version gives the actual version of the SQLite driver that is used. + +This script holds the following object(s): +- CommitSqliteExporter(object) +

+ +TODO: +- [ ] Documentation improvements. +- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators. +- [ ] Better exception handling when working with python and SQLite carving objects. +- [ ] Implement a version form similar to the VersionCsvExporter. +- [ ] Work on fixing up column headers and hard coded values in columns. +- [ ] Fix the "column definitions" for names once implemented in b-tree index pages. +- [ ] Use cases if empty tables and no carvable rows which result in no files? +- [ ] Investigate differences in efficiency in respect to inserting one or many cells (rows) at a time. +- [ ] Figure out number of columns instead of pulling out the length of each cell over and over again. +- [ ] Empty tables or those with no "updated commits" do not show up in the file. Should empty tables be created? +- [ ] Create a constant for "iso_" for internal schema object indexes? +- [ ] Figure out naming conventions (or how to handle) the "Row ID" vs the integer primary key which is NULL. +- [ ] Investigate how other applications handle different database text encodings in reference to output. +- [ ] Consolidate documentation information so that it is not repeated. + ##### CommitSqliteExporter Class: + -[ ] _write_cells: Address the use of "NULL" vs None in SQLite for cells. + -[ ] _write_cells: Address the use case above with the advent of tables with added columns. + -[ ] _write_cells: Clean up coding of the for loop for writing cell record column values. + -[ ] _write_cells: Handle the failing "str" encodings instead of just setting in a buffer. + -[ ] write_commit: Remove the master schema entry argument? + -[ ] write_commit: Figure out a way to handle additional columns other than a "sd_" preface. + -[ ] write_commit: Address issues that may be caused by prefacing additional columns with "sd_". + +
+ +### text_export.py + +This script holds the objects used for exporting results of the SQLite carving framework to text files. + +This script holds the following object(s): +- CommitConsoleExporter(object) +- CommitTextExporter(object) +

+ +TODO: +- [ ] Documentation improvements. +- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators. +- [ ] Better exception handling when working with python and SQLite carving objects. +- [ ] Implement a version form similar to the VersionCsvExporter. +- [ ] Work on fixing up column headers and hard coded values in columns. +- [ ] Fix the "column definitions" for names once implemented in b-tree index pages. +- [ ] Use cases if empty tables and no carvable rows which result in no files? +- [ ] Use of "iso_" like in the sqlite_export for internal schema object indexes? +- [ ] Figure out naming conventions (or how to handle) the "Row ID" vs the integer primary key which is NULL. +- [ ] Investigate how other applications handle different database text encodings in reference to output. +- [ ] Empty tables or those with no "updated commits" do not show up in the file. Should empty tables be ignored? + ##### CommitTextExporter Class: + -[ ] _write_cells: Address the use of "NULL" vs None in SQLite for cells. + -[ ] write_header: Remove the master schema entry argument? + +
+ +### xlsx_export.py + +This script holds the objects used for exporting results of the SQLite carving framework to xlsx files. + +This script holds the following object(s): +- CommitXlsxExporter(object) +

+ +TODO: +- [ ] Documentation improvements. +- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators. +- [ ] Better exception handling when working with python and SQLite carving objects. +- [ ] Address superclass/subclass structure (the CommitXlsxExporter shares a lot with the CommitCsvExporter). +- [ ] Implement a version form similar to the VersionCsvExporter. +- [ ] Work on fixing up column headers and hard coded values in columns. +- [ ] Fix the "column definitions" for names once implemented in b-tree index pages. +- [ ] Use cases if empty tables and no carvable rows which result in no files? +- [ ] Use of "iso_" like in the sqlite_export for internal schema object indexes? +- [ ] Figure out naming conventions (or how to handle) the "Row ID" vs the integer primary key which is NULL. +- [ ] Investigate decoding and re-encoding affects on carved entries. +- [ ] Investigate how other applications handle different database text encodings in reference to output. + ##### CommitXlsxExporter Class: + -[ ] Document and address issues with encoding of unicode. + -[ ] Document and address issues with the 31 max length sheet names (ie. the max 10 similar names). + -[ ] write_commit: Remove the master schema entry argument? + -[ ] _write_cells: Address the use of "NULL" vs None in SQLite for cells. + -[ ] _write_cells: Handle the "=" use case better than just replacing with a space. + -[ ] _write_cells: Investigate why blob objects show up as isinstance of str objects. + -[ ] _write_cells: Check the operation is "Carved" when decoding text values with "replace". diff --git a/sqlite_dissect/export/__init__.py b/sqlite_dissect/export/__init__.py new file mode 100644 index 0000000..7bb811a --- /dev/null +++ b/sqlite_dissect/export/__init__.py @@ -0,0 +1,11 @@ + +""" + +__init__.py + +This init script will initialize any needed logic for this package. + +This package will have scripts for writing results from the SQLite carving framework to files such +as csv, sqlite, and so on. + +""" diff --git a/sqlite_dissect/export/csv_export.py b/sqlite_dissect/export/csv_export.py new file mode 100644 index 0000000..e7a0efd --- /dev/null +++ b/sqlite_dissect/export/csv_export.py @@ -0,0 +1,674 @@ +from csv import QUOTE_ALL +from csv import writer +from logging import DEBUG +from logging import getLogger +from os.path import basename +from os.path import normpath +from os.path import sep +from re import sub +from sqlite_dissect.constants import ILLEGAL_XML_CHARACTER_PATTERN +from sqlite_dissect.constants import LOGGER_NAME +from sqlite_dissect.constants import MASTER_SCHEMA_ROW_TYPE +from sqlite_dissect.constants import PAGE_TYPE +from sqlite_dissect.constants import UTF_8 +from sqlite_dissect.exception import ExportError +from sqlite_dissect.file.database.utilities import aggregate_leaf_cells + +""" + +csv_export.py + +This script holds the objects used for exporting results of the SQLite carving framework to csv files. 
+ +This script holds the following object(s): +VersionCsvExporter(object) +CommitCsvExporter(object) + +""" + + +class VersionCsvExporter(object): + + @staticmethod + def write_version(csv_file_name, export_directory, version, master_schema_entry_carved_records=None): + + logger = getLogger(LOGGER_NAME) + + if not master_schema_entry_carved_records: + master_schema_entry_carved_records = {} + + for master_schema_entry in version.master_schema.master_schema_entries: + + """ + + Here we only care about the master schema entries that have a root page number since ones that either + do not have a root page number or have a root page number of 0 do not have correlating b-trees in the + SQLite file and are instead either trigger types, view types, or special cases of table types such as + virtual tables. + + """ + + if master_schema_entry.root_page_number: + + fixed_file_name = basename(normpath(csv_file_name)) + fixed_master_schema_name = sub(" ", "_", master_schema_entry.name) + csv_file_name = export_directory + sep + fixed_file_name + "-" + fixed_master_schema_name + ".csv" + + logger.info("Writing CSV file: {}.".format(csv_file_name)) + + with open(csv_file_name, "wb") as csv_file_handle: + + csv_writer = writer(csv_file_handle, delimiter=',', quotechar="\"", quoting=QUOTE_ALL) + + b_tree_root_page = version.get_b_tree_root_page(master_schema_entry.root_page_number) + + """ + + Retrieve the carved records for this particular master schema entry. + + """ + + carved_cells = [] + if master_schema_entry.name in master_schema_entry_carved_records: + carved_cells = master_schema_entry_carved_records[master_schema_entry.name] + + """ + + Below we have to account for how the pages are stored. + + For the table master schema entry row type: + 1.) If the table is not a "without rowid" table, it will be stored on a table b-tree page with + row ids. + 2.) If the table is a "without rowid" table, it will be stored on an index b-tree page with no + row ids. + + For the index master schema entry row type: + 1.) It will be stored on an index b-tree page with no row ids. + + Different functions are created to write records for both table and index b-tree pages. Keep in + mind that a table master schema row type may be stored on a index b-tree page depending if it is + specified as a "without rowid" table. All index master schema row types are stored on index + b-tree pages. + + """ + + if master_schema_entry.row_type == MASTER_SCHEMA_ROW_TYPE.TABLE: + + if not master_schema_entry.without_row_id: + + VersionCsvExporter._write_b_tree_table_leaf_records(csv_writer, version, + master_schema_entry, + b_tree_root_page, carved_cells) + + else: + + VersionCsvExporter._write_b_tree_index_leaf_records(csv_writer, version, + master_schema_entry, + b_tree_root_page, carved_cells) + + elif master_schema_entry.row_type == MASTER_SCHEMA_ROW_TYPE.INDEX: + + VersionCsvExporter._write_b_tree_index_leaf_records(csv_writer, version, master_schema_entry, + b_tree_root_page, carved_cells) + + else: + + log_message = "Invalid master schema entry row type: {} found for csv export on master " \ + "schema entry name: {} table name: {} sql: {}." 
+ log_message = log_message.format(master_schema_entry.row_type, master_schema_entry.name, + master_schema_entry.table_name, master_schema_entry.sql) + + logger.warn(log_message) + raise ExportError(log_message) + + @staticmethod + def _write_b_tree_index_leaf_records(csv_writer, version, master_schema_entry, b_tree_root_page, carved_cells): + + """ + + This function will write the list of cells sent in to the sheet specified including the metadata regarding + to the file type, page type, and operation. + + Note: The types of the data in the values can prove to be an issue here. We want to write the value out as + a string similarly as the text and csv outputs do for example even though it may contain invalid + characters. When data is sent into the openpyxl library to be written to the xml xlsx, if it is a + string, it is encoded into the default encoding and then checked for xml illegal characters that may + pose an issue when written to the xml. In order to properly check the values and write them accordingly + through the openpyxl library we address the following use cases for the value in order: + 1.) If the value is None, we replace the value with the string "NULL". This might be replaced by + leaving it None but issues can be seen when carving cells where the value is None not because it + was NULL originally in the database, but because it was unable to be parsed out when it may have + actually had a value (when it was truncated). Distinction is needed between these two use cases. + 2.) If the value is a bytearray (most likely originally a blob object) or a string value, we want to + write the value as a string. However, in order to do this for blob objects or strings that may + have a few bad characters in them from carving, we need to do our due diligence and make sure + there are no bad unicode characters and no xml illegal characters that may cause issues with + writing to the xlsx. In order to do this we do the following: + a.) We first convert the value to string if the affinity was not text, otherwise we decode + the value in the database text encoding. When we decode using the database text encoding, + we specify to "replace" characters it does not recognize in order to compensate for carved + rows. + b.) We then test encoding it to UTF-8. + i.) If the value successfully encodes as UTF-8 we set that as the value. + ii.) If the value throws an exception encoding, we have illegal unicode characters in the + string that need to be addressed. In order to escape these, we decode the string + as UTF-8 using the "replace" method to replace any illegal unicode characters + with '\ufffd' and set this back as the value after encoding again. + c.) After we have successfully set the value back to a UTF-8 compliant value, we need to check + the value for xml illegal characters. If any of these xml illegal characters are found, + they are replaced with a space. This behaviour may be different from how values are output + into text or csv since this is being written to xml and additional rules apply for certain + characters. + between the xlsx output and text/csv output in reference to xml illegal characters. + d.) After all the illegal characters are removed, due to the way openpyxl determines data types + of particular cells, if a cell starts with "=", it is determined to be a formula and set as + that in the data type field for that cell. This causes issues when opening the file in excel. 
+ Microsoft Excel recommends prefacing the string with a single quote character, however, + this only seems to be within Excel itself. You can specify the data type of the cell in + openpyxl, but not in the write-only mode that is being used here. In order to work around + this, we check if the first character of a string or bytearray is a "=" character and preface + that string with a space. There may be better ways to handle this such as not using the + write-only mode. + Note: Additionally to the "=" character, the "-" character has similar issues in excel. + However, openpyxl explicitly checks on the "=" character being the first character + and setting that cell to a formula and does not handle the use case of a cell starting + with the "-" character, so this use case is ignored. + 3.) If the value does not fall in one of the above use cases, we leave it as is and write it to the + xlsx without any modifications. + + Note: It was noticed that blob objects are typically detected as isinstance of str here and strings are + bytearray objects. This needs to be investigated why exactly blob objects are coming out as str + objects. + + Note: Comparision should be done on how other applications work with different database text encodings in + reference to their output. + + Note: The decoding of the value in the database text encoding should only specify replace on a carved entry. + + :param csv_writer: + :param version: + :param master_schema_entry: + :param b_tree_root_page: + :param carved_cells: + + :return: + + """ + + logger = getLogger(LOGGER_NAME) + + number_of_cells, cells = aggregate_leaf_cells(b_tree_root_page) + + if logger.isEnabledFor(DEBUG): + master_schema_entry_string = "The {} b-tree page with {} row type and name: {} with sql: {} " \ + "has {} in-tact rows:" + master_schema_entry_string = master_schema_entry_string.format(b_tree_root_page.page_type, + master_schema_entry.row_type, + master_schema_entry.name, + master_schema_entry.sql, number_of_cells) + logger.debug(master_schema_entry_string) + + """ + + Note: The index master schema entries are currently not fully parsed and therefore we do not have column + definitions in order to derive the column names from. 
+ + """ + + column_headers = [] + column_headers.extend(["File Source", "Version", "Page Version", "Cell Source", "Page Number", "Location", + "Carved", "Status", "File Offset"]) + logger.debug("Column Headers: {}".format(" , ".join(column_headers))) + + csv_writer.writerow(column_headers) + + for cell in cells.values(): + + cell_record_column_values = [] + + for record_column in cell.payload.record_columns: + serial_type = record_column.serial_type + text_affinity = True if serial_type >= 13 and serial_type % 2 == 1 else False + value = record_column.value + if value is None: + pass + elif isinstance(value, (bytearray, str)): + value = value.decode(version.database_text_encoding, "replace") if text_affinity else str(value) + try: + value.encode(UTF_8) + except UnicodeDecodeError: + value = value.decode(UTF_8, "replace") + value = ILLEGAL_XML_CHARACTER_PATTERN.sub(" ", value) + if value.startswith("="): + value = ' ' + value + cell_record_column_values.append(value) + + row = [version.file_type, cell.version_number, cell.page_version_number, cell.source, cell.page_number, + cell.location, False, "Complete", cell.file_offset] + row.extend(cell_record_column_values) + csv_writer.writerow(row) + + if logger.isEnabledFor(DEBUG): + for cell in cells.values(): + cell_record_column_values = [str(record_column.value) if record_column.value else "NULL" + for record_column in cell.payload.record_columns] + log_message = "File source: {} version: {} page version: {} cell source: {} page: {} located: {} " \ + "carved: {} status: {} at file offset: {}: " + log_message = log_message.format(version.file_type, cell.version_number, cell.page_version_number, + cell.source, cell.page_number, cell.location, False, + "Complete", cell.file_offset) + log_message += "(" + ", ".join(cell_record_column_values) + ")" + logger.debug(log_message) + + VersionCsvExporter._write_b_tree_table_master_schema_carved_records(csv_writer, version, carved_cells, False) + + @staticmethod + def _write_b_tree_table_leaf_records(csv_writer, version, master_schema_entry, b_tree_root_page, carved_cells): + + """ + + This function will write the list of cells sent in to the sheet specified including the metadata regarding + to the file type, page type, and operation. + + Note: The types of the data in the values can prove to be an issue here. We want to write the value out as + a string similarly as the text and csv outputs do for example even though it may contain invalid + characters. When data is sent into the openpyxl library to be written to the xml xlsx, if it is a + string, it is encoded into the default encoding and then checked for xml illegal characters that may + pose an issue when written to the xml. In order to properly check the values and write them accordingly + through the openpyxl library we address the following use cases for the value in order: + 1.) If the value is None, we replace the value with the string "NULL". This might be replaced by + leaving it None but issues can be seen when carving cells where the value is None not because it + was NULL originally in the database, but because it was unable to be parsed out when it may have + actually had a value (when it was truncated). Distinction is needed between these two use cases. + 2.) If the value is a bytearray (most likely originally a blob object) or a string value, we want to + write the value as a string. 
However, in order to do this for blob objects or strings that may + have a few bad characters in them from carving, we need to do our due diligence and make sure + there are no bad unicode characters and no xml illegal characters that may cause issues with + writing to the xlsx. In order to do this we do the following: + a.) We first convert the value to string if the affinity was not text, otherwise we decode + the value in the database text encoding. When we decode using the database text encoding, + we specify to "replace" characters it does not recognize in order to compensate for carved + rows. + b.) We then test encoding it to UTF-8. + i.) If the value successfully encodes as UTF-8 we set that as the value. + ii.) If the value throws an exception encoding, we have illegal unicode characters in the + string that need to be addressed. In order to escape these, we decode the string + as UTF-8 using the "replace" method to replace any illegal unicode characters + with '\ufffd' and set this back as the value after encoding again. + c.) After we have successfully set the value back to a UTF-8 compliant value, we need to check + the value for xml illegal characters. If any of these xml illegal characters are found, + they are replaced with a space. This behaviour may be different from how values are output + into text or csv since this is being written to xml and additional rules apply for certain + characters. + between the xlsx output and text/csv output in reference to xml illegal characters. + d.) After all the illegal characters are removed, due to the way openpyxl determines data types + of particular cells, if a cell starts with "=", it is determined to be a formula and set as + that in the data type field for that cell. This causes issues when opening the file in excel. + Microsoft Excel recommends prefacing the string with a single quote character, however, + this only seems to be within Excel itself. You can specify the data type of the cell in + openpyxl, but not in the write-only mode that is being used here. In order to work around + this, we check if the first character of a string or bytearray is a "=" character and preface + that string with a space. There may be better ways to handle this such as not using the + write-only mode. + Note: Additionally to the "=" character, the "-" character has similar issues in excel. + However, openpyxl explicitly checks on the "=" character being the first character + and setting that cell to a formula and does not handle the use case of a cell starting + with the "-" character, so this use case is ignored. + 3.) If the value does not fall in one of the above use cases, we leave it as is and write it to the + xlsx without any modifications. + + Note: It was noticed that blob objects are typically detected as isinstance of str here and strings are + bytearray objects. This needs to be investigated why exactly blob objects are coming out as str + objects. + + Note: Comparision should be done on how other applications work with different database text encodings in + reference to their output. + + Note: The decoding of the value in the database text encoding should only specify replace on a carved entry. 
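The clean-up described in the notes above can be condensed into a small standalone helper. The sketch below is illustrative only: it uses Python 3 string/bytes semantics (the module itself was written for Python 2, where `str` and `bytearray` behave differently), and the regular expression is merely a stand-in for the project's `ILLEGAL_XML_CHARACTER_PATTERN` constant.

```python
import re

# Stand-in for the project's ILLEGAL_XML_CHARACTER_PATTERN constant (assumption:
# it matches control characters that are not legal in XML 1.0 documents).
ILLEGAL_XML_CHARS = re.compile(u"[\x00-\x08\x0b\x0c\x0e-\x1f]")


def sanitize_for_xlsx(value, text_affinity, database_text_encoding="utf-8"):
    """Approximate the per-column clean-up described above."""
    if value is None:
        return "NULL"                      # step 1: distinguishable placeholder
    if isinstance(value, (bytes, bytearray, str)):
        if isinstance(value, (bytes, bytearray)):
            # step 2a: decode text-affinity values in the database text encoding,
            # replacing unrecognised bytes (common in carved rows); otherwise fall
            # back to a printable representation of the raw bytes.
            value = value.decode(database_text_encoding, "replace") if text_affinity else str(bytes(value))
        # step 2b: force the string to be UTF-8 clean, replacing bad code points.
        value = value.encode("utf-8", "replace").decode("utf-8", "replace")
        # step 2c: strip characters that are illegal in the xlsx XML parts.
        value = ILLEGAL_XML_CHARS.sub(" ", value)
        # step 2d: keep openpyxl from treating the cell as a formula.
        if value.startswith("="):
            value = " " + value
    return value                            # step 3: anything else passes through


if __name__ == "__main__":
    print(sanitize_for_xlsx(bytearray(b"=SUM(A1)\x07"), text_affinity=True))
```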
+ + :param csv_writer: + :param version: + :param master_schema_entry: + :param b_tree_root_page: + :param carved_cells: + + :return: + + """ + + logger = getLogger(LOGGER_NAME) + + number_of_cells, cells = aggregate_leaf_cells(b_tree_root_page) + + if logger.isEnabledFor(DEBUG): + master_schema_entry_string = "The {} b-tree page with {} row type and name: {} with sql: {} " \ + "has {} in-tact rows:" + master_schema_entry_string = master_schema_entry_string.format(b_tree_root_page.page_type, + master_schema_entry.row_type, + master_schema_entry.name, + master_schema_entry.sql, number_of_cells) + logger.debug(master_schema_entry_string) + + column_headers = [] + column_headers.extend(["File Source", "Version", "Page Version", "Cell Source", "Page Number", "Location", + "Carved", "Status", "File Offset", "Row ID"]) + column_headers.extend([column_definition.column_name + for column_definition in master_schema_entry.column_definitions]) + + logger.debug("Column Headers: {}".format(" , ".join(column_headers))) + + csv_writer.writerow(column_headers) + + sorted_cells = sorted(cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id) + + for cell in sorted_cells: + + cell_record_column_values = [] + + for record_column in cell.payload.record_columns: + serial_type = record_column.serial_type + text_affinity = True if serial_type >= 13 and serial_type % 2 == 1 else False + value = record_column.value + if value is None: + pass + elif isinstance(value, (bytearray, str)): + value = value.decode(version.database_text_encoding, "replace") if text_affinity else str(value) + try: + value = value.encode(UTF_8) + except UnicodeDecodeError: + value = value.decode(UTF_8, "replace").encode(UTF_8) + value = ILLEGAL_XML_CHARACTER_PATTERN.sub(" ", value) + if value.startswith("="): + value = ' ' + value + value = str(value) + cell_record_column_values.append(value) + + row = [version.file_type, cell.version_number, cell.page_version_number, cell.source, cell.page_number, + cell.location, False, "Complete", cell.file_offset, cell.row_id] + row.extend(cell_record_column_values) + csv_writer.writerow(row) + + if logger.isEnabledFor(DEBUG): + for cell in sorted_cells: + cell_record_column_values = [str(record_column.value) if record_column.value else "NULL" + for record_column in cell.payload.record_columns] + log_message = "File source: {} version: {} page version: {} cell source: {} page: {} location: {} " \ + "carved: {} status: {} at file offset: {} for row id: {}: " + log_message = log_message.format(version.file_type, cell.version_number, cell.page_version_number, + cell.source, cell.page_number, cell.location, False, "Complete", + cell.file_offset, cell.row_id) + log_message += "(" + ", ".join(cell_record_column_values) + ")" + logger.debug(log_message) + + VersionCsvExporter._write_b_tree_table_master_schema_carved_records(csv_writer, version, carved_cells, True) + + @staticmethod + def _write_b_tree_table_master_schema_carved_records(csv_writer, version, carved_cells, has_row_ids): + + logger = getLogger(LOGGER_NAME) + + for carved_cell in carved_cells: + + cell_record_column_values = [] + + for record_column in carved_cell.payload.record_columns: + serial_type = record_column.serial_type + text_affinity = True if serial_type >= 13 and serial_type % 2 == 1 else False + value = record_column.value + if value is None: + pass + elif isinstance(value, (bytearray, str)): + value = value.decode(version.database_text_encoding, "replace") if text_affinity else str(value) + try: + value = value.encode(UTF_8) + 
except UnicodeDecodeError: + value = value.decode(UTF_8, "replace").encode(UTF_8) + value = ILLEGAL_XML_CHARACTER_PATTERN.sub(" ", value) + if value.startswith("="): + value = ' ' + value + value = str(value) + cell_record_column_values.append(value) + + row = [version.file_type, carved_cell.version_number, carved_cell.page_version_number, + carved_cell.source, carved_cell.page_number, carved_cell.location, True, "Unknown", + carved_cell.file_offset] + if has_row_ids: + row.append("") + row.extend(cell_record_column_values) + csv_writer.writerow(row) + + if logger.isEnabledFor(DEBUG): + for carved_cell in carved_cells: + cell_record_column_values = [str(record_column.value) if record_column.value else "NULL" + for record_column in carved_cell.payload.record_columns] + log_message = "File source: {} version: {} version number: {} cell source: {} page: {} location: {} " \ + "carved: {} status: {} at file offset: {}" + log_message = log_message.format(version.file_type, carved_cell.version_number, + carved_cell.page_version_number, carved_cell.source, + carved_cell.page_number, carved_cell.location, True, + "Unknown", carved_cell.file_offset) + if has_row_ids: + log_message += " for row id: {}:".format("") + log_message += "(" + ", ".join(cell_record_column_values) + ")" + logger.debug(log_message) + + +class CommitCsvExporter(object): + + def __init__(self, export_directory, file_name_prefix=""): + self._export_directory = export_directory + self._file_name_prefix = file_name_prefix + self._csv_file_names = {} + + def write_commit(self, master_schema_entry, commit): + + """ + + + + Note: This function only writes the commit record if the commit record was updated. + + :param master_schema_entry: + :param commit: + + :return: + + """ + + if not commit.updated: + return + + logger = getLogger(LOGGER_NAME) + + mode = "ab" + csv_file_name = self._csv_file_names[commit.name] if commit.name in self._csv_file_names else None + write_headers = False + + if not csv_file_name: + mode = "wb" + commit_name = sub(" ", "_", commit.name) + csv_file_name = self._export_directory + sep + self._file_name_prefix + "-" + commit_name + ".csv" + self._csv_file_names[commit.name] = csv_file_name + write_headers = True + + with open(csv_file_name, mode) as csv_file_handle: + + csv_writer = writer(csv_file_handle, delimiter=',', quotechar="\"", quoting=QUOTE_ALL) + + """ + + Below we have to account for how the pages are stored. + + For the table master schema entry row type: + 1.) If the table is not a "without rowid" table, it will be stored on a table b-tree page with + row ids. + 2.) If the table is a "without rowid" table, it will be stored on an index b-tree page with no + row ids. + + For the index master schema entry row type: + 1.) It will be stored on an index b-tree page with no row ids. + + The commit object handles this by having a page type to make this distinction easier. Therefore, we only + need to check on the page type here. + + """ + + column_headers = [] + if write_headers: + column_headers.extend(["File Source", "Version", "Page Version", "Cell Source", "Page Number", + "Location", "Operation", "File Offset"]) + + if commit.page_type == PAGE_TYPE.B_TREE_INDEX_LEAF: + + """ + + Note: The index master schema entries are currently not fully parsed and therefore we do not have + column definitions in order to derive the column names from. 
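The bookkeeping near the top of `write_commit` above — one CSV file per commit name, created with headers on first use and appended to afterwards — can be sketched on its own. The class and file names below are hypothetical, and the sketch uses Python 3 text-mode files rather than the Python 2 "wb"/"ab" modes used by the exporter.

```python
import csv
import os


class PerNameCsvWriter(object):
    """Illustrative only: one CSV file per logical name, headers written once."""

    def __init__(self, export_directory, file_name_prefix=""):
        self._export_directory = export_directory
        self._file_name_prefix = file_name_prefix
        self._file_names = {}   # name -> csv file path, mirrors _csv_file_names above

    def write_row(self, name, headers, row):
        path = self._file_names.get(name)
        write_headers = path is None
        if write_headers:
            path = os.path.join(self._export_directory,
                                self._file_name_prefix + "-" + name.replace(" ", "_") + ".csv")
            self._file_names[name] = path
        # "w" on first use, "a" afterwards; newline="" is the csv-module idiom on Python 3.
        with open(path, "w" if write_headers else "a", newline="") as handle:
            writer = csv.writer(handle, delimiter=",", quotechar='"', quoting=csv.QUOTE_ALL)
            if write_headers:
                writer.writerow(headers)
            writer.writerow(row)


if __name__ == "__main__":
    exporter = PerNameCsvWriter(".", "example")
    exporter.write_row("my table", ["file_source", "operation"], ["database", "Added"])
    exporter.write_row("my table", ["file_source", "operation"], ["wal", "Carved"])
```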
+ + """ + + csv_writer.writerow(column_headers) + + CommitCsvExporter._write_cells(csv_writer, commit.file_type, commit.database_text_encoding, + commit.page_type, commit.added_cells.values(), "Added") + CommitCsvExporter._write_cells(csv_writer, commit.file_type, commit.database_text_encoding, + commit.page_type, commit.updated_cells.values(), "Updated") + CommitCsvExporter._write_cells(csv_writer, commit.file_type, commit.database_text_encoding, + commit.page_type, commit.deleted_cells.values(), "Deleted") + CommitCsvExporter._write_cells(csv_writer, commit.file_type, commit.database_text_encoding, + commit.page_type, commit.carved_cells.values(), "Carved") + + elif commit.page_type == PAGE_TYPE.B_TREE_TABLE_LEAF or commit.page_type == PAGE_TYPE.B_TREE_TABLE_INTERIOR: + + if write_headers: + column_headers.append("Row ID") + column_headers.extend([column_definition.column_name + for column_definition in master_schema_entry.column_definitions]) + csv_writer.writerow(column_headers) + + # Sort the added, updated, and deleted cells by the row id + sorted_added_cells = sorted(commit.added_cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id) + CommitCsvExporter._write_cells(csv_writer, commit.file_type, commit.database_text_encoding, + commit.page_type, sorted_added_cells, "Added") + sorted_updated_cells = sorted(commit.updated_cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id) + CommitCsvExporter._write_cells(csv_writer, commit.file_type, commit.database_text_encoding, + commit.page_type, sorted_updated_cells, "Updated") + sorted_deleted_cells = sorted(commit.deleted_cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id) + CommitCsvExporter._write_cells(csv_writer, commit.file_type, commit.database_text_encoding, + commit.page_type, sorted_deleted_cells, "Deleted") + + # We will not sort the carved cells since row ids are not deterministic even if parsed + CommitCsvExporter._write_cells(csv_writer, commit.file_type, commit.database_text_encoding, + commit.page_type, commit.carved_cells.values(), "Carved") + + else: + + log_message = "Invalid commit page type: {} found for csv export on master " \ + "schema entry name: {} while writing to csv file name: {}." + log_message = log_message.format(commit.page_type, commit.name, csv_file_name) + logger.warn(log_message) + raise ExportError(log_message) + + @staticmethod + def _write_cells(csv_writer, file_type, database_text_encoding, page_type, cells, operation): + + """ + + This function will write the list of cells sent in to the sheet specified including the metadata regarding + to the file type, page type, and operation. + + Note: The types of the data in the values can prove to be an issue here. We want to write the value out as + a string similarly as the text and csv outputs do for example even though it may contain invalid + characters. When data is sent into the openpyxl library to be written to the xml xlsx, if it is a + string, it is encoded into the default encoding and then checked for xml illegal characters that may + pose an issue when written to the xml. In order to properly check the values and write them accordingly + through the openpyxl library we address the following use cases for the value in order: + 1.) If the value is a bytearray (most likely originally a blob object) or a string value, we want to + write the value as a string. 
However, in order to do this for blob objects or strings that may + have a few bad characters in them from carving, we need to do our due diligence and make sure + there are no bad unicode characters and no xml illegal characters that may cause issues with + writing to the xlsx. In order to do this we do the following: + a.) We first convert the value to string if the affinity was not text, otherwise we decode + the value in the database text encoding. When we decode using the database text encoding, + we specify to "replace" characters it does not recognize in order to compensate for carved + rows. + b.) We then test encoding it to UTF-8. + i.) If the value successfully encodes as UTF-8 we set that as the value. + ii.) If the value throws an exception encoding, we have illegal unicode characters in the + string that need to be addressed. In order to escape these, we decode the string + as UTF-8 using the "replace" method to replace any illegal unicode characters + with '\ufffd' and set this back as the value after encoding again. + c.) After we have successfully set the value back to a UTF-8 compliant value, we need to check + the value for xml illegal characters. If any of these xml illegal characters are found, + they are replaced with a space. This behaviour may be different from how values are output + into text or csv since this is being written to xml and additional rules apply for certain + characters. + between the xlsx output and text/csv output in reference to xml illegal characters. + d.) After all the illegal characters are removed, due to the way openpyxl determines data types + of particular cells, if a cell starts with "=", it is determined to be a formula and set as + that in the data type field for that cell. This causes issues when opening the file in excel. + Microsoft Excel recommends prefacing the string with a single quote character, however, + this only seems to be within Excel itself. You can specify the data type of the cell in + openpyxl, but not in the write-only mode that is being used here. In order to work around + this, we check if the first character of a string or bytearray is a "=" character and preface + that string with a space. There may be better ways to handle this such as not using the + write-only mode. + Note: Additionally to the "=" character, the "-" character has similar issues in excel. + However, openpyxl explicitly checks on the "=" character being the first character + and setting that cell to a formula and does not handle the use case of a cell starting + with the "-" character, so this use case is ignored. + 2.) If the value does not fall in one of the above use cases, we leave it as is and write it to the + xlsx without any modifications. + + Note: If the value is None, we leave it as None. We used to update the None value with the string "NULL" + since issues could be seen when carving cells where the value is None not because it was NULL originally + in the database, but because it was unable to be parsed out when it may have actually had a value (when + it was truncated). Distinction is needed between these two use cases. + + Note: It was noticed that blob objects are typically detected as isinstance of str here and strings are + bytearray objects. This needs to be investigated why exactly blob objects are coming out as str + objects. + + Note: Comparision should be done on how other applications work with different database text encodings in + reference to their output. 
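Every exporter in this file decides text affinity with the test `serial_type >= 13 and serial_type % 2 == 1`. That expression comes from the SQLite record format, where odd serial types greater than or equal to 13 encode TEXT values of length `(N - 13) / 2` and even serial types greater than or equal to 12 encode BLOBs. The helper below is illustrative (it is not part of the module) and just makes that intent explicit.

```python
def has_text_affinity(serial_type):
    """True when a SQLite record serial type encodes a TEXT value.

    In the SQLite record format, serial types >= 13 that are odd are TEXT
    (length (N - 13) / 2), while serial types >= 12 that are even are BLOBs.
    """
    return serial_type >= 13 and serial_type % 2 == 1


def text_length(serial_type):
    """Byte length of the TEXT payload encoded by an odd serial type >= 13."""
    if not has_text_affinity(serial_type):
        raise ValueError("serial type {} is not a TEXT serial type".format(serial_type))
    return (serial_type - 13) // 2


if __name__ == "__main__":
    assert has_text_affinity(13) and not has_text_affinity(12)
    assert text_length(19) == 3      # e.g. a three byte string such as "abc"
```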
+ + Note: The decoding of the value in the database text encoding should only specify replace on a carved entry. + + :param csv_writer: + :param file_type: + :param database_text_encoding: + :param page_type: + :param cells: + :param operation: + + :return: + + """ + + for cell in cells: + + cell_record_column_values = [] + for record_column in cell.payload.record_columns: + serial_type = record_column.serial_type + text_affinity = True if serial_type >= 13 and serial_type % 2 == 1 else False + value = record_column.value + if value is None: + pass + elif isinstance(value, (bytearray, str)): + value = value.decode(database_text_encoding, "replace") if text_affinity else str(value) + try: + value = value.encode(UTF_8) + except UnicodeDecodeError: + value = value.decode(UTF_8, "replace").encode(UTF_8) + value = ILLEGAL_XML_CHARACTER_PATTERN.sub(" ", value) + if value.startswith("="): + value = ' ' + value + value = str(value) + cell_record_column_values.append(value) + + row = [file_type, cell.version_number, cell.page_version_number, cell.source, cell.page_number, + cell.location, operation, cell.file_offset] + if page_type == PAGE_TYPE.B_TREE_TABLE_LEAF: + row.append(cell.row_id) + row.extend(cell_record_column_values) + csv_writer.writerow(row) diff --git a/sqlite_dissect/export/sqlite_export.py b/sqlite_dissect/export/sqlite_export.py new file mode 100644 index 0000000..3d120f5 --- /dev/null +++ b/sqlite_dissect/export/sqlite_export.py @@ -0,0 +1,412 @@ +from logging import getLogger +from os import rename +from os.path import exists +from os.path import sep +from re import sub +from sqlite3 import connect +from sqlite3 import sqlite_version +from sqlite3 import version +from uuid import uuid4 +from sqlite_dissect.constants import LOGGER_NAME +from sqlite_dissect.constants import PAGE_TYPE +from sqlite_dissect.exception import ExportError + +""" + +sqlite_export.py + +This script holds the objects used for exporting results of the SQLite carving framework to SQLite files. + +Note: During development this script was written testing and using SQLite version 3.9.2. The pysqlite version + was 2.6.0. Keep in mind that sqlite3.version gives version information on the pysqlite SQLite interface code, + whereas sqlite3.sqlite_version gives the actual version of the SQLite driver that is used. + +This script holds the following object(s): +CommitSqliteExporter(object) + +""" + + +class CommitSqliteExporter(object): + + def __init__(self, export_directory, file_name): + + """ + + Constructor. + + The master schema entries created tables dictionary will hold the names of the created tables in the SQLite + file being written to so consecutive writes to those tables will be able to tell if the table was already + created or not. The reason it is a dictionary and not just a list of names is that the value keyed off the + master schema name will be the number of columns in that table. This is needed since different rows within + the same table may have a different number of columns in the case that the table was altered and columns were + added at some point. This way the number of columns can be specified and values that may be missing can be + specified as being left NULL. + + Note: According to documentation, it appears only tables can be altered. However, we include the same logic + with the number of rows for both tables and indexes for consistency and code reduction. 
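As the module note above points out, `sqlite3.version` describes the pysqlite interface code while `sqlite3.sqlite_version` describes the SQLite library actually linked in; the exporter logs both when it opens its output file. The distinction is easy to verify interactively (note that `sqlite3.version` is deprecated on recent Python 3 releases).

```python
import sqlite3

# sqlite3.version reports the pysqlite/DB-API module version, while
# sqlite3.sqlite_version reports the version of the linked SQLite library;
# the exporter logs both when opening its output database.
print("pysqlite interface version:", sqlite3.version)
print("SQLite library version:", sqlite3.sqlite_version)
```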
+ + Note: If the file is detected as already existing, a uuid will be appended to the file name of the old file + and a new file by the name specified will be created. + + :param export_directory: + :param file_name: + + :return: + + """ + + self._sqlite_file_name = export_directory + sep + file_name + self._connection = None + self._master_schema_entries_created_tables = {} + + def __enter__(self): + + # Check if the file exists and if it does rename it + if exists(self._sqlite_file_name): + + # Generate a uuid to append to the file name + new_file_name_for_existing_file = self._sqlite_file_name + "-" + str(uuid4()) + + # Rename the existing file + rename(self._sqlite_file_name, new_file_name_for_existing_file) + + log_message = "File: {} already existing when creating the file for commit sqlite exporting. The " \ + "file was renamed to: {} and new data will be written to the file name specified." + log_message = log_message.format(self._sqlite_file_name, new_file_name_for_existing_file) + getLogger(LOGGER_NAME).debug(log_message) + + self._connection = connect(self._sqlite_file_name) + log_message = "Opened connection to {} using sqlite version: {} and pysqlite version: {}" + log_message = log_message.format(self._sqlite_file_name, sqlite_version, version) + getLogger(LOGGER_NAME).debug(log_message) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self._connection.close() + log_message = "Closed connection to {} using sqlite version: {} and pysqlite version: {}" + log_message = log_message.format(self._sqlite_file_name, sqlite_version, version) + getLogger(LOGGER_NAME).debug(log_message) + + def write_commit(self, master_schema_entry, commit): + + """ + + + + Note: This function only writes the commit record if the commit record was updated. + + Note: Any table or index names beginning with sqlite_ are not allowed since "sqlite_" is reserved for + internal schema object names. In the case that a table or index is an internal schema object, we + will preface that name with an "iso_" representing an (i)nternal (s)chema (o)bject. + + :param master_schema_entry: + :param commit: + + :return: + + """ + + if not commit.updated: + return + + logger = getLogger(LOGGER_NAME) + + # Check if the master schema entry name is a internal schema object and if so preface it with "iso_" + internal_schema_object = master_schema_entry.internal_schema_object \ + if hasattr(master_schema_entry, "internal_schema_object") else False + table_name = "iso_" + master_schema_entry.name if internal_schema_object else master_schema_entry.name + + # Check if we have created the table for this master schema entry name yet + if master_schema_entry.name not in self._master_schema_entries_created_tables: + + column_headers = ["File Source", "Version", "Page Version", "Cell Source", "Page Number", "Location", + "Operation", "File Offset"] + + """ + + Below we have to account for how the pages are stored. + + For the table master schema entry row type: + 1.) If the table is not a "without rowid" table, it will be stored on a table b-tree page with + row ids. + 2.) If the table is a "without rowid" table, it will be stored on an index b-tree page with no + row ids. + + For the index master schema entry row type: + 1.) It will be stored on an index b-tree page with no row ids. + + The commit object handles this by having a page type to make this distinction easier. Therefore, we only + need to check on the page type here. 
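The `__enter__` method above guards against overwriting a previous export by renaming any existing output file with a UUID suffix before creating a fresh one. A self-contained sketch of that safeguard, using only standard-library calls and a hypothetical file name:

```python
import os
import uuid


def open_fresh(path, mode="w"):
    """Open `path` for writing, first renaming any existing file out of the way.

    Mirrors the collision handling used by the exporters: the old file keeps its
    data under "<path>-<uuid4>" and the caller always gets a brand new file.
    """
    if os.path.exists(path):
        preserved = path + "-" + str(uuid.uuid4())
        os.rename(path, preserved)
        print("Existing file preserved as: {}".format(preserved))
    return open(path, mode)


if __name__ == "__main__":
    with open_fresh("example_output.txt") as handle:
        handle.write("fresh export\n")
```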
+ + """ + + if commit.page_type == PAGE_TYPE.B_TREE_INDEX_LEAF: + + """ + + Note: The index master schema entries are currently not fully parsed and therefore we do not have + column definitions in order to derive the column names from. + + Since we need to have column headers defined for each of the fields, here we calculate the + number of additional columns that will be needed to output the fields from the index and expand + the table by that number using generic column names. + + At least one of the added, updated, deleted, or carved cells fields must be set for the commit + to have been considered updated and for us to have gotten here. + + """ + + cells = list() + cells.extend(commit.added_cells.values()) + cells.extend(commit.updated_cells.values()) + cells.extend(commit.deleted_cells.values()) + cells.extend(commit.carved_cells.values()) + + if len(cells) < 1: + log_message = "Found invalid number of cells in commit when specified updated: {} " \ + "found for sqlite export on master schema entry name: {} page type: {} " \ + "while writing to sqlite file name: {}." + log_message = log_message.format(len(cells), commit.name, commit.page_type, self._sqlite_file_name) + logger.warn(log_message) + raise ExportError(log_message) + + number_of_columns = len(cells[0].payload.record_columns) + index_column_headers = [] + for i in range(number_of_columns): + index_column_headers.append("Column {}".format(i)) + + column_headers.extend(index_column_headers) + column_headers = [sub(" ", "_", column_header).lower() for column_header in column_headers] + + elif commit.page_type == PAGE_TYPE.B_TREE_TABLE_LEAF: + + column_definitions = [column_definition.column_name + for column_definition in master_schema_entry.column_definitions] + column_headers.append("Row ID") + + """ + + In order to make sure there are no pre-existing columns with "sd_" prefacing them, we check for that + use case and add another "sd_" to the beginning of the column header name until there are no conflicts. + + """ + + updated_column_headers = [] + for column_header in column_headers: + updated_column_header_name = "sd_" + sub(" ", "_", column_header).lower() + while updated_column_header_name in column_definitions: + updated_column_header_name = "sd_" + updated_column_header_name + updated_column_headers.append(updated_column_header_name) + + updated_column_headers.extend(column_definitions) + column_headers = updated_column_headers + + else: + + log_message = "Invalid commit page type: {} found for sqlite export on master " \ + "schema entry name: {} while writing to sqlite file name: {}." + log_message = log_message.format(commit.page_type, commit.name, self._sqlite_file_name) + logger.warn(log_message) + raise ExportError(log_message) + + create_table_statement = "CREATE TABLE {} ({})" + create_table_statement = create_table_statement.format(table_name, " ,".join(column_headers)) + self._connection.execute(create_table_statement) + self._connection.commit() + + self._master_schema_entries_created_tables[master_schema_entry.name] = len(column_headers) + + """ + + Now write all of the cells to the SQLite file in their table. 
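The header handling above prefixes the exporter's metadata columns with `sd_` and keeps prepending `sd_` until the name no longer collides with a column declared in the table's own schema, then builds the `CREATE TABLE` statement from the combined list. A compact sketch of that renaming with made-up table and column names:

```python
import re


def build_create_table(table_name, metadata_headers, schema_columns):
    """Return a CREATE TABLE statement whose metadata columns cannot collide
    with the columns already defined in the exported table's schema."""
    safe_headers = []
    for header in metadata_headers:
        name = "sd_" + re.sub(" ", "_", header).lower()
        # Keep prefacing with "sd_" until the name is unique, as described above.
        while name in schema_columns:
            name = "sd_" + name
        safe_headers.append(name)
    all_columns = safe_headers + list(schema_columns)
    return "CREATE TABLE {} ({})".format(table_name, ", ".join(all_columns))


if __name__ == "__main__":
    statement = build_create_table(
        "example_table",
        ["File Source", "Version", "Row ID"],
        ["id", "sd_version", "payload"],   # note the pre-existing sd_version column
    )
    print(statement)
    # CREATE TABLE example_table (sd_file_source, sd_sd_version, sd_row_id, id, sd_version, payload)
```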
+ + """ + + column_count = self._master_schema_entries_created_tables[master_schema_entry.name] + + if commit.page_type == PAGE_TYPE.B_TREE_INDEX_LEAF: + + CommitSqliteExporter._write_cells(self._connection, table_name, column_count, commit.file_type, + commit.database_text_encoding, commit.page_type, + commit.added_cells.values(), "Added") + CommitSqliteExporter._write_cells(self._connection, table_name, column_count, commit.file_type, + commit.database_text_encoding, commit.page_type, + commit.updated_cells.values(), "Updated") + CommitSqliteExporter._write_cells(self._connection, table_name, column_count, commit.file_type, + commit.database_text_encoding, commit.page_type, + commit.deleted_cells.values(), "Deleted") + CommitSqliteExporter._write_cells(self._connection, table_name, column_count, commit.file_type, + commit.database_text_encoding, commit.page_type, + commit.carved_cells.values(), "Carved") + + elif commit.page_type == PAGE_TYPE.B_TREE_TABLE_LEAF: + + # Sort the added, updated, and deleted cells by the row id + sorted_added_cells = sorted(commit.added_cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id) + CommitSqliteExporter._write_cells(self._connection, table_name, column_count, commit.file_type, + commit.database_text_encoding, commit.page_type, sorted_added_cells, + "Added") + sorted_updated_cells = sorted(commit.updated_cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id) + CommitSqliteExporter._write_cells(self._connection, table_name, column_count, commit.file_type, + commit.database_text_encoding, commit.page_type, sorted_updated_cells, + "Updated") + sorted_deleted_cells = sorted(commit.deleted_cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id) + CommitSqliteExporter._write_cells(self._connection, table_name, column_count, commit.file_type, + commit.database_text_encoding, commit.page_type, sorted_deleted_cells, + "Deleted") + + # We will not sort the carved cells since row ids are not deterministic even if parsed + CommitSqliteExporter._write_cells(self._connection, table_name, column_count, commit.file_type, + commit.database_text_encoding, commit.page_type, + commit.carved_cells.values(), "Carved") + + else: + + log_message = "Invalid commit page type: {} found for sqlite export on master " \ + "schema entry name: {} while writing to sqlite file name: {}." + log_message = log_message.format(commit.page_type, commit.name, self._sqlite_file_name) + logger.warn(log_message) + raise ExportError(log_message) + + """ + + Commit any entries written to the SQLite file. + + Note: This is done to speed up writing to the SQLite file and was previously in the "_write_cells" function + and called after every set of cells written. Now that it has been brought out here, it will execute + for every commit record. This will reduce calls to commit and also make sure at least one statement + has been executed when calling a commit. In addition the insert statement was changed to insert + many at a time instead of individually. + + """ + + self._connection.commit() + + @staticmethod + def _write_cells(connection, table_name, column_count, file_type, + database_text_encoding, page_type, cells, operation): + + """ + + This function will write the list of cells sent in to the connection under the table name specified including + the metadata regarding to the file type, page type, and operation. + + Note: The types of the data in the values can prove to be an issue here. 
For the most part we want to write + back the value as the type that we read it out of the file as even though the data has the possibility + of still being stored differently since we are leaving all data types to be undefined causing the storage + algorithm internal to SQLite to slightly change. Despite this, we make the following modifications in + order to best ensure data integrity when writing the data back to the SQLite file: + 1.) If the value is a bytearray, the value is interpreted as a blob object. In order to write this + back correctly, we set it to buffer(value) in order to write it back to the SQLite database as + a blob object. Before we write it back, we make sure that the object does not have text affinity, + or if it does we decode it in the database text encoding before writing it. + 2.) If the value is a string, we encode it using UTF-8. If this fails, that means it had characters + not supported by the unicode encoding which caused it to fail. Since we are writing back carved + records that may have invalid characters in strings due to parts being overwritten or false + positives, this can occur a lot. Therefore, if the unicode encoding fails, we do the same + as above for blob objects and create a buffer(value) blob object and write that back to the + database in order to maintain the original data. Therefore, in some tables, depending on the + data parsed or strings retrieved may be stored in either a string (text) or blob storage class. + 3.) If the value does not fall in one of the above use cases, we leave it as is and write it back to the + database without any modifications. + + Note: If the value is None, we leave it as None. We used to update the None value with the string "NULL" + since issues could be seen when carving cells where the value is None not because it was NULL originally + in the database, but because it was unable to be parsed out when it may have actually had a value (when + it was truncated). Distinction is needed between these two use cases. + + Note: Since the amount of columns found may be less than the number of columns actually in the SQL/schema + due to alter table statements over time that may have added columns, we account for the difference + in the number of columns. This is done by taking the difference of the number of columns in the + SQL/schema and subtracting the number of columns for the particular row that is being worked on + and multiply that number by the "None" field in order to pad out the row in the SQLite database + with no data for the remaining columns. 
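The padding rule in the last note can be shown in isolation: rows written before an `ALTER TABLE ... ADD COLUMN` carry fewer values than the table now declares, so the exporter right-pads them with `None` (stored as SQL NULL) before handing the whole batch to `executemany`. The table and data below are hypothetical.

```python
import sqlite3


def pad_row(row, column_count):
    """Right-pad a row with None (SQL NULL) when it predates later ALTER TABLE columns."""
    if len(row) > column_count:
        raise ValueError("row has {} values but the table only has {} columns".format(len(row), column_count))
    return tuple(row) + (None,) * (column_count - len(row))


if __name__ == "__main__":
    connection = sqlite3.connect(":memory:")
    connection.execute("CREATE TABLE example (a, b, c, d)")

    rows = [("old", 1), ("newer", 2, "x"), ("newest", 3, "y", "z")]
    entries = [pad_row(row, 4) for row in rows]

    # One parameter placeholder per column, matching the exporter's insert statement.
    placeholders = ", ".join("?" * 4)
    connection.executemany("INSERT INTO example VALUES ({})".format(placeholders), entries)
    connection.commit()
    print(connection.execute("SELECT * FROM example").fetchall())
```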
+ + :param connection: + :param table_name: + :param column_count: + :param file_type: + :param database_text_encoding: + :param page_type: + :param cells: + :param operation: + + :return: + + """ + + if cells: + + entries = [] + + for cell in cells: + + cell_record_column_values = [] + for record_column in cell.payload.record_columns: + serial_type = record_column.serial_type + text_affinity = True if serial_type >= 13 and serial_type % 2 == 1 else False + value = record_column.value + + if value is None: + pass + elif isinstance(value, bytearray): + if text_affinity: + value = value.decode(database_text_encoding, "replace") + else: + value = buffer(value) + elif isinstance(value, str): + try: + if text_affinity: + value = value.decode(database_text_encoding, "replace") + else: + value = buffer(value) + except UnicodeDecodeError: + + """ + + Note: Here we do not decode or encode the value, since the above failed the value will + contain text that cannot be properly decoded and most likely due to random bytes + in a carving. In this case, we just print the value without trying to account + for the database text encoding which may mean the text may appear differently + (ie. with spaces between each character), but it is better to do it this way + rather then to risk replacing characters since we don't know if it is indeed text. + + """ + + value = buffer(value) + + cell_record_column_values.append(value) + + row = [file_type, cell.version_number, cell.page_version_number, cell.source, cell.page_number, + cell.location, operation, cell.file_offset] + if page_type == PAGE_TYPE.B_TREE_TABLE_LEAF: + row.append(cell.row_id) + row.extend(cell_record_column_values) + + # Check the length of the row against the column count and pad it out with NULLs if necessary + if len(row) < column_count: + row.extend([None] * (column_count - len(row))) + + if len(row) > column_count: + log_message = "The number of columns found in the row: {} were more than the expected: {} " \ + "for sqlite export on master schema entry name: {} with file type: {} " \ + "and page type: {}." + log_message = log_message.format(len(row), column_count, table_name, file_type, page_type) + getLogger(LOGGER_NAME).warn(log_message) + raise ExportError(log_message) + + entries.append(tuple(row)) + + if not entries: + log_message = "Did not find any entries to write when cells were specified for sqlite export on " \ + "master schema entry name: {} with file type: {} and page type: {}." + log_message = log_message.format(table_name, file_type, page_type) + getLogger(LOGGER_NAME).warn(log_message) + raise ExportError(log_message) + + number_of_rows = (len(entries[0]) - 1) + + column_fields = "?" + (", ?" * number_of_rows) + insert_statement = "INSERT INTO {} VALUES ({})".format(table_name, column_fields) + connection.executemany(insert_statement, entries) diff --git a/sqlite_dissect/export/text_export.py b/sqlite_dissect/export/text_export.py new file mode 100644 index 0000000..ba53927 --- /dev/null +++ b/sqlite_dissect/export/text_export.py @@ -0,0 +1,257 @@ +from logging import getLogger +from os import rename +from os.path import exists +from os.path import sep +from uuid import uuid4 +from sqlite_dissect.constants import LOGGER_NAME +from sqlite_dissect.constants import PAGE_TYPE +from sqlite_dissect.exception import ExportError +from sqlite_dissect.output import stringify_cell_record + +""" + +text_export.py + +This script holds the objects used for exporting results of the SQLite carving framework to text files. 
+ +This script holds the following object(s): +CommitConsoleExporter(object) +CommitTextExporter(object) + +""" + + +class CommitConsoleExporter(object): + + @staticmethod + def write_header(master_schema_entry, page_type): + header = "\nMaster schema entry: {} row type: {} on page type: {} with sql: {}." + header = header.format(master_schema_entry.name, master_schema_entry.row_type, + page_type, master_schema_entry.sql) + print(header) + + @staticmethod + def write_commit(commit): + + """ + + + + Note: This function only prints the commit record if the commit record was updated. + + :param commit: + + :return: + + """ + + if not commit.updated: + return + + logger = getLogger(LOGGER_NAME) + + commit_header = "Commit: {} updated in version: {} with root page number: {} on b-tree page numbers: {}." + print(commit_header.format(commit.name, commit.version_number, + commit.root_page_number, commit.b_tree_page_numbers)) + + if commit.page_type == PAGE_TYPE.B_TREE_INDEX_LEAF: + + CommitConsoleExporter._write_cells(commit.file_type, commit.database_text_encoding, commit.page_type, + commit.added_cells.values(), "Added") + CommitConsoleExporter._write_cells(commit.file_type, commit.database_text_encoding, commit.page_type, + commit.updated_cells.values(), "Updated") + CommitConsoleExporter._write_cells(commit.file_type, commit.database_text_encoding, commit.page_type, + commit.deleted_cells.values(), "Deleted") + CommitConsoleExporter._write_cells(commit.file_type, commit.database_text_encoding, commit.page_type, + commit.carved_cells.values(), "Carved") + + elif commit.page_type == PAGE_TYPE.B_TREE_TABLE_LEAF: + + # Sort the added, updated, and deleted cells by the row id + sorted_added_cells = sorted(commit.added_cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id) + CommitConsoleExporter._write_cells(commit.file_type, commit.database_text_encoding, commit.page_type, + sorted_added_cells, "Added") + sorted_updated_cells = sorted(commit.updated_cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id) + CommitConsoleExporter._write_cells(commit.file_type, commit.database_text_encoding, commit.page_type, + sorted_updated_cells, "Updated") + sorted_deleted_cells = sorted(commit.deleted_cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id) + CommitConsoleExporter._write_cells(commit.file_type, commit.database_text_encoding, commit.page_type, + sorted_deleted_cells, "Deleted") + + # We will not sort the carved cells since row ids are not deterministic even if parsed + CommitConsoleExporter._write_cells(commit.file_type, commit.database_text_encoding, commit.page_type, + commit.carved_cells.values(), "Carved") + + else: + + log_message = "Invalid commit page type: {} found for text export on master " \ + "schema entry name: {} while writing to sqlite file name: {}." + log_message = log_message.format(commit.page_type, commit.name) + logger.warn(log_message) + raise ExportError(log_message) + + @staticmethod + def _write_cells(file_type, database_text_encoding, page_type, cells, operation): + + """ + + This function will write the list of cells sent in to the connection under the table name specified including + the metadata regarding to the file type, page type, and operation. + + Note: Since we are writing out to text, all values are written as strings. 
+ + :param file_type: + :param database_text_encoding: + :param page_type: + :param cells: + :param operation: + + :return: + + """ + + base_string = "File Type: {} Version Number: {} Page Version Number: {} Source: {} " \ + "Page Number: {} Location: {} Operation: {} File Offset: {}" + for cell in cells: + preface = base_string.format(file_type, cell.version_number, cell.page_version_number, cell.source, + cell.page_number, cell.location, operation, cell.file_offset) + row_values = stringify_cell_record(cell, database_text_encoding, page_type) + print(preface + " " + row_values + ".") + + +class CommitTextExporter(object): + + def __init__(self, export_directory, file_name): + + """ + + + + Note: If the file is detected as already existing, a uuid will be appended to the file name of the old file + and a new file by the name specified will be created. + + :param export_directory: + :param file_name: + + :return: + + """ + + self._text_file_name = export_directory + sep + file_name + self._file_handle = None + + def __enter__(self): + + # Check if the file exists and if it does rename it + if exists(self._text_file_name): + + # Generate a uuid to append to the file name + new_file_name_for_existing_file = self._text_file_name + "-" + str(uuid4()) + + # Rename the existing file + rename(self._text_file_name, new_file_name_for_existing_file) + + log_message = "File: {} already existing when creating the file for commit text exporting. The " \ + "file was renamed to: {} and new data will be written to the file name specified." + log_message = log_message.format(self._text_file_name, new_file_name_for_existing_file) + getLogger(LOGGER_NAME).debug(log_message) + + self._file_handle = open(self._text_file_name, "w") + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self._file_handle.close() + + def write_header(self, master_schema_entry, page_type): + header = "\nMaster schema entry: {} row type: {} on page type: {} with sql: {}." + header = header.format(master_schema_entry.name, master_schema_entry.row_type, + page_type, master_schema_entry.sql) + self._file_handle.write(header + "\n") + + def write_commit(self, commit): + + """ + + + + Note: This function only writes the commit record if the commit record was updated. 
+ + :param commit: + + :return: + + """ + + if not commit.updated: + return + + logger = getLogger(LOGGER_NAME) + + commit_header = "Commit: {} updated in version: {} with root page number: {} on b-tree page numbers: {}.\n" + self._file_handle.write(commit_header.format(commit.name, commit.version_number, + commit.root_page_number, commit.b_tree_page_numbers)) + + if commit.page_type == PAGE_TYPE.B_TREE_INDEX_LEAF: + + CommitTextExporter._write_cells(self._file_handle, commit.file_type, commit.database_text_encoding, + commit.page_type, commit.added_cells.values(), "Added") + CommitTextExporter._write_cells(self._file_handle, commit.file_type, commit.database_text_encoding, + commit.page_type, commit.updated_cells.values(), "Updated") + CommitTextExporter._write_cells(self._file_handle, commit.file_type, commit.database_text_encoding, + commit.page_type, commit.deleted_cells.values(), "Deleted") + CommitTextExporter._write_cells(self._file_handle, commit.file_type, commit.database_text_encoding, + commit.page_type, commit.carved_cells.values(), "Carved") + + elif commit.page_type == PAGE_TYPE.B_TREE_TABLE_LEAF: + + # Sort the added, updated, and deleted cells by the row id + sorted_added_cells = sorted(commit.added_cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id) + CommitTextExporter._write_cells(self._file_handle, commit.file_type, commit.database_text_encoding, + commit.page_type, sorted_added_cells, "Added") + sorted_updated_cells = sorted(commit.updated_cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id) + CommitTextExporter._write_cells(self._file_handle, commit.file_type, commit.database_text_encoding, + commit.page_type, sorted_updated_cells, "Updated") + sorted_deleted_cells = sorted(commit.deleted_cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id) + CommitTextExporter._write_cells(self._file_handle, commit.file_type, commit.database_text_encoding, + commit.page_type, sorted_deleted_cells, "Deleted") + + # We will not sort the carved cells since row ids are not deterministic even if parsed + CommitTextExporter._write_cells(self._file_handle, commit.file_type, commit.database_text_encoding, + commit.page_type, commit.carved_cells.values(), "Carved") + + else: + + log_message = "Invalid commit page type: {} found for text export on master " \ + "schema entry name: {}." + log_message = log_message.format(commit.page_type, commit.name, self._text_file_name) + logger.warn(log_message) + raise ExportError(log_message) + + @staticmethod + def _write_cells(file_handle, file_type, database_text_encoding, page_type, cells, operation): + + """ + + This function will write the list of cells sent in to the connection under the table name specified including + the metadata regarding to the file type, page type, and operation. + + Note: Since we are writing out to text, all values are written as strings. 
+ + :param file_handle: + :param file_type: + :param database_text_encoding: + :param page_type: + :param cells: + :param operation: + + :return: + + """ + + base_string = "File Type: {} Version Number: {} Page Version Number: {} Source: {} " \ + "Page Number: {} Location: {} Operation: {} File Offset: {}" + for cell in cells: + preface = base_string.format(file_type, cell.version_number, cell.page_version_number, cell.source, + cell.page_number, cell.location, operation, cell.file_offset) + row_values = stringify_cell_record(cell, database_text_encoding, page_type) + file_handle.write(preface + " " + row_values + ".\n") diff --git a/sqlite_dissect/export/xlsx_export.py b/sqlite_dissect/export/xlsx_export.py new file mode 100644 index 0000000..d8c9c8f --- /dev/null +++ b/sqlite_dissect/export/xlsx_export.py @@ -0,0 +1,337 @@ +from logging import getLogger +from openpyxl import Workbook +from os import rename +from os.path import exists +from os.path import sep +from uuid import uuid4 +from sqlite_dissect.constants import ILLEGAL_XML_CHARACTER_PATTERN +from sqlite_dissect.constants import LOGGER_NAME +from sqlite_dissect.constants import PAGE_TYPE +from sqlite_dissect.constants import UTF_8 +from sqlite_dissect.exception import ExportError + +""" + +xlsx_export.py + +This script holds the objects used for exporting results of the SQLite carving framework to xlsx files. + +This script holds the following object(s): +CommitXlsxExporter(object) + +""" + + +class CommitXlsxExporter(object): + + def __init__(self, export_directory, file_name): + self._workbook = Workbook(write_only=True) + self._xlsx_file_name = export_directory + sep + file_name + self._sheets = {} + self._long_sheet_name_translation_dictionary = {} + + def __enter__(self): + + # Check if the file exists and if it does rename it + if exists(self._xlsx_file_name): + + # Generate a uuid to append to the file name + new_file_name_for_existing_file = self._xlsx_file_name + "-" + str(uuid4()) + + # Rename the existing file + rename(self._xlsx_file_name, new_file_name_for_existing_file) + + log_message = "File: {} already existing when creating the file for commit xlsx exporting. The " \ + "file was renamed to: {} and new data will be written to the file name specified." + log_message = log_message.format(self._xlsx_file_name, new_file_name_for_existing_file) + getLogger(LOGGER_NAME).debug(log_message) + + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self._workbook.save(self._xlsx_file_name) + log_message = "Saving file {} after xlsx export." + log_message = log_message.format(self._xlsx_file_name) + getLogger(LOGGER_NAME).debug(log_message) + + def write_commit(self, master_schema_entry, commit): + + """ + + + + Note: This function only writes the commit record if the commit record was updated. + + :param master_schema_entry: + :param commit: + + :return: + + """ + + if not commit.updated: + return + + logger = getLogger(LOGGER_NAME) + + """ + + In xlsx files, there is a limit to the number of characters allowed to be specified in a sheet name. This + limit is 31 characters. The openpyxl library also checks for this use case and if it finds a sheet name longer + than 31 characters, raises an exception. Therefore, we check that here and accommodate for that use case when + it occurs. + + This is done by maintaining a dictionary of commit names longer than 31 characters and a sheet name + based off of the commit name that is within the character limit. 
If a commit name is longer than 31 characters, + all characters past 30 are chopped off and then a integer is added to the end in the range of 0 to 9 depending + on the number of collisions that may occur for multiple similar commit names. + + Note: There needs to be a better way to distinguish between similar commit names and if there are more than 10 + names similar in the first 30 characters, an exception will be raised. Right now a maximum of 10 similar + names are support (0 to 9). + + """ + + # Setup the name postfix increment counter + name_postfix_increment = 0 + + # Set the sheet name to be the commit name + sheet_name = commit.name + + # Check if the sheet name is greater than 31 characters + if len(sheet_name) > 31: + + # Check if the sheet name is already in the dictionary + if sheet_name in self._long_sheet_name_translation_dictionary: + + # Set it to the name already made for it from a previous call + sheet_name = self._long_sheet_name_translation_dictionary[sheet_name] + + # The sheet name was not already in the dictionary so we need to make a new name + else: + + # Continue while we are between 0 and 9 + while name_postfix_increment < 10: + + # Create the truncated sheet name from the first 30 characters of the sheet name and name postfix + truncated_sheet_name = sheet_name[:30] + str(name_postfix_increment) + + # CHeck if the name does not already exist in the dictionary + if truncated_sheet_name not in self._long_sheet_name_translation_dictionary: + + # Add the sheet name and truncated sheet name into the dictionary + self._long_sheet_name_translation_dictionary[sheet_name] = truncated_sheet_name + + # Set the sheet name + sheet_name = truncated_sheet_name + + # Log a debug message for the truncation of the commit name as a sheet name + log_message = "Commit name: {} was truncated to: {} since it had a length of {} characters " \ + "which is greater than the 31 allowed characters for a sheet name." + log_message = log_message.format(commit.name, sheet_name, len(commit.name)) + logger.debug(log_message) + + # Break from the while loop + break + + # The name already exists + else: + + # Increment the name postfix counter + name_postfix_increment += 1 + + # Raise an exception if the name postfix increment counter reached 10 + if name_postfix_increment == 10: + log_message = "Max number of allowed (10) increments reached for renaming the sheet with " \ + "original name: {} for page type: {} due to having a length of {} characters " \ + "which is greater than the 31 allowed characters while writing to xlsx file name: {}." + log_message = log_message.format(commit.name, commit.page_type, len(commit.name), + self._xlsx_file_name) + logger.warn(log_message) + raise ExportError(log_message) + + sheet = self._sheets[sheet_name] if sheet_name in self._sheets else None + write_headers = False + + if not sheet: + sheet = self._workbook.create_sheet(sheet_name) + self._sheets[sheet_name] = sheet + write_headers = True + + """ + + Below we have to account for how the pages are stored. + + For the table master schema entry row type: + 1.) If the table is not a "without rowid" table, it will be stored on a table b-tree page with + row ids. + 2.) If the table is a "without rowid" table, it will be stored on an index b-tree page with no + row ids. + + For the index master schema entry row type: + 1.) It will be stored on an index b-tree page with no row ids. + + The commit object handles this by having a page type to make this distinction easier. 
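The sheet-name handling above works around the 31-character limit that the xlsx format imposes (and that openpyxl enforces) by truncating long commit names to 30 characters and appending a collision counter in the range 0 to 9. A condensed, standalone version of that scheme:

```python
def xlsx_sheet_name(commit_name, existing_names, max_collisions=10):
    """Return a sheet name within the 31-character xlsx limit.

    Names longer than 31 characters are cut to 30 characters and suffixed with
    an increment in the range 0..max_collisions-1, as in the exporter above.
    """
    if len(commit_name) <= 31:
        return commit_name
    if commit_name in existing_names:
        return existing_names[commit_name]
    for increment in range(max_collisions):
        candidate = commit_name[:30] + str(increment)
        if candidate not in existing_names.values():
            existing_names[commit_name] = candidate
            return candidate
    raise ValueError("too many commit names share the first 30 characters of: {}".format(commit_name))


if __name__ == "__main__":
    translation = {}
    long_name = "a_table_with_an_unreasonably_long_name"
    print(xlsx_sheet_name(long_name, translation))            # a_table_with_an_unreasonably_l0
    print(xlsx_sheet_name(long_name + "_v2", translation))    # a_table_with_an_unreasonably_l1
```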
Therefore, we only + need to check on the page type here. + + """ + + column_headers = [] + if write_headers: + column_headers.extend(["File Source", "Version", "Page Version", "Cell Source", "Page Number", + "Location", "Operation", "File Offset"]) + + if commit.page_type == PAGE_TYPE.B_TREE_INDEX_LEAF: + + """ + + Note: The index master schema entries are currently not fully parsed and therefore we do not have + column definitions in order to derive the column names from. + + """ + + sheet.append(column_headers) + + CommitXlsxExporter._write_cells(sheet, commit.file_type, commit.database_text_encoding, commit.page_type, + commit.added_cells.values(), "Added") + CommitXlsxExporter._write_cells(sheet, commit.file_type, commit.database_text_encoding, commit.page_type, + commit.updated_cells.values(), "Updated") + CommitXlsxExporter._write_cells(sheet, commit.file_type, commit.database_text_encoding, commit.page_type, + commit.deleted_cells.values(), "Deleted") + CommitXlsxExporter._write_cells(sheet, commit.file_type, commit.database_text_encoding, commit.page_type, + commit.carved_cells.values(), "Carved") + + elif commit.page_type == PAGE_TYPE.B_TREE_TABLE_LEAF: + + if write_headers: + column_headers.append("Row ID") + column_headers.extend([column_definition.column_name + for column_definition in master_schema_entry.column_definitions]) + sheet.append(column_headers) + + # Sort the added, updated, and deleted cells by the row id + sorted_added_cells = sorted(commit.added_cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id) + CommitXlsxExporter._write_cells(sheet, commit.file_type, commit.database_text_encoding, commit.page_type, + sorted_added_cells, "Added") + sorted_updated_cells = sorted(commit.updated_cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id) + CommitXlsxExporter._write_cells(sheet, commit.file_type, commit.database_text_encoding, commit.page_type, + sorted_updated_cells, "Updated") + sorted_deleted_cells = sorted(commit.deleted_cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id) + CommitXlsxExporter._write_cells(sheet, commit.file_type, commit.database_text_encoding, commit.page_type, + sorted_deleted_cells, "Deleted") + + # We will not sort the carved cells since row ids are not deterministic even if parsed + CommitXlsxExporter._write_cells(sheet, commit.file_type, commit.database_text_encoding, commit.page_type, + commit.carved_cells.values(), "Carved") + + else: + + log_message = "Invalid commit page type: {} found for xlsx export on master " \ + "schema entry name: {} while writing to xlsx file name: {}." + log_message = log_message.format(commit.page_type, commit.name, self._xlsx_file_name) + logger.warn(log_message) + raise ExportError(log_message) + + @staticmethod + def _write_cells(sheet, file_type, database_text_encoding, page_type, cells, operation): + + """ + + This function will write the list of cells sent in to the sheet specified including the metadata regarding + to the file type, page type, and operation. + + Note: The types of the data in the values can prove to be an issue here. We want to write the value out as + a string similarly as the text and csv outputs do for example even though it may contain invalid + characters. When data is sent into the openpyxl library to be written to the xml xlsx, if it is a + string, it is encoded into the default encoding and then checked for xml illegal characters that may + pose an issue when written to the xml. 
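The exporter relies on openpyxl's write-only mode, in which sheets are created on demand and rows are streamed with `append` rather than addressed cell by cell — which is also why per-cell data types cannot be set and the leading-space workaround for `=` values is needed. A minimal sketch of that usage with made-up file and sheet names:

```python
from openpyxl import Workbook

# write_only=True streams rows to disk instead of keeping the sheet in memory,
# which is why the exporter cannot set per-cell data types and has to preface
# "=" values with a space instead.
workbook = Workbook(write_only=True)

sheet = workbook.create_sheet("example_commit")
sheet.append(["File Source", "Version", "Operation", "Value"])   # header row
sheet.append(["database", 0, "Added", " =not_a_formula"])        # note the leading space

workbook.save("example_export.xlsx")
```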
In order to properly check the values and write them accordingly + through the openpyxl library we address the following use cases for the value in order: + 1.) If the value is a bytearray (most likely originally a blob object) or a string value, we want to + write the value as a string. However, in order to do this for blob objects or strings that may + have a few bad characters in them from carving, we need to do our due diligence and make sure + there are no bad unicode characters and no xml illegal characters that may cause issues with + writing to the xlsx. In order to do this we do the following: + a.) We first convert the value to string if the affinity was not text, otherwise we decode + the value in the database text encoding. When we decode using the database text encoding, + we specify to "replace" characters it does not recognize in order to compensate for carved + rows. + b.) We then test encoding it to UTF-8. + i.) If the value successfully encodes as UTF-8 nothing is done further for this step. + ii.) If the value throws an exception encoding, we have illegal unicode characters in the + string that need to be addressed. In order to escape these, we decode the string + as UTF-8 using the "replace" method to replace any illegal unicode characters + with '\ufffd' and set this back as the value. + c.) After we have successfully set the value back to a UTF-8 compliant value, we need to check + the value for xml illegal characters. If any of these xml illegal characters are found, + they are replaced with a space. This behaviour may be different from how values are output + into text or csv since this is being written to xml and additional rules apply for certain + characters. + between the xlsx output and text/csv output in reference to xml illegal characters. + d.) After all the illegal characters are removed, due to the way openpyxl determines data types + of particular cells, if a cell starts with "=", it is determined to be a formula and set as + that in the data type field for that cell. This causes issues when opening the file in excel. + Microsoft Excel recommends prefacing the string with a single quote character, however, + this only seems to be within Excel itself. You can specify the data type of the cell in + openpyxl, but not in the write-only mode that is being used here. In order to work around + this, we check if the first character of a string or bytearray is a "=" character and preface + that string with a space. There may be better ways to handle this such as not using the + write-only mode. + Note: Additionally to the "=" character, the "-" character has similar issues in excel. + However, openpyxl explicitly checks on the "=" character being the first character + and setting that cell to a formula and does not handle the use case of a cell starting + with the "-" character, so this use case is ignored. + 2.) If the value does not fall in one of the above use cases, we leave it as is and write it to the + xlsx without any modifications. + + Note: If the value is None, we leave it as None. We used to update the None value with the string "NULL" + since issues could be seen when carving cells where the value is None not because it was NULL originally + in the database, but because it was unable to be parsed out when it may have actually had a value (when + it was truncated). Distinction is needed between these two use cases. + + Note: It was noticed that blob objects are typically detected as isinstance of str here and strings are + bytearray objects. 
This needs to be investigated why exactly blob objects are coming out as str + objects. + + Note: Comparisons should be done on how other applications work with different database text encodings in + reference to their output. + + Note: The decoding of the value in the database text encoding should only specify replace on a carved entry. + + :param sheet: + :param file_type: + :param database_text_encoding: + :param page_type: + :param cells: + :param operation: + + :return: + + """ + + for cell in cells: + cell_record_column_values = [] + for record_column in cell.payload.record_columns: + serial_type = record_column.serial_type + text_affinity = True if serial_type >= 13 and serial_type % 2 == 1 else False + value = record_column.value + if isinstance(value, (bytearray, str)): + if len(value) == 0 and isinstance(value, bytearray): + value = None + else: + value = value.decode(database_text_encoding, "replace") if text_affinity else str(value) + try: + value.encode(UTF_8) + except UnicodeDecodeError: + value = value.decode(UTF_8, "replace") + value = ILLEGAL_XML_CHARACTER_PATTERN.sub(" ", value) + if value.startswith("="): + value = ' ' + value + cell_record_column_values.append(value) + + row = [file_type, cell.version_number, cell.page_version_number, cell.source, cell.page_number, + cell.location, operation, cell.file_offset] + if page_type == PAGE_TYPE.B_TREE_TABLE_LEAF: + row.append(cell.row_id) + row.extend(cell_record_column_values) + + sheet.append(row) diff --git a/sqlite_dissect/file/README.md b/sqlite_dissect/file/README.md new file mode 100644 index 0000000..d7b81df --- /dev/null +++ b/sqlite_dissect/file/README.md @@ -0,0 +1,139 @@ + +# sqlite_dissect.file + +This package will control parsing and access to all (supported) sqlite files including the +database, rollback journal, and wal. + +- file_handle.py +- header.py +- utilities.py +- version.py +- version_parser.py + +TODO items for the "file" package: + +- [ ] Finish UML class diagrams. + +
+ +### file_handle.py + +This script holds the file handle used to work with file objects for the database, wal, journal, and other +supported file types specified in the FILE_TYPE list. + +This script holds the following object(s): +- FileHandle(object) +<br>

+ +```mermaid +%%{init: { "theme": "dark" }}%% +classDiagram + class FileHandleobject { + -_logger + -_database_text_encoding + +file_type + +file_object + +file_externally_controlled + +file_size + +header + +__init__(self, file_type, file_identifier, database_text_encoding=None, file_size=None) + +__repr__(self) + +__str__(self) + +stringify(self, padding="", print_header=True) + +database_text_encoding(self) + +database_text_encoding(self, database_text_encoding) + +close(self) + +read_data(self, offset, number_of_bytes) + } +``` + +TODO: +- [ ] Documentation improvements. +- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators. + ##### FileHandle Class: + - [ ] Handle the encoding differently (in particular the WAL file)? + - [ ] Investigate a better way of cleaning up the file object other than having to explicitly call close. + - [ ] The lock byte page is not implemented yet and therefore databases >= 1GB will fail to open. + - [ ] Investigate if lock byte pages affect other SQLite file types such as WAL, journal, etc. at all. + - [ ] Handle exceptions that may be raised from creating headers and reading data better. + - [ ] Split the read_data function into separate read and seek functions? + +
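A minimal usage sketch of the FileHandle class, based on the constructor and method signatures in the diagram above; the file path is hypothetical, and the header attribute is assumed to be a DatabaseHeader (defined later in this diff) for a database file:

```python
from sqlite_dissect.constants import FILE_TYPE
from sqlite_dissect.file.file_handle import FileHandle

# Open a database file (the path is only an example).
file_handle = FileHandle(FILE_TYPE.DATABASE, "/path/to/example.sqlite3")

# The file size and parsed header are available on the handle.
print(file_handle.file_size)
print(file_handle.header.stringify())

# Read an arbitrary byte range; here, the 100-byte database header at offset 0.
database_header_bytes = file_handle.read_data(0, 100)

# The handle currently has to be closed explicitly (see the TODO above).
file_handle.close()
```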
+ +### header.py + +This script holds an abstract class for file header objects to extend and inherit from. File headers such as that +of the wal, journal, and database file headers will extend this class. + +>Note: +>
+> The database file header is the same as the file header for the sqlite database. However, for cases like the wal +> file, the file has its own file header that is not related to the actual database information and, depending on how +> many commits included the first page, the file could also contain many database headers. + +This script holds the following object(s): +- SQLiteHeader(object) +<br>

+ +TODO: +- [ ] Documentation improvements. + ##### SQLiteHeader Class: + - [ ] Investigate if there is a correct way to enforce class variables to subclasses. + + +### utilities.py +This script holds utility functions for dealing with the version classes rather than more general utility methods. + +This script holds the following function(s): +- validate_page_version_history(version_history) +<br>

+ +TODO: +- [ ] Documentation improvements. + +
+ +### version.py +This script holds the superclass objects used for parsing the database and write ahead log. + +This script holds the following object(s): +- Version(object) +

+ +TODO: +- [ ] Documentation improvements. +- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators. +- [ ] Investigate if there is a correct way to enforce class variables to subclasses. + ##### Version Class: + - [ ] Better exception handling when creating objects such as pages, etc. + - [ ] Incorporate a get_page function? + - [ ] Improve the (freelist/pointer map/master schema) page lists by making dictionaries? + - [ ] Have a way to parse and store pages in the object itself? + - [ ] get_b_tree_root_page: Check to make sure it is only a root page specified by the master schema or 1. + - [ ] Document how the database_size_in_pages field is generated across different files and versions. + - [ ] Check that rollback journals update the version valid for number and file change counter >= 3.7.0. + - [ ] Have the database/version implement the commit record interface and rename it? Rename version? + - [ ] get_b_tree_root_page: Check if stored in memory for the version and if so return it instead of parsing. + +
+ +### version_parser.py + +This script holds the objects for parsing through the version history for master schema entries. This can be used +for retrieving cells (records), carving, signature generation, etc. + +This script holds the following object(s): +- VersionParser(object) +<br>

+ +TODO: +- [ ] Documentation improvements. +- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators. +- [ ] Handle exceptions that may be raised from creating and working with objects better. + ##### VersionParser Class: + - [ ] Support the same master schema entry being removed and then re-added (Keep in mind row id). + - [ ] How to handle master schema entries not found in specified versions (warning currently raised)? + - [ ] Support for virtual table modules of master schema entry table type (warning currently raised). + - [ ] Support for "without rowid" tables (index b-tree pages) (warning currently raised). + - [ ] Investigate issues with same rows in index b-tree leaf pages that might get removed. + - [ ] Either transition or also put the page_type field in the master schema entry. diff --git a/sqlite_dissect/file/__init__.py b/sqlite_dissect/file/__init__.py new file mode 100644 index 0000000..99baad0 --- /dev/null +++ b/sqlite_dissect/file/__init__.py @@ -0,0 +1,11 @@ + +""" + +__init__.py + +This init script will initialize any needed logic for this package. + +This package will control parsing and access to all (supported) sqlite files including the +database, rollback journal, and wal. + +""" diff --git a/sqlite_dissect/file/database/README.md b/sqlite_dissect/file/database/README.md new file mode 100644 index 0000000..b8976d8 --- /dev/null +++ b/sqlite_dissect/file/database/README.md @@ -0,0 +1,163 @@ + +# sqlite_dissect.file.database + +This package will control parsing and access to the SQLite database files. + +- database.py +- header.py +- page.py +- payload.py +- utilities.py + +TODO items for the "database" package: + +- [ ] Finish UML class diagrams. + +<br>
+ +### database.py +This script holds the objects used for parsing the database file. + +This script holds the following object(s): +- Database(Version) +

+ +TODO: +- [ ] Documentation improvements. +- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators. +- [ ] Investigate where a database file has empty space beyond the page size (wal checkpoints were set). + ##### Database Class: + - [ ] Better exception handling when creating objects such as pages, etc. + - [ ] Check the use case where the database size in pages is 0 in the header and has to be calculated. + - [ ] Handle where the version valid for number != file change counter (warning currently thrown). + - [ ] Test out code with an empty database file with no schema (especially the master schema parsing). + - [ ] More detailed documentation on pages stored in memory. (Trade-offs in speed/memory.) + - [ ] Check lists and dictionaries for fields before adding. + - [ ] The file_size arg may not be needed since it is in the file handle and may be removed. + +<br>
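As a usage illustration (not part of the module), a minimal sketch of constructing the Database class described above and reading a few of the fields populated by its constructor in database.py below; the file path is hypothetical, and the name attribute on the master schema entries is an assumption:

```python
from sqlite_dissect.file.database.database import Database

# Parse the main database file (the path is only an example).
database = Database("/path/to/example.sqlite3", store_in_memory=False, strict_format_checking=True)

# Fields populated during construction.
print(database.database_header.page_size)
print(database.database_size_in_pages)
print(database.freelist_page_numbers)

# Master schema entries parsed from the root page (the name attribute is assumed here).
for master_schema_entry in database.master_schema.master_schema_entries:
    print(master_schema_entry.name)
```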
+ +### header.py +This script holds the header objects used for parsing the header of the database file structure from the root page. + +This script holds the following object(s): +- DatabaseHeader(SQLiteHeader) +

+ +TODO: +- [ ] Documentation improvements. +- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators. +- [ ] Finish try/except exception handling for struct.error and ord in classes. + ##### DatabaseHeader Class: + - [ ] Document for calling classes that the database size in pages will be 0 if the version is < 3.7.0. + - [ ] Investigate why the sqlite version number is 0 in some sqlite files. + - [ ] Figure out a way to determine the number of pages and version number for a suspected empty schema. + ##### BTreePageHeader Class: + - [ ] The contains_sqlite_database_header attribute should apply to table b-trees, not all b-trees. + - [ ] The root_page_only_md5_hex_digest attribute should apply to table b-trees, not all b-trees. + +<br>
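For reference, a short sketch of parsing a header directly with the DatabaseHeader class described above, which expects exactly the first 100 bytes of the database file and raises a HeaderParsingError if validation fails; the file path is hypothetical:

```python
from sqlite_dissect.file.database.header import DatabaseHeader

# The SQLite database header occupies the first 100 bytes of the file.
with open("/path/to/example.sqlite3", "rb") as database_file:
    database_header_byte_array = database_file.read(100)

database_header = DatabaseHeader(database_header_byte_array)
print(database_header.page_size)
print(database_header.database_size_in_pages)
print(database_header.md5_hex_digest)
```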
+ +### page.py +This script holds the Page and Cell related objects for parsing out the different types of SQLite pages in the +SQLite database file. This also includes freeblock and fragment related objects. + +This script holds the following object(s): +- Page(object) +- OverflowPage(Page) +- FreelistTrunkPage(Page) +- FreelistLeafPage(Page) +- PointerMapPage(Page) +- PointerMapEntry(object) +- BTreePage(Page) +- TableInteriorPage(BTreePage) +- TableLeafPage(BTreePage) +- IndexInteriorPage(BTreePage) +- IndexLeafPage(BTreePage) +- BTreeCell(object) +- TableInteriorCell(BTreeCell) +- TableLeafCell(BTreeCell) +- IndexInteriorCell(BTreeCell) +- IndexLeafCell(BTreeCell) +- Freeblock(BTreeCell) +- Fragment(BTreeCell) + +>Note: In some places, like with unallocated data on the page, it was decided to not store this data in memory +> and pull it from the file on demand and/or calculate information from it if needed on demand. This was done +> to prevent the memory used by this program from becoming bloated with unneeded data. + +Assumptions: +1. OverflowPage: All overflow pages are replaced in a chain on modification. This assumes that whenever a cell is + modified, even if the content of the overflow portion does not change, the whole cell including + overflow needs to be replaced due to the way the cells are stored in SQLite. +<br>

+ +TODO: +- [ ] Documentation improvements. +- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators. +- [ ] Finish try/except exception handling for struct.error and ord in classes. +- [ ] Replace version_interface with a more appropriately named variable. +- [ ] Investigate if there is a correct way to enforce class variables to subclasses. +- [ ] Calculation for overflow across the b-tree pages could be pulled out to condense code or for use with carving. +- [ ] Retrieval of cells on demand as well as other fields should be analyzed for better memory handling. +- [ ] Research what the documentation says about how certain things are done with freelists for backwards compatibility. +- [ ] Figure out a better way to read out overflow content on demand in regard to payloads/records. +- [ ] Have an iterator for overflow pages in table leaf and index b-tree pages. + ##### FreelistTrunkPage Class: + - [ ] Make sure a freelist trunk page can be updated without updating following freelist pages. + ##### PointerMapPage Class: + - [ ] See documentation in class regarding unallocated space in pointer maps that may be carvable. + ##### TableInteriorPage Class: + - [ ] Verify that the right-most pointer must always exist. + ##### IndexInteriorPage Class: + - [ ] Verify that the right-most pointer must always exist. + ##### BTreeCell Class: + - [ ] Cells with payloads do not have overflow calculated in their md5 hash. Should this be changed? + - [ ] Rename start_offset to just offset (and in other objects as well)? + ##### TableInteriorCell Class: + - [ ] Verify that the left child pointer must always exist. + ##### IndexInteriorCell Class: + - [ ] Verify that the left child pointer must always exist. + +<br>
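As a quick reference for the b-tree page classes above, the page type is determined by the first byte of the b-tree page header (offset 100 on the root page, where it follows the database header; offset 0 on every other page). A minimal sketch of that mapping, using the raw SQLite file format values rather than this module's hex ID constants:

```python
# First byte of the b-tree page header mapped to the b-tree page type.
B_TREE_PAGE_TYPES = {
    0x02: "index interior",
    0x05: "table interior",
    0x0a: "index leaf",
    0x0d: "table leaf",
}


def get_b_tree_page_type(page_bytes, root_page=False):
    """Return the b-tree page type of a raw page, skipping the 100-byte database header on the root page."""
    header_offset = 100 if root_page else 0
    type_byte = ord(page_bytes[header_offset:header_offset + 1])
    return B_TREE_PAGE_TYPES.get(type_byte, "not a b-tree page")
```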
+ +### payload.py +This script holds the objects used for parsing payloads from the cells in SQLite b-tree pages for +index leaf, index interior, and table leaf. (Table interior pages do not have payloads in their cells.) + +This script holds the following object(s): +- Payload(object) +- Record(Payload) +- RecordColumn(object) +<br>

+ +TODO: +- [ ] Documentation improvements. +- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators. + ##### Record Class: + - [ ] Incorporate absolute offsets. + - [ ] Use \_\_slots\_\_ or some other way to reduce memory since many of these objects will be created. + +
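Since the Record and RecordColumn objects above revolve around SQLite serial types, a short sketch of how a serial type maps to a storage class and content length may be useful; it follows the published SQLite record format (and matches the text check of serial type >= 13 and odd used by the exporters) rather than any API in this module:

```python
def describe_serial_type(serial_type):
    """Return a (storage class, content length in bytes) tuple for a SQLite record serial type."""
    integer_lengths = {1: 1, 2: 2, 3: 3, 4: 4, 5: 6, 6: 8}
    if serial_type == 0:
        return "NULL", 0
    if serial_type in integer_lengths:
        return "integer", integer_lengths[serial_type]
    if serial_type == 7:
        return "float", 8
    if serial_type in (8, 9):
        return "integer constant 0 or 1", 0
    if serial_type >= 12 and serial_type % 2 == 0:
        return "blob", (serial_type - 12) // 2
    if serial_type >= 13 and serial_type % 2 == 1:
        return "text", (serial_type - 13) // 2
    return "reserved", 0


# For example, serial type 25 describes a 6 byte text value.
print(describe_serial_type(25))
```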
+ +### utilities.py +This script holds utility functions for dealing with database-specific objects such as pages rather than more general +utility methods. + +This script holds the following function(s): +- aggregate_leaf_cells(b_tree_page, accounted_for_cell_md5s=None, records_only=False) +- create_pointer_map_pages(version, database_size_in_pages, page_size) +- get_maximum_pointer_map_entries_per_page(page_size) +- get_page_numbers_and_types_from_b_tree_page(b_tree_page) +- get_pages_from_b_tree_page(b_tree_page) +<br>

+ +TODO: +- [ ] Documentation improvements. +- [ ] aggregate_leaf_cells: Investigate ways of making this faster like with intersections of sets. +- [ ] aggregate_leaf_cells: Check if not using accounted for cell md5s if not specified speeds the function up. +- [ ] aggregate_leaf_cells: Investigate how do index b-tree pages work with fields in interior vs leaf b-tree pages? +- [ ] aggregate_leaf_cells: Account for "without rowid" tables (where they are stored on index b-tree pages). +- [ ] create_pointer_map_pages: Handle exceptions that may occur if the page is not a pointer map page. +- [ ] get_all_pages_from_b_tree_page: Check for duplicates in dictionary when adding? +- [ ] get_page_numbers_and_types_from_b_tree_page: Check for duplicates in dictionary when adding? diff --git a/sqlite_dissect/file/database/__init__.py b/sqlite_dissect/file/database/__init__.py new file mode 100644 index 0000000..56764a4 --- /dev/null +++ b/sqlite_dissect/file/database/__init__.py @@ -0,0 +1,10 @@ + +""" + +__init__.py + +This init script will initialize any needed logic for this package. + +This package will control parsing and access to the SQLite database files. + +""" diff --git a/sqlite_dissect/file/database/database.py b/sqlite_dissect/file/database/database.py new file mode 100644 index 0000000..f440394 --- /dev/null +++ b/sqlite_dissect/file/database/database.py @@ -0,0 +1,367 @@ +from copy import copy +from warnings import warn +from sqlite_dissect.constants import BASE_VERSION_NUMBER +from sqlite_dissect.constants import FILE_TYPE +from sqlite_dissect.constants import FIRST_FREELIST_TRUNK_PAGE_INDEX +from sqlite_dissect.constants import FIRST_FREELIST_TRUNK_PARENT_PAGE_NUMBER +from sqlite_dissect.constants import SQLITE_3_7_0_VERSION_NUMBER +from sqlite_dissect.constants import SQLITE_MASTER_SCHEMA_ROOT_PAGE +from sqlite_dissect.exception import DatabaseParsingError +from sqlite_dissect.file.database.page import FreelistTrunkPage +from sqlite_dissect.file.database.utilities import create_pointer_map_pages +from sqlite_dissect.file.file_handle import FileHandle +from sqlite_dissect.file.schema.master import MasterSchema +from sqlite_dissect.file.version import Version + +""" + +database.py + +This script holds the objects used for parsing the database file. + +This script holds the following object(s): +Database(Version) + +""" + + +class Database(Version): + + def __init__(self, file_identifier, store_in_memory=False, file_size=None, strict_format_checking=True): + + """ + + Constructor. This constructor initializes this object. + + :param file_identifier: str The full file path to the file to be opened or the file object. + :param store_in_memory: boolean Tells this class to store it's particular version information in memory or not. + :param file_size: int Optional parameter to supply the file size. + :param strict_format_checking: boolean Specifies if the application should exit if structural validations fail. + + """ + + """ + + Note: We pass the file name and file object to the file handle and let that do any needed error checking + for us. + + """ + + database_file_handle = FileHandle(FILE_TYPE.DATABASE, file_identifier, file_size=file_size) + super(Database, self).__init__(database_file_handle, BASE_VERSION_NUMBER, + store_in_memory, strict_format_checking) + + """ + + Retrieve the database header from the file handle. + + """ + + self._database_header = self.file_handle.header + + """ + + Make sure the database size in pages is not 0. 
If this occurs, the version has to be prior to 3.7.0. If the + size is 0 and the version is < 3.7.0 we set the database size in pages to the calculated number of pages + computed from the file size multiplied by the page size. If the version is >= 3.7.0, we raise an exception. + + If the database size in pages is not 0, there is still a use case that could cause the page size to be + incorrect. This is when the version valid for number does not match the file change counter. Versions before + 3.7.0 did not know to update the page size, but also did not know to update the version valid for number. Only + the change counter was updated. Therefore, in the use case where a file could be made with a version >= 3.7.0 + where the database size in pages is set as well as the version valid for number but then closed down, opened + with a SQLite driver version < 3.7.0 and modified, the version valid for number would not match the change + counter resulting in what could possibly be a bad database size in pages. + + Note: If the file is opened back up in a version >= 3.7.0 after being opened in a previous version, the + database size in pages and version valid for number are set correctly again along with the file change + counter on the first modification to the database. It is important to note this was only tested using + WAL mode and the base database file remained with the incorrect information until the WAL updated it + either at a checkpoint or file closure. Rollback journals are assumed to also update this but have not + been observed as of yet. + + """ + + # The database header size in pages is not set + if self.database_header.database_size_in_pages == 0: + + log_message = "Database header for version: {} specifies a database size in pages of 0 for " \ + "sqlite version: {}." + log_message = log_message.format(self.version_number, self.database_header.sqlite_version_number) + self._logger.info(log_message) + + if self.database_header.sqlite_version_number >= SQLITE_3_7_0_VERSION_NUMBER: + log_message = "The database header database size in pages is 0 when the sqlite version: {} is " \ + "greater or equal than 3.7.0 in version: {} and should be set." + log_message = log_message.format(self.database_header.sqlite_version_number, self.version_number) + self._logger.error(log_message) + raise DatabaseParsingError(log_message) + + # Calculate the number of pages from the file size and page size + self.database_size_in_pages = self.file_handle.file_size / self.page_size + + # The database header size in pages is set and the version valid for number does not equal the change counter + elif self.database_header.version_valid_for_number != self.database_header.file_change_counter: + + """ + + We now know that the database has been modified by a legacy version and the database size may not + be correct. We have to rely on calculating the page size here. + + """ + + # Calculate the number of pages from the file size and page size + self.database_size_in_pages = self.file_handle.file_size / self.page_size + + log_message = "Database header for version: {} specifies a database size in pages of {} but version " \ + "valid for number: {} does not equal the file change counter: {} for sqlite " \ + "version: {}. Setting the database size in pages to the calculated page size of: {}." 
+ log_message = log_message.format(self.version_number, self.database_header.database_size_in_pages, + self.database_header.version_valid_for_number, + self.database_header.file_change_counter, + self.database_header.sqlite_version_number, + self.database_size_in_pages) + self._logger.warn(log_message) + warn(log_message, RuntimeWarning) + + # The database header size in pages is set and the version valid for number does equals the change counter + else: + + """ + + Check to make sure the calculated size in pages matches the database header database size in pages as + it should. + + Note: The calculated number of pages can and has been found to be wrong in some cases where the database + size in pages is specified where the version valid for number equals the file change counter. It is + still unsure of why this can occur but in the use cases this was seen, the database size in pages was + correct and the file was inflated (padded) with empty space at the end indicating additional pages + when calculating page size from file size. For this reason a warning is thrown instead of an + exception (in the case that the version valid for number equals the file change counter and database + size in pages is set). + + The use case has not been seen where the database size in pages is 0 and the database size in pages + has been calculated. More investigation is needed. + + """ + + calculated_size_in_pages = self.file_handle.file_size / self.page_size + + if self.database_header.database_size_in_pages != calculated_size_in_pages: + + # Set the database size in pages to the database header size in pages + self.database_size_in_pages = self.database_header.database_size_in_pages + + log_message = "Database header for version: {} specifies a database size in pages of {} but the " \ + "calculated size in pages is {} instead for sqlite version: {}. The database size in " \ + "pages will remain unchanged but possibly erroneous use cases may occur when parsing." + log_message = log_message.format(self.version_number, self.database_header.database_size_in_pages, + calculated_size_in_pages, self.database_header.sqlite_version_number) + self._logger.warn(log_message) + warn(log_message, RuntimeWarning) + + else: + + self.database_size_in_pages = self.database_header.database_size_in_pages + + """ + + Since the main database file is the first version (version number 0) all pages are considered "updated" + since they are new in terms of the information retrieved from them. + + The page version index will set all page numbers currently in the database pages to the version number of + this first version (version number 0). + + """ + + self.updated_page_numbers = [page_index + 1 for page_index in range(self.database_size_in_pages)] + self.page_version_index = dict(map(lambda x: [x, self.version_number], self.updated_page_numbers)) + + self._logger.debug("Updated page numbers initialized as: {} in version: {}.".format(self.updated_page_numbers, + self.version_number)) + self._logger.debug("Page version index initialized as: {} in version: {}.".format(self.page_version_index, + self.version_number)) + + """ + + Here we setup the updated b-tree page numbers. This array will be removed from as we parse through the file + to leave just the b-tree pages of the commit record that were updated at the end. + + """ + + self.updated_b_tree_page_numbers = copy(self.updated_page_numbers) + + """ + + Create the freelist trunk and leaf pages. 
+ + Note: If there are no freelist pages, the first freelist trunk page will be None and there will be an empty + array for the freelist page numbers. + + """ + + if self.database_header.first_freelist_trunk_page_number: + self.first_freelist_trunk_page = FreelistTrunkPage(self, + self.database_header.first_freelist_trunk_page_number, + FIRST_FREELIST_TRUNK_PARENT_PAGE_NUMBER, + FIRST_FREELIST_TRUNK_PAGE_INDEX) + + self.freelist_page_numbers = [] + observed_freelist_pages = 0 + freelist_trunk_page = self.first_freelist_trunk_page + while freelist_trunk_page: + + # Remove it from the updated b-tree pages + self.updated_b_tree_page_numbers.remove(freelist_trunk_page.number) + + self.freelist_page_numbers.append(freelist_trunk_page.number) + observed_freelist_pages += 1 + for freelist_leaf_page in freelist_trunk_page.freelist_leaf_pages: + self.freelist_page_numbers.append(freelist_leaf_page.number) + observed_freelist_pages += 1 + freelist_trunk_page = freelist_trunk_page.next_freelist_trunk_page + + if observed_freelist_pages != self.database_header.number_of_freelist_pages: + log_message = "The number of observed freelist pages: {} does not match the number of freelist pages " \ + "specified in the header: {} for version: {}." + log_message = log_message.format(observed_freelist_pages, self.database_header.number_of_freelist_pages, + self.version_number) + self._logger.error(log_message) + raise DatabaseParsingError(log_message) + + """ + + Create the pointer map pages. + + Note: If there are no pointer map pages, both the pointer map pages and pointer map page numbers will be an + empty array. + + """ + + if self.database_header.largest_root_b_tree_page_number: + self.pointer_map_pages = create_pointer_map_pages(self, self.database_size_in_pages, self.page_size) + else: + self.pointer_map_pages = [] + + self.pointer_map_page_numbers = [] + for pointer_map_page in self.pointer_map_pages: + + # Remove it from the updated b-tree pages + self.updated_b_tree_page_numbers.remove(pointer_map_page.number) + + self.pointer_map_page_numbers.append(pointer_map_page.number) + + """ + + Create the root page of the SQLite database. + + """ + + self._root_page = self.get_b_tree_root_page(SQLITE_MASTER_SCHEMA_ROOT_PAGE) + + """ + + Create the master schema from the root page of the SQLite database. + + Note: There is the possibility that there is no information in the master schema (ie. a "blank" root page). + To check this we make sure the schema format number and database text encoding are 0 in the header. + A warning is already printed in the database header if this use case is determined. + + In this case the master schema will double check that the root page is indeed devoid of information + and will have no schema entries but maintain its fields such as the master schema page numbers which + will be a list of just the root page such as: [1]. + + """ + + self._master_schema = MasterSchema(self, self.root_page) + + # Remove the master schema pages from the updated b-tree pages (this will always include the root page number) + for master_schema_page_number in self.master_schema.master_schema_page_numbers: + self.updated_b_tree_page_numbers.remove(master_schema_page_number) + + """ + + Since we do not check the schema format number and database text encoding in the master schema, we do that here. + This is due to the fact that the database header is not sent into the master schema (although if needed it could + retrieve it through the instance of this class sent in). 
+ + """ + + if len(self.master_schema.master_schema_entries) == 0: + if self.database_header.schema_format_number != 0 or self.database_header.database_text_encoding != 0: + log_message = "No master schema entries found in master schema for version: {} when the database " \ + "schema format number was: {} and the database text encoding was: {} when both should " \ + "be 0." + log_message = log_message.format(self.version_number, self.database_header.schema_format_number, + self.database_header.database_text_encoding) + self._logger.error(log_message) + raise DatabaseParsingError(log_message) + + """ + + Setup the flags to report on modifications. + + See the version superclass for more documentation on the setup of these flags for the Database class. + + """ + + self.database_header_modified = True + self.root_b_tree_page_modified = True + self.master_schema_modified = True + + if self.first_freelist_trunk_page: + self.freelist_pages_modified = True + + if self.database_header.largest_root_b_tree_page_number: + self.pointer_map_pages_modified = True + + """ + + If the version information is being stored in memory, parse out the pages and store them as a private variable. + + """ + + self._pages = {} + if self.store_in_memory: + self._pages = self.pages + + @Version.database_text_encoding.setter + def database_text_encoding(self, database_text_encoding): + log_message = "Database text encoding {} requested to be set on database. Operation not permitted. " \ + "Should be set during object construction." + log_message = log_message.format(database_text_encoding) + self._logger.error(log_message) + raise TypeError(log_message) + + def get_page_data(self, page_number, offset=0, number_of_bytes=None): + + # Set the number of bytes to the rest of the page if it was not set + number_of_bytes = self.page_size - offset if not number_of_bytes else number_of_bytes + + if offset >= self.page_size: + log_message = "Requested offset: {} is >= the page size: {} for page: {}." + log_message = log_message.format(offset, self.page_size, page_number) + self._logger.error(log_message) + raise ValueError(log_message) + + if offset + number_of_bytes > self.page_size: + log_message = "Requested length of data: {} at offset {} to {} is greater than the page " \ + "size: {} for page: {}." + log_message = log_message.format(number_of_bytes, offset, number_of_bytes + offset, + self.page_size, page_number) + self._logger.error(log_message) + raise ValueError(log_message) + + page_offset = self.get_page_offset(page_number) + + return self.file_handle.read_data(page_offset + offset, number_of_bytes) + + def get_page_offset(self, page_number): + + if page_number < 1 or page_number > self.database_size_in_pages: + log_message = "Invalid page number: {} for version: {} with database size in pages: {}." 
+ log_message = log_message.format(page_number, self.version_number, self.database_size_in_pages) + self._logger.error(log_message) + raise ValueError(log_message) + + return (page_number - 1) * self.page_size diff --git a/sqlite_dissect/file/database/header.py b/sqlite_dissect/file/database/header.py new file mode 100644 index 0000000..33bd680 --- /dev/null +++ b/sqlite_dissect/file/database/header.py @@ -0,0 +1,404 @@ +from abc import ABCMeta +from binascii import hexlify +from logging import getLogger +from re import compile +from re import sub +from struct import error +from struct import unpack +from warnings import warn +from sqlite_dissect.constants import DATABASE_TEXT_ENCODINGS +from sqlite_dissect.constants import INTERIOR_PAGE_HEADER_LENGTH +from sqlite_dissect.constants import LEAF_PAGE_HEADER_LENGTH +from sqlite_dissect.constants import LEAF_PAYLOAD_FRACTION +from sqlite_dissect.constants import LOGGER_NAME +from sqlite_dissect.constants import MAGIC_HEADER_STRING +from sqlite_dissect.constants import MAGIC_HEADER_STRING_ENCODING +from sqlite_dissect.constants import MASTER_PAGE_HEX_ID +from sqlite_dissect.constants import MAXIMUM_EMBEDDED_PAYLOAD_FRACTION +from sqlite_dissect.constants import MAXIMUM_PAGE_SIZE +from sqlite_dissect.constants import MAXIMUM_PAGE_SIZE_INDICATOR +from sqlite_dissect.constants import MAXIMUM_PAGE_SIZE_LIMIT +from sqlite_dissect.constants import MINIMUM_EMBEDDED_PAYLOAD_FRACTION +from sqlite_dissect.constants import MINIMUM_PAGE_SIZE_LIMIT +from sqlite_dissect.constants import RESERVED_FOR_EXPANSION_REGEX +from sqlite_dissect.constants import RIGHT_MOST_POINTER_LENGTH +from sqlite_dissect.constants import RIGHT_MOST_POINTER_OFFSET +from sqlite_dissect.constants import ROLLBACK_JOURNALING_MODE +from sqlite_dissect.constants import SQLITE_DATABASE_HEADER_LENGTH +from sqlite_dissect.constants import VALID_SCHEMA_FORMATS +from sqlite_dissect.constants import WAL_JOURNALING_MODE +from sqlite_dissect.exception import HeaderParsingError +from sqlite_dissect.file.header import SQLiteHeader +from sqlite_dissect.utilities import get_md5_hash + +""" + +header.py + +This script holds the header objects used for parsing the header of the database file structure from the root page. + +This script holds the following object(s): +DatabaseHeader(SQLiteHeader) + +""" + + +class DatabaseHeader(SQLiteHeader): + + def __init__(self, database_header_byte_array): + + super(DatabaseHeader, self).__init__() + + logger = getLogger(LOGGER_NAME) + + if len(database_header_byte_array) != SQLITE_DATABASE_HEADER_LENGTH: + log_message = "The database header byte array of size: {} is not the expected size of: {}." + log_message = log_message.format(len(database_header_byte_array), SQLITE_DATABASE_HEADER_LENGTH) + logger.error(log_message) + raise ValueError(log_message) + + try: + + self.magic_header_string = database_header_byte_array[0:16] + + except error: + + logger.error("Failed to retrieve the magic header.") + raise + + if self.magic_header_string != MAGIC_HEADER_STRING.decode(MAGIC_HEADER_STRING_ENCODING): + log_message = "The magic header string is invalid." 
+ logger.error(log_message) + raise HeaderParsingError(log_message) + + try: + + self.page_size = unpack(b">H", database_header_byte_array[16:18])[0] + + except error: + + logger.error("Failed to retrieve the page size.") + raise + + if self.page_size == MAXIMUM_PAGE_SIZE_INDICATOR: + self.page_size = MAXIMUM_PAGE_SIZE + elif self.page_size < MINIMUM_PAGE_SIZE_LIMIT: + log_message = "The page size: {} is less than the minimum page size limit: {}." + log_message = log_message.format(self.page_size, MINIMUM_PAGE_SIZE_LIMIT) + logger.error(log_message) + raise HeaderParsingError(log_message) + elif self.page_size > MAXIMUM_PAGE_SIZE_LIMIT: + log_message = "The page size: {} is greater than the maximum page size limit: {}." + log_message = log_message.format(self.page_size, MAXIMUM_PAGE_SIZE_LIMIT) + logger.error(log_message) + raise HeaderParsingError(log_message) + + try: + + self.file_format_write_version = ord(database_header_byte_array[18:19]) + + except TypeError: + + logger.error("Failed to retrieve the file format write version.") + raise + + if self.file_format_write_version not in [ROLLBACK_JOURNALING_MODE, WAL_JOURNALING_MODE]: + log_message = "The file format write version: {} is invalid.".format(self.file_format_write_version) + logger.error(log_message) + raise HeaderParsingError(log_message) + + try: + + self.file_format_read_version = ord(database_header_byte_array[19:20]) + + except TypeError: + + logger.error("Failed to retrieve the file format read version.") + raise + + if self.file_format_read_version not in [ROLLBACK_JOURNALING_MODE, WAL_JOURNALING_MODE]: + log_message = "The file format read version: {} is invalid.".format(self.file_format_read_version) + logger.error(log_message) + raise HeaderParsingError(log_message) + + try: + + self.reserved_bytes_per_page = ord(database_header_byte_array[20:21]) + + except TypeError: + + logger.error("Failed to retrieve the reserved bytes per page.") + raise + + if self.reserved_bytes_per_page != 0: + log_message = "Reserved bytes per page is not 0 but {} and is not implemented." + log_message = log_message.format(self.reserved_bytes_per_page) + logger.error(log_message) + raise NotImplementedError(log_message) + + try: + + self.maximum_embedded_payload_fraction = ord(database_header_byte_array[21:22]) + + except TypeError: + + logger.error("Failed to retrieve the maximum embedded payload fraction.") + raise + + if self.maximum_embedded_payload_fraction != MAXIMUM_EMBEDDED_PAYLOAD_FRACTION: + log_message = "Maximum embedded payload fraction: {} is not expected the expected value of: {}." + log_message = log_message.format(self.maximum_embedded_payload_fraction, MAXIMUM_EMBEDDED_PAYLOAD_FRACTION) + logger.error(log_message) + raise HeaderParsingError(log_message) + + try: + + self.minimum_embedded_payload_fraction = ord(database_header_byte_array[22:23]) + + except TypeError: + + logger.error("Failed to retrieve the minimum embedded payload fraction.") + raise + + if self.minimum_embedded_payload_fraction != MINIMUM_EMBEDDED_PAYLOAD_FRACTION: + log_message = "Minimum embedded payload fraction: {} is not expected the expected value of: {}." 
+ log_message = log_message.format(self.minimum_embedded_payload_fraction, MINIMUM_EMBEDDED_PAYLOAD_FRACTION) + logger.error(log_message) + raise HeaderParsingError(log_message) + + try: + + self.leaf_payload_fraction = ord(database_header_byte_array[23:24]) + + except TypeError: + + logger.error("Failed to retrieve the leaf payload fraction.") + raise + + if self.leaf_payload_fraction != LEAF_PAYLOAD_FRACTION: + log_message = "Leaf payload fraction: {} is not expected the expected value of: {}." + log_message = log_message.format(self.leaf_payload_fraction, LEAF_PAYLOAD_FRACTION) + logger.error(log_message) + raise HeaderParsingError(log_message) + + self.file_change_counter = unpack(b">I", database_header_byte_array[24:28])[0] + self.database_size_in_pages = unpack(b">I", database_header_byte_array[28:32])[0] + self.first_freelist_trunk_page_number = unpack(b">I", database_header_byte_array[32:36])[0] + self.number_of_freelist_pages = unpack(b">I", database_header_byte_array[36:40])[0] + self.schema_cookie = unpack(b">I", database_header_byte_array[40:44])[0] + self.schema_format_number = unpack(b">I", database_header_byte_array[44:48])[0] + self.default_page_cache_size = unpack(b">I", database_header_byte_array[48:52])[0] + self.largest_root_b_tree_page_number = unpack(b">I", database_header_byte_array[52:56])[0] + self.database_text_encoding = unpack(b">I", database_header_byte_array[56:60])[0] + + if self.schema_format_number == 0 and self.database_text_encoding == 0: + + """ + + Note: If the schema format number and database text encoding are both 0 then no schema or data has been + placed into this database file. If a schema or any data was inputted and then all tables dropped, + the schema format number and database text encoding would then be set. In this case the database + should only be 1 page. However, we have no way to determine what the size of the database page is + unless the version is at least 3.7.0. We could check on the SQLite version and make sure the + version is at least 3.7.0 and then check the database size in pages to make sure it was 1 but we + would have no way to handle the case if the version was not at least 3.7.0. Also, it has been + noticed that the SQLite version number is 0 in some database files. Until this is further + thought out and possible solutions are determined, we will not worry about checking that + the database has 1 page. + + """ + + log_message = "Schema format number and database text encoding are 0 indicating no schema or data." + logger.warn(log_message) + warn(log_message, RuntimeWarning) + + else: + + if self.schema_format_number not in VALID_SCHEMA_FORMATS: + log_message = "Schema format number: {} not a valid schema format.".format(self.schema_format_number) + logger.error(log_message) + raise HeaderParsingError(log_message) + + if self.database_text_encoding not in DATABASE_TEXT_ENCODINGS: + log_message = "Database text encoding: {} not a valid encoding.".format(self.database_text_encoding) + logger.error(log_message) + raise HeaderParsingError(log_message) + + self.user_version = unpack(b">I", database_header_byte_array[60:64])[0] + self.incremental_vacuum_mode = unpack(b">I", database_header_byte_array[64:68])[0] + + """ + + Originally a check was done that if the largest root b-tree page number existed and the database was less + than or equal to 2 pages in size, an exception was thrown. This was found to be wrong in the case of where + a database file was generated initially with one page with no information in it yet. 
In this case (where + auto-vacuuming was turned on resulting in a non-zero largest root b-tree page number) the largest root + b tree page number was found to be 1. Therefore no exception is thrown if the database size in pages is 1 + as well as the largest root b-tree page number. However, this resulted in the check of the largest root + b-tree page number == 2 as well as the database size in pages == 2. This was decided an irrelevant use case + and removed. + + Now the only thing that is checked is that if the incremental vacuum mode is set than the database header + largest root b-tree page number must be set. (The inverse of this is not true.) + + Note: In regards to the above, the checking of the page size was done by the database size in pages calculated + from the actual parsing of the SQLite file and did not originally reside in this class. After that + specific use case was removed, there was no reason not to move this to the database header class. + + """ + + if not self.largest_root_b_tree_page_number and self.incremental_vacuum_mode: + log_message = "The database header largest root b-tree page number was not set when the incremental " \ + "vacuum mode was: {}." + log_message = log_message.format(self.incremental_vacuum_mode) + logger.error(log_message) + raise HeaderParsingError(log_message) + + self.application_id = unpack(b">I", database_header_byte_array[68:72])[0] + self.reserved_for_expansion = database_header_byte_array[72:92] + + pattern = compile(RESERVED_FOR_EXPANSION_REGEX) + reserved_for_expansion_hex = hexlify(self.reserved_for_expansion) + if not pattern.match(reserved_for_expansion_hex): + log_message = "Header space reserved for expansion is not zero: {}.".format(reserved_for_expansion_hex) + logger.error(log_message) + raise HeaderParsingError(log_message) + + self.version_valid_for_number = unpack(b">I", database_header_byte_array[92:96])[0] + self.sqlite_version_number = unpack(b">I", database_header_byte_array[96:100])[0] + + self.md5_hex_digest = get_md5_hash(database_header_byte_array) + + def stringify(self, padding=""): + string = padding + "Magic Header String: {}\n" \ + + padding + "Page Size: {}\n" \ + + padding + "File Format Write Version: {}\n" \ + + padding + "File Format Read Version: {}\n" \ + + padding + "Reserved Bytes per Page: {}\n" \ + + padding + "Maximum Embedded Payload Fraction: {}\n" \ + + padding + "Minimum Embedded Payload Fraction: {}\n" \ + + padding + "Leaf Payload Fraction: {}\n" \ + + padding + "File Change Counter: {}\n" \ + + padding + "Database Size in Pages: {}\n" \ + + padding + "First Freelist Trunk Page Number: {}\n" \ + + padding + "Number of Freelist Pages: {}\n" \ + + padding + "Schema Cookie: {}\n" \ + + padding + "Schema Format Number: {}\n" \ + + padding + "Default Page Cache Size: {}\n" \ + + padding + "Largest Root B-Tree Page Number: {}\n" \ + + padding + "Database Text Encoding: {}\n" \ + + padding + "User Version: {}\n" \ + + padding + "Incremental Vacuum Mode: {}\n" \ + + padding + "Application ID: {}\n" \ + + padding + "Reserved for Expansion (Hex): {}\n" \ + + padding + "Version Valid for Number: {}\n" \ + + padding + "SQLite Version Number: {}\n" \ + + padding + "MD5 Hex Digest: {}" + return string.format(self.magic_header_string, + self.page_size, + self.file_format_write_version, + self.file_format_read_version, + self.reserved_bytes_per_page, + self.maximum_embedded_payload_fraction, + self.minimum_embedded_payload_fraction, + self.leaf_payload_fraction, + self.file_change_counter, + 
self.database_size_in_pages, + self.first_freelist_trunk_page_number, + self.number_of_freelist_pages, + self.schema_cookie, + self.schema_format_number, + self.default_page_cache_size, + self.largest_root_b_tree_page_number, + self.database_text_encoding, + self.user_version, + self.incremental_vacuum_mode, + self.application_id, + hexlify(self.reserved_for_expansion), + self.version_valid_for_number, + self.sqlite_version_number, + self.md5_hex_digest) + + +class BTreePageHeader(object): + + __metaclass__ = ABCMeta + + def __init__(self, page, header_length): + + self.offset = 0 + self.header_length = header_length + + self.contains_sqlite_database_header = False + + """ + + The root_page_only_md5_hex_digest is only set when the SQLite database header is detected in the page. + + """ + + self.root_page_only_md5_hex_digest = None + + first_page_byte = page[0:1] + if first_page_byte == MASTER_PAGE_HEX_ID: + self.contains_sqlite_database_header = True + self.root_page_only_md5_hex_digest = get_md5_hash(page[SQLITE_DATABASE_HEADER_LENGTH:]) + self.offset += SQLITE_DATABASE_HEADER_LENGTH + + self.page_type = page[self.offset:self.offset + 1] + self.first_freeblock_offset = unpack(b">H", page[self.offset + 1:self.offset + 3])[0] + self.number_of_cells_on_page = unpack(b">H", page[self.offset + 3:self.offset + 5])[0] + self.cell_content_offset = unpack(b">H", page[self.offset + 5:self.offset + 7])[0] + self.number_of_fragmented_free_bytes = ord(page[self.offset + 7:self.offset + 8]) + + self.md5_hex_digest = get_md5_hash(page[self.offset:self.header_length]) + + def __repr__(self): + return self.__str__().encode("hex") + + def __str__(self): + return sub("\t", "", sub("\n", " ", self.stringify())) + + def stringify(self, padding=""): + string = padding + "Contains SQLite Database Header: {}\n" \ + + padding + "Root Page Only MD5 Hex Digest: {}\n" \ + + padding + "Page Type (Hex): {}\n" \ + + padding + "Offset: {}\n" \ + + padding + "Length: {}\n" \ + + padding + "First Freeblock Offset: {}\n" \ + + padding + "Number of Cells on Page: {}\n" \ + + padding + "Cell Content Offset: {}\n" \ + + padding + "Number of Fragmented Free Bytes: {}\n" \ + + padding + "MD5 Hex Digest: {}" + return string.format(self.contains_sqlite_database_header, + self.root_page_only_md5_hex_digest, + hexlify(self.page_type), + self.offset, + self.header_length, + self.first_freeblock_offset, + self.number_of_cells_on_page, + self.cell_content_offset, + self.number_of_fragmented_free_bytes, + self.md5_hex_digest) + + +class LeafPageHeader(BTreePageHeader): + + def __init__(self, page): + super(LeafPageHeader, self).__init__(page, LEAF_PAGE_HEADER_LENGTH) + + +class InteriorPageHeader(BTreePageHeader): + + def __init__(self, page): + super(InteriorPageHeader, self).__init__(page, INTERIOR_PAGE_HEADER_LENGTH) + + right_most_pointer_start_offset = self.offset + RIGHT_MOST_POINTER_OFFSET + right_most_pointer_end_offset = right_most_pointer_start_offset + RIGHT_MOST_POINTER_LENGTH + self.right_most_pointer = unpack(b">I", page[right_most_pointer_start_offset:right_most_pointer_end_offset])[0] + + def stringify(self, padding=""): + string = "\n" \ + + padding + "Right Most Pointer: {}" + string = string.format(self.right_most_pointer) + return super(InteriorPageHeader, self).stringify(padding) + string diff --git a/sqlite_dissect/file/database/page.py b/sqlite_dissect/file/database/page.py new file mode 100644 index 0000000..9cafa35 --- /dev/null +++ b/sqlite_dissect/file/database/page.py @@ -0,0 +1,1776 @@ +from abc import 
ABCMeta +from binascii import hexlify +from logging import getLogger +from re import sub +from struct import unpack +from warnings import warn +from sqlite_dissect.constants import CELL_LOCATION +from sqlite_dissect.constants import CELL_MODULE +from sqlite_dissect.constants import CELL_POINTER_BYTE_LENGTH +from sqlite_dissect.constants import CELL_SOURCE +from sqlite_dissect.constants import FIRST_OVERFLOW_PAGE_INDEX +from sqlite_dissect.constants import FIRST_OVERFLOW_PAGE_NUMBER_LENGTH +from sqlite_dissect.constants import FIRST_OVERFLOW_PARENT_PAGE_NUMBER +from sqlite_dissect.constants import FREEBLOCK_BYTE_LENGTH +from sqlite_dissect.constants import FREELIST_HEADER_LENGTH +from sqlite_dissect.constants import FREELIST_LEAF_PAGE_NUMBER_LENGTH +from sqlite_dissect.constants import FREELIST_NEXT_TRUNK_PAGE_LENGTH +from sqlite_dissect.constants import INDEX_INTERIOR_CELL_CLASS +from sqlite_dissect.constants import INDEX_INTERIOR_PAGE_HEX_ID +from sqlite_dissect.constants import INDEX_LEAF_CELL_CLASS +from sqlite_dissect.constants import INDEX_LEAF_PAGE_HEX_ID +from sqlite_dissect.constants import INTERIOR_PAGE_HEADER_CLASS +from sqlite_dissect.constants import LEAF_PAGE_HEADER_CLASS +from sqlite_dissect.constants import LEFT_CHILD_POINTER_BYTE_LENGTH +from sqlite_dissect.constants import LOGGER_NAME +from sqlite_dissect.constants import MASTER_PAGE_HEX_ID +from sqlite_dissect.constants import NEXT_FREEBLOCK_OFFSET_LENGTH +from sqlite_dissect.constants import OVERFLOW_HEADER_LENGTH +from sqlite_dissect.constants import PAGE_FRAGMENT_LIMIT +from sqlite_dissect.constants import PAGE_HEADER_MODULE +from sqlite_dissect.constants import PAGE_TYPE +from sqlite_dissect.constants import PAGE_TYPE_LENGTH +from sqlite_dissect.constants import POINTER_MAP_B_TREE_NON_ROOT_PAGE_TYPE +from sqlite_dissect.constants import POINTER_MAP_B_TREE_ROOT_PAGE_TYPE +from sqlite_dissect.constants import POINTER_MAP_ENTRY_LENGTH +from sqlite_dissect.constants import POINTER_MAP_FREELIST_PAGE_TYPE +from sqlite_dissect.constants import POINTER_MAP_OVERFLOW_FIRST_PAGE_TYPE +from sqlite_dissect.constants import POINTER_MAP_OVERFLOW_FOLLOWING_PAGE_TYPE +from sqlite_dissect.constants import POINTER_MAP_PAGE_TYPES +from sqlite_dissect.constants import SQLITE_DATABASE_HEADER_LENGTH +from sqlite_dissect.constants import SQLITE_MASTER_SCHEMA_ROOT_PAGE +from sqlite_dissect.constants import TABLE_INTERIOR_CELL_CLASS +from sqlite_dissect.constants import TABLE_INTERIOR_PAGE_HEX_ID +from sqlite_dissect.constants import TABLE_LEAF_CELL_CLASS +from sqlite_dissect.constants import TABLE_LEAF_PAGE_HEX_ID +from sqlite_dissect.constants import ZERO_BYTE +from sqlite_dissect.exception import BTreePageParsingError +from sqlite_dissect.exception import CellParsingError +from sqlite_dissect.exception import PageParsingError +from sqlite_dissect.file.database.payload import decode_varint +from sqlite_dissect.file.database.payload import Record +from sqlite_dissect.utilities import calculate_expected_overflow +from sqlite_dissect.utilities import get_class_instance +from sqlite_dissect.utilities import get_md5_hash + +""" + +page.py + +This script holds the Page and Cell related objects for parsing out the different types of SQLite pages in the +SQLite database file. This also includes freeblock and fragment related objects. 
+ +This script holds the following object(s): +Page(object) +OverflowPage(Page) +FreelistTrunkPage(Page) +FreelistLeafPage(Page) +PointerMapPage(Page) +PointerMapEntry(object) +BTreePage(Page) +TableInteriorPage(BTreePage) +TableLeafPage(BTreePage) +IndexInteriorPage(BTreePage) +IndexLeafPage(BTreePage) +BTreeCell(object) +TableInteriorCell(BTreeCell) +TableLeafCell(BTreeCell) +IndexInteriorCell(BTreeCell) +IndexLeafCell(BTreeCell) +Freeblock(BTreeCell) +Fragment(BTreeCell) + +Note: In some places, like with unallocated data on the page, it was decided to not store this data in memory + and pull it from the file on demand and/or calculate information from it if needed on demand. This was done + to prevent the memory used by this program becoming bloated with unneeded data. + +Assumptions: +1.) OverflowPage: All overflow pages are replaced in a chain on modification. This assumes that whenever a cell is + modified, that even if the content of the overflow portion does not change, the whole cell including + overflow need to be replaced due to the way the cells are stored in SQLite. + +""" + + +class Page(object): + + __metaclass__ = ABCMeta + + def __init__(self, version_interface, number): + + self._logger = getLogger(LOGGER_NAME) + + self._version_interface = version_interface + self.version_number = self._version_interface.version_number + self.page_version_number = self._version_interface.get_page_version(number) + self.number = number + self.page_type = None + self.offset = self._version_interface.get_page_offset(self.number) + self.size = self._version_interface.page_size + self.md5_hex_digest = None + self.unallocated_space_start_offset = None + self.unallocated_space_end_offset = None + + def __repr__(self): + return self.__str__().encode("hex") + + def __str__(self): + return sub("\t", "", sub("\n", " ", self.stringify())) + + def stringify(self, padding=""): + string = padding + "Version Number: {}\n" \ + + padding + "Page Version Number: {}\n" \ + + padding + "Number: {}\n" \ + + padding + "Page Type: {}\n" \ + + padding + "Offset: {}\n" \ + + padding + "Size: {}\n" \ + + padding + "MD5 Hex Digest: {}\n" \ + + padding + "Unallocated Space Start Offset: {}\n" \ + + padding + "Unallocated Space End Offset: {}\n" \ + + padding + "Unallocated Space Size: {}\n" \ + + padding + "Unallocated Content MD5 Hex Digest: {}\n" \ + + padding + "Unallocated Content (Hex): {}" + return string.format(self.version_number, + self.page_version_number, + self.number, + self.page_type, + self.offset, + self.size, + self.md5_hex_digest, + self.unallocated_space_start_offset, + self.unallocated_space_end_offset, + self.unallocated_space_length, + self.unallocated_space_md5_hex_digest, + hexlify(self.unallocated_space)) + + @property + def unallocated_space(self): + + """ + + This property returns the unallocated space inside this page. + + :return: bytearray The byte array for unallocated space. + + """ + + if self.unallocated_space_length == 0: + return bytearray() + else: + return self._version_interface.get_page_data(self.number, self.unallocated_space_start_offset, + self.unallocated_space_length) + + @property + def unallocated_space_md5_hex_digest(self): + + """ + + This method will compute the md5 hash of the unallocated space of this page and return it. This is + calculated when called instead of before hand since this is a superclass and does not know where the + unallocated space starts and ends at time of creation. 
Although this could be computed and stored the first + time it is called, it was decided to always compute when called. + + :return: string The hexadecimal md5 hash string. + + """ + + return get_md5_hash(self.unallocated_space) + + @property + def unallocated_space_length(self): + + """ + + This property will compute the unallocated space length of this page and return it. This is calculated + when called instead of before hand since this is a superclass and does not know the unallocated space + start and end offsets at time of creation. + + :return: int The unallocated space length. + + """ + + # Return the length of the unallocated space on this page + return self.unallocated_space_end_offset - self.unallocated_space_start_offset + + +class OverflowPage(Page): + + def __init__(self, version_interface, number, parent_cell_page_number, parent_overflow_page_number, + index, payload_remaining): + + super(OverflowPage, self).__init__(version_interface, number) + + self.page_type = PAGE_TYPE.OVERFLOW + + if payload_remaining <= 0: + log_message = "No payload remaining when overflow page initialized for version number: {} page number: {}." + log_message = log_message.format(self.version_number, self.number) + self._logger.error(log_message) + raise PageParsingError(log_message) + + page = self._version_interface.get_page_data(self.number) + + self.parent_cell_page_number = parent_cell_page_number + self.parent_overflow_page_number = parent_overflow_page_number + self.index = index + self.next_overflow_page_number = unpack(b">I", page[:OVERFLOW_HEADER_LENGTH])[0] + + self.unallocated_space_start_offset = self.size + self.unallocated_space_end_offset = self.size + self.md5_hex_digest = get_md5_hash(page) + + if payload_remaining <= self.size - OVERFLOW_HEADER_LENGTH: + + # This was found to be the last overflow page in the chain. Make sure there are no other overflow pages. + if self.next_overflow_page_number: + log_message = "Additional overflow page number: {} found for version number: {} " \ + "page version number: {} page number: {} when no more overflow pages were expected." + log_message = log_message.format(self.next_overflow_page_number, self.version_number, + self.page_version_number, self.number) + self._logger.error(log_message) + raise PageParsingError(log_message) + + self.unallocated_space_start_offset = payload_remaining + OVERFLOW_HEADER_LENGTH + + if self.next_overflow_page_number: + + """ + + Here we make the assumption that all overflow pages have to be replaced when any overflow page in a chain + is updated. In other words, when a overflow chain is changed in a version, all overflow pages in that chain + belong to that version. This is due to the face that all overflow pages in a chain pertain to a cell that + was modified and therefore all overflow pages belonging to that record need to be reinserted even if the + same as before. + + Here we check the version of the overflow page that this one points to. If the versions of the two pages + are different we throw an exception. + + Since overflow pages are in a chain, this check is done on each creation of the next overflow page for the + following overflow page if it exists. 
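+
+            As a rough, illustrative sketch of the chain accounting used above (these helper names are
+            not part of this module; the cell classes actually rely on calculate_expected_overflow from
+            sqlite_dissect.utilities), each overflow page holds a 4 byte next page number followed by
+            content, so:
+
+                OVERFLOW_HEADER_LENGTH = 4
+
+                def usable_overflow_bytes(page_size):
+                    # Payload bytes that fit on one overflow page after the 4 byte next page number.
+                    return page_size - OVERFLOW_HEADER_LENGTH
+
+                def overflow_pages_needed(overflow_byte_size, page_size):
+                    # Ceiling division; the last page in the chain may be only partially filled.
+                    usable = usable_overflow_bytes(page_size)
+                    return (overflow_byte_size + usable - 1) // usable
+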
+ + """ + + next_overflow_page_version = self._version_interface.get_page_version(self.next_overflow_page_number) + if self.page_version_number != next_overflow_page_version: + log_message = "The version of the current overflow page: {} on version: {} on page: {} has points to " \ + "a next overflow page version: {} for page: {} that has a different version." + log_message = log_message.format(self.page_version_number, self.version_number, self.number, + next_overflow_page_version, self.next_overflow_page_number) + self._logger.error(log_message) + raise PageParsingError(log_message) + + def stringify(self, padding=""): + string = "\n" \ + + padding + "Parent Cell Page Number: {}\n" \ + + padding + "Parent Overflow Page Number: {}\n" \ + + padding + "Index: {}\n" \ + + padding + "Next Overflow Page Number: {}\n" \ + + padding + "Content Length: {}\n" \ + + padding + "Content (Hex): {}" + string = string.format(self.parent_cell_page_number, + self.parent_overflow_page_number, + self.index, + self.next_overflow_page_number, + self.content_length, + hexlify(self.content)) + return super(OverflowPage, self).stringify(padding) + string + + @property + def content(self): + return self._version_interface.get_page_data(self.number, OVERFLOW_HEADER_LENGTH, self.content_length) + + @property + def content_length(self): + return self.unallocated_space_start_offset - OVERFLOW_HEADER_LENGTH + + +class FreelistTrunkPage(Page): + + def __init__(self, version_interface, number, parent_freelist_trunk_page_number, index): + + super(FreelistTrunkPage, self).__init__(version_interface, number) + + self.page_type = PAGE_TYPE.FREELIST_TRUNK + + self.parent_freelist_trunk_page_number = parent_freelist_trunk_page_number + self.index = index + + page = self._version_interface.get_page_data(self.number) + + self.next_freelist_trunk_page_number = unpack(b">I", page[:FREELIST_NEXT_TRUNK_PAGE_LENGTH])[0] + self.number_of_leaf_page_pointers = unpack(b">I", page[FREELIST_NEXT_TRUNK_PAGE_LENGTH: + FREELIST_HEADER_LENGTH])[0] + self.freelist_leaf_page_numbers = [] + self.freelist_leaf_pages = [] + for index in range(self.number_of_leaf_page_pointers): + start_offset = index * FREELIST_LEAF_PAGE_NUMBER_LENGTH + FREELIST_HEADER_LENGTH + end_offset = start_offset + FREELIST_LEAF_PAGE_NUMBER_LENGTH + freelist_leaf_page_number = unpack(b">I", page[start_offset:end_offset])[0] + + """ + + Note: Freelist leaf pages can be in previous commit records to the commit record this current freelist trunk + page is in or commit records up to the main commit record version if applicable. + + """ + + freelist_leaf_page = FreelistLeafPage(self._version_interface, freelist_leaf_page_number, + self.number, index) + + self.freelist_leaf_page_numbers.append(freelist_leaf_page_number) + self.freelist_leaf_pages.append(freelist_leaf_page) + + if len(self.freelist_leaf_page_numbers) != self.number_of_leaf_page_pointers: + log_message = "In freelist trunk page: {} with page version: {} in version: {} found a different amount " \ + "of freelist leaf page numbers: {} than freelist leaf page pointers: {} found on the page." 
+ log_message = log_message.format(self.number, self.page_version_number, self.version_number, + len(self.freelist_leaf_page_numbers), self.number_of_leaf_page_pointers) + self._logger.error(log_message) + raise PageParsingError(log_message) + + freelist_leaf_page_numbers_size = self.number_of_leaf_page_pointers * FREELIST_LEAF_PAGE_NUMBER_LENGTH + self.unallocated_space_start_offset = FREELIST_HEADER_LENGTH + freelist_leaf_page_numbers_size + self.unallocated_space_end_offset = self.size + + self.md5_hex_digest = get_md5_hash(page) + + self.next_freelist_trunk_page = None + if self.next_freelist_trunk_page_number: + + """ + + Here we make the assumption that a freelist trunk page can be updated without updating following freelist + trunk pages in the linked list. Since this is an "allowed" assumption, a print statement will print a log + info message that this happens and once we observe it, we can then declare it is no longer an assumption. + + """ + + next_freelist_trunk_page_version_number = self._version_interface.get_page_version( + self.next_freelist_trunk_page_number) + if self.page_version_number > next_freelist_trunk_page_version_number: + log_message = "Found a freelist trunk page: {} that has page version: {} in version: {} that points " \ + "to an earlier freelist trunk page version: {}." + log_message = log_message.format(self.number, self.page_version_number, self.version_number, + next_freelist_trunk_page_version_number) + self._logger.info(log_message) + + self.next_freelist_trunk_page = FreelistTrunkPage(self._version_interface, + self.next_freelist_trunk_page_number, + self.number, self.index + 1) + + def stringify(self, padding=""): + string = "\n" \ + + padding + "Parent Freelist Trunk Page Number: {}\n" \ + + padding + "Index: {}\n" \ + + padding + "Next Freelist Trunk Page Number: {}\n" \ + + padding + "Number of Leaf Page Pointers: {}\n" \ + + padding + "Freelist Leaf Page Numbers: {}\n" \ + + padding + "Freelist Leaf Pages length: {}" + string = string.format(self.parent_freelist_trunk_page_number, + self.index, + self.next_freelist_trunk_page_number, + self.number_of_leaf_page_pointers, + self.freelist_leaf_page_numbers, + len(self.freelist_leaf_pages)) + for freelist_leaf_page in self.freelist_leaf_pages: + string += "\n" + padding + "Freelist Leaf Page:\n{}".format(freelist_leaf_page.stringify(padding + "\t")) + if self.next_freelist_trunk_page: + string += "\n" + padding \ + + "Next Freelist Trunk Page:\n{}".format(self.next_freelist_trunk_page.stringify(padding + "\t")) + return super(FreelistTrunkPage, self).stringify(padding) + string + + +class FreelistLeafPage(Page): + + def __init__(self, version_interface, number, parent_freelist_trunk_page_number, index): + + super(FreelistLeafPage, self).__init__(version_interface, number) + + self.page_type = PAGE_TYPE.FREELIST_LEAF + + self.parent_freelist_trunk_page_number = parent_freelist_trunk_page_number + self.index = index + + self.unallocated_space_start_offset = 0 + self.unallocated_space_end_offset = self.size + + page = self._version_interface.get_page_data(self.number) + self.md5_hex_digest = get_md5_hash(page) + + def stringify(self, padding=""): + string = "\n" \ + + padding + "Parent Freelist Trunk Page Number: {}\n" \ + + padding + "Index: {}" + string = string.format(self.parent_freelist_trunk_page_number, + self.index) + return super(FreelistLeafPage, self).stringify(padding) + string + + +class PointerMapPage(Page): + + def __init__(self, version_interface, number, number_of_entries): + + 
super(PointerMapPage, self).__init__(version_interface, number) + + self.page_type = PAGE_TYPE.POINTER_MAP + + page = self._version_interface.get_page_data(self.number) + + self.number_of_entries = number_of_entries + + self.unallocated_space_start_offset = self.number_of_entries * POINTER_MAP_ENTRY_LENGTH + self.unallocated_space_end_offset = self.size + + self.md5_hex_digest = get_md5_hash(page) + + self.pointer_map_entries = [] + for index in range(self.number_of_entries): + + offset = index * POINTER_MAP_ENTRY_LENGTH + + if offset >= self.size: + log_message = "For pointer map page: {} for page version: {} and version: {} the offset: {} " \ + "was found to greater or equal to the page size: {} on index: {}." + log_message = log_message.format(self.number, self.page_version_number, self.version_number, + offset, self.size, index) + self._logger.error(log_message) + raise PageParsingError(log_message) + + page_type = page[offset:offset + PAGE_TYPE_LENGTH] + if page_type == ZERO_BYTE: + log_message = "The page type was found to be empty for pointer map page: {} for page version: {} " \ + "and version: {} on index: {} and offset: {}." + log_message = log_message.format(self.number, self.page_version_number, self.version_number, + index, offset) + self._logger.error(log_message) + raise PageParsingError(log_message) + + elif offset + POINTER_MAP_ENTRY_LENGTH > self.size: + log_message = "The offset {} and pointer map length: {} go beyond the page size: {} for pointer " \ + "map page: {} for page version: {} and version: {} on index: {}." + log_message = log_message.format(offset, POINTER_MAP_ENTRY_LENGTH, self.size, self.number, + self.page_version_number, self.version_number, index) + self._logger.error(log_message) + raise PageParsingError(log_message) + + elif page_type not in POINTER_MAP_PAGE_TYPES: + log_message = "The page type was not recognized: {} as a valid pointer map page type for " \ + "pointer map page: {} for page version: {} and version: {} on index: {} and offset: {}." + log_message = log_message.format(hexlify(page_type), self.number, self.page_version_number, + self.version_number, index, offset) + self._logger.error(log_message) + raise PageParsingError(log_message) + + parent_page_number = unpack(b">I", page[offset + PAGE_TYPE_LENGTH:offset + POINTER_MAP_ENTRY_LENGTH])[0] + + if page_type in [POINTER_MAP_B_TREE_ROOT_PAGE_TYPE, POINTER_MAP_FREELIST_PAGE_TYPE] and parent_page_number: + log_message = "The page type: {} has a parent page number: {} which is invalid for " \ + "pointer map page: {} for page version: {} and version: {} on index: {} and offset: {}." + log_message = log_message.format(hexlify(page_type), parent_page_number, self.number, + self.page_version_number, self.version_number, index, offset) + self._logger.error(log_message) + raise PageParsingError(log_message) + + elif page_type in [POINTER_MAP_OVERFLOW_FIRST_PAGE_TYPE, POINTER_MAP_OVERFLOW_FOLLOWING_PAGE_TYPE, + POINTER_MAP_B_TREE_NON_ROOT_PAGE_TYPE] and not parent_page_number: + log_message = "The page type: {} does not have a parent page number which is invalid for " \ + "pointer map page: {} for page version: {} and version: {} on index: {} and offset: {}." 
+ log_message = log_message.format(hexlify(page_type), self.number, self.page_version_number, + self.version_number, index, offset) + self._logger.error(log_message) + raise PageParsingError(log_message) + + pointer_map_entry_md5_hex_digest = get_md5_hash(page[offset:offset + POINTER_MAP_ENTRY_LENGTH]) + + page_number = number + index + 1 + pointer_map_entry = PointerMapEntry(index, offset, page_number, page_type, parent_page_number, + pointer_map_entry_md5_hex_digest) + self.pointer_map_entries.append(pointer_map_entry) + + if len(self.pointer_map_entries) != self.number_of_entries: + log_message = "In pointer map page: {} with page version: {} in version: {} found a different amount " \ + "of pointer map entries: {} than expected number of entries: {} found on the page." + log_message = log_message.format(self.number, self.page_version_number, self.version_number, + len(self.pointer_map_entries), self.number_of_entries) + self._logger.error(log_message) + raise PageParsingError(log_message) + + remaining_space_offset = self.number_of_entries * POINTER_MAP_ENTRY_LENGTH + if remaining_space_offset != self.unallocated_space_start_offset: + log_message = "The remaining space offset: {} is not equal to the unallocated space start offset: {} " \ + "for pointer map page: {} for page version: {} and version: {}." + log_message = log_message.format(remaining_space_offset, self.unallocated_space_start_offset, self.number, + self.page_version_number, self.version_number) + self._logger.error(log_message) + raise PageParsingError(log_message) + + """ + + Originally here the remaining space was checked to see if it was all zeros, and if not an exception was thrown. + This has since been removed since it was realized that this unallocated space can contain information resulting + in non-zero unallocated space. + + It was realized that when a database increases in size and then decreases due to auto-vacuuming where freelist + pages are truncated from the end of the database, the pointer information from those previous pages remain. + + This information may give an idea into what pages were removed and how they were previously structured. This + data should probably be parsed and investigated during the unallocated carving specific to pointer map pages. + + The patterns still need to match 5 bytes, first byte being the pointer map page type and the second 4 bytes + being the page number (if existing). This could give an idea of how big the database was previously but will + only give the max size at any point in time since it does not appear that the pointer map pages are zero'd out + at any point and are just overwritten if need be. + + There may still may be non-pointer map data included beyond the pointer map entries that does not fit the 5 + byte patterns. For example page 2 where the first pointer map page was placed was previously a b-tree page + before vacuuming was turned on. However, there are other details where auto-vacuuming is only possible is + turned on before table creation. More research will have to be done here for exactly how everything here works. + The page may also be zero'd out at a time such as this as well. 
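+
+        A minimal sketch of what such a scan of the trailing space could look like
+        (scan_stale_pointer_map_entries is purely illustrative and not part of this module; any real
+        carving of this space would live in the carving code rather than here, and trailing_bytes is
+        assumed to be the raw bytes beyond the parsed entries):
+
+            from struct import unpack
+
+            def scan_stale_pointer_map_entries(trailing_bytes, valid_page_types):
+                # Walk the unallocated tail of a pointer map page in 5 byte steps (1 byte page type
+                # followed by a 4 byte big-endian page number) and keep anything that still matches.
+                entries = []
+                for offset in range(0, len(trailing_bytes) - 4, 5):
+                    page_type = trailing_bytes[offset:offset + 1]
+                    if page_type in valid_page_types:
+                        page_number = unpack(b">I", trailing_bytes[offset + 1:offset + 5])[0]
+                        entries.append((offset, page_type, page_number))
+                return entries
+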
+ + """ + + def stringify(self, padding=""): + string = "\n" \ + + padding + "Number of Entries: {}\n" \ + + padding + "Pointer Map Entries Size: {}" + string = string.format(self.number_of_entries, + len(self.pointer_map_entries)) + for pointer_map_entry in self.pointer_map_entries: + string += "\n" + padding + "Pointer Map Entry:\n{}".format(pointer_map_entry.stringify(padding + "\t")) + return super(PointerMapPage, self).stringify(padding) + string + + +class PointerMapEntry(object): + + def __init__(self, index, offset, page_number, page_type, parent_page_number, md5_hex_digest): + self.index = index + self.offset = offset + self.page_number = page_number + self.page_type = page_type + self.parent_page_number = parent_page_number + self.md5_hex_digest = md5_hex_digest + + def __repr__(self): + return self.__str__().encode("hex") + + def __str__(self): + return sub("\t", "", sub("\n", " ", self.stringify())) + + def stringify(self, padding=""): + string = padding + "Index: {}\n" \ + + padding + "Offset: {}\n" \ + + padding + "Page Number: {}\n" \ + + padding + "Page Type: {}\n" \ + + padding + "Parent Page Number: {}\n" \ + + padding + "MD5 Hex Digest: {}" + return string.format(self.index, + self.offset, + self.page_number, + self.page_type, + self.parent_page_number, + self.md5_hex_digest) + + +class BTreePage(Page): + + __metaclass__ = ABCMeta + + def __init__(self, version_interface, number, header_class_name, cell_class_name): + + super(BTreePage, self).__init__(version_interface, number) + + page = self._version_interface.get_page_data(self.number) + + self.page_type = None + self.hex_type = page[0] + + if self.hex_type == MASTER_PAGE_HEX_ID: + master_page_hex_type = page[SQLITE_DATABASE_HEADER_LENGTH] + if master_page_hex_type == TABLE_INTERIOR_PAGE_HEX_ID: + self.page_type = PAGE_TYPE.B_TREE_TABLE_INTERIOR + elif master_page_hex_type == TABLE_LEAF_PAGE_HEX_ID: + self.page_type = PAGE_TYPE.B_TREE_TABLE_LEAF + else: + log_message = "Page hex type for master page is: {} and not a table interior or table leaf page as " \ + "expected in b-tree page: {} in page version: {} for version: {}." + log_message = log_message.format(hexlify(master_page_hex_type), self.number, + self.page_version_number, self.version_number) + self._logger.error(log_message) + raise BTreePageParsingError(log_message) + + elif self.hex_type == TABLE_INTERIOR_PAGE_HEX_ID: + self.page_type = PAGE_TYPE.B_TREE_TABLE_INTERIOR + elif self.hex_type == TABLE_LEAF_PAGE_HEX_ID: + self.page_type = PAGE_TYPE.B_TREE_TABLE_LEAF + elif self.hex_type == INDEX_INTERIOR_PAGE_HEX_ID: + self.page_type = PAGE_TYPE.B_TREE_INDEX_INTERIOR + elif self.hex_type == INDEX_LEAF_PAGE_HEX_ID: + self.page_type = PAGE_TYPE.B_TREE_INDEX_LEAF + else: + log_message = "Page hex type: {} is not a valid b-tree page type for b-tree page: {} in page version: {} " \ + "for version: {}." 
+ log_message = log_message.format(hexlify(self.hex_type), self.number, self.page_version_number, + self.version_number) + self._logger.error(log_message) + raise BTreePageParsingError(log_message) + + header_class = get_class_instance(header_class_name) + cell_class = get_class_instance(cell_class_name) + + self.header = header_class(page) + + cell_pointer_array_offset = self.header.header_length + if self.header.contains_sqlite_database_header: + cell_pointer_array_offset += SQLITE_DATABASE_HEADER_LENGTH + + if self.number != SQLITE_MASTER_SCHEMA_ROOT_PAGE: + log_message = "B-tree page found to contain the sqlite database header but is not the root page for " \ + "b-tree page: {} in page version: {} for version: {}." + log_message = log_message.format(self.number, self.page_version_number, self.version_number) + self._logger.error(log_message) + raise BTreePageParsingError(log_message) + + cell_pointer_array_length = self.header.number_of_cells_on_page * CELL_POINTER_BYTE_LENGTH + self.unallocated_space_start_offset = cell_pointer_array_offset + cell_pointer_array_length + self.unallocated_space_end_offset = self.header.cell_content_offset + + adjusted_header_length = self.header.header_length + if self.header.contains_sqlite_database_header: + adjusted_header_length += SQLITE_DATABASE_HEADER_LENGTH + preface_size = adjusted_header_length + cell_pointer_array_length + + if preface_size != self.unallocated_space_start_offset: + log_message = "The calculated preface size: {} is not equal to the unallocated space start offset: {} " \ + "for b-tree page: {} in page version: {} for version: {}." + log_message = log_message.format(preface_size, self.unallocated_space_start_offset, self.number, + self.page_version_number, self.version_number) + self._logger.error(log_message) + raise BTreePageParsingError(log_message) + + if self.header.cell_content_offset != self.unallocated_space_end_offset: + log_message = "The cell content offset in the header: {} is not equal to the unallocated space end " \ + "offset: {} for b-tree page: {} in page version: {} for version: {}." + log_message = log_message.format(self.header.cell_content_offset, self.unallocated_space_end_offset, + self.number, self.page_version_number, self.version_number) + self._logger.error(log_message) + raise BTreePageParsingError(log_message) + + self.cells = [] + self.calculated_cell_total_byte_size = 0 + for cell_index in range(self.header.number_of_cells_on_page): + cell_start_offset = cell_pointer_array_offset + cell_index * CELL_POINTER_BYTE_LENGTH + cell_end_offset = cell_start_offset + CELL_POINTER_BYTE_LENGTH + cell_offset = unpack(b">H", page[cell_start_offset:cell_end_offset])[0] + file_offset = self.offset + cell_offset + cell_instance = cell_class(self._version_interface, self.page_version_number, file_offset, self.number, + page, cell_index, cell_offset) + self.cells.append(cell_instance) + if type(cell_instance) != TableInteriorCell and cell_instance.has_overflow: + overflow_adjusted_page_size = cell_instance.end_offset - cell_instance.start_offset + self.calculated_cell_total_byte_size += overflow_adjusted_page_size + else: + self.calculated_cell_total_byte_size += cell_instance.byte_size + + if len(self.cells) != self.header.number_of_cells_on_page: + log_message = "The number of cells parsed: {} does not equal the number of cells specified in the " \ + "header: {} for b-tree page: {} in page version: {} for version: {}." 
+ log_message = log_message.format(len(self.cells), self.header.number_of_cells_on_page, + self.number, self.page_version_number, self.version_number) + self._logger.error(log_message) + raise BTreePageParsingError(log_message) + + # Check if there are freeblocks specified in the header (0 if no freeblocks) + self.freeblocks = [] + self.calculated_freeblock_total_byte_size = 0 + if self.header.first_freeblock_offset != 0: + freeblock_index = 0 + next_freeblock_offset = self.header.first_freeblock_offset + file_offset = self.offset + next_freeblock_offset + while next_freeblock_offset: + freeblock = Freeblock(self._version_interface, self.page_version_number, file_offset, self.number, page, + freeblock_index, next_freeblock_offset) + self.freeblocks.append(freeblock) + next_freeblock_offset = freeblock.next_freeblock_offset + self.calculated_freeblock_total_byte_size += freeblock.byte_size + freeblock_index += 1 + + # Find fragments + self.fragments = [] + self.calculated_fragment_total_byte_size = 0 + fragment_index = 0 + aggregated_cells = sorted(self.cells + self.freeblocks, key=lambda b_tree_cell: b_tree_cell.start_offset) + last_accounted_for_offset = self.unallocated_space_end_offset + for cell in aggregated_cells: + if last_accounted_for_offset >= self.size: + log_message = "The last accounted for offset: {} while determining fragments is greater than or " \ + "equal to the page size: {} for b-tree page: {} in page version: {} for version: {}." + log_message = log_message.format(last_accounted_for_offset, self.size, self.number, + self.page_version_number, self.version_number) + self._logger.error(log_message) + raise BTreePageParsingError(log_message) + + if cell.start_offset != last_accounted_for_offset: + file_offset = self.offset + last_accounted_for_offset + fragment = Fragment(self._version_interface, self.page_version_number, file_offset, self.number, page, + fragment_index, last_accounted_for_offset, cell.start_offset) + self.fragments.append(fragment) + self.calculated_fragment_total_byte_size += fragment.byte_size + fragment_index += 1 + last_accounted_for_offset = cell.end_offset + + if self.header.number_of_fragmented_free_bytes > PAGE_FRAGMENT_LIMIT: + log_message = "The number of fragmented free bytes: {} is greater than the page fragment limit: {} " \ + "for b-tree page: {} in page version: {} for version: {}." + log_message = log_message.format(self.header.number_of_fragmented_free_bytes, PAGE_FRAGMENT_LIMIT, + self.number, self.page_version_number, self.version_number) + self._logger.error(log_message) + raise BTreePageParsingError(log_message) + + if self.calculated_fragment_total_byte_size != self.header.number_of_fragmented_free_bytes: + log_message = "The calculated fragment total byte size: {} does not equal the number of fragmented free " \ + "bytes specified in the header: {} for b-tree page: {} in page version: {} for version: {}." 
+ log_message = log_message.format(self.calculated_fragment_total_byte_size, + self.header.number_of_fragmented_free_bytes, + self.number, self.page_version_number, self.version_number) + self._logger.error(log_message) + if version_interface.strict_format_checking: + raise BTreePageParsingError(log_message) + else: + warn(log_message, RuntimeWarning) + + # Account for all space within the page + unallocated_space_size = self.unallocated_space_end_offset - self.unallocated_space_start_offset + body_size = self.calculated_cell_total_byte_size + body_size += self.calculated_freeblock_total_byte_size + self.calculated_fragment_total_byte_size + + accounted_for_space = preface_size + unallocated_space_size + body_size + if accounted_for_space != self.size: + log_message = "The calculated accounted for space: {} does not equal the page size: {} " \ + "for b-tree page: {} in page version: {} for version: {}." + log_message = log_message.format(accounted_for_space, self.size, self.number, + self.page_version_number, self.version_number) + self._logger.error(log_message) + if version_interface.strict_format_checking: + raise BTreePageParsingError(log_message) + else: + warn(log_message, RuntimeWarning) + + self.md5_hex_digest = get_md5_hash(page) + + def stringify(self, padding=""): + string = "\n" \ + + padding + "Hex Type (Hex): {}\n" \ + + padding + "Header:\n{}\n"\ + + padding + "Cells Length: {}\n" \ + + padding + "Calculated Cell Total Byte Size: {}\n" \ + + padding + "Freeblocks Length: {}\n" \ + + padding + "Calculated Freeblock Total Byte Size: {}\n" \ + + padding + "Fragments Length: {}\n" \ + + padding + "Calculated Fragment Total Byte Size: {}" + string = string.format(hexlify(self.hex_type), + self.header.stringify(padding + "\t"), + len(self.cells), + self.calculated_cell_total_byte_size, + len(self.freeblocks), + self.calculated_freeblock_total_byte_size, + len(self.fragments), + self.calculated_fragment_total_byte_size) + for cell in self.cells: + string += "\n" + padding + "Cell:\n{}".format(cell.stringify(padding + "\t")) + for freeblock in self.freeblocks: + string += "\n" + padding + "Freeblock:\n{}".format(freeblock.stringify(padding + "\t")) + for fragment in self.fragments: + string += "\n" + padding + "Fragment:\n{}".format(fragment.stringify(padding + "\t")) + return super(BTreePage, self).stringify(padding) + string + + +class TableInteriorPage(BTreePage): + + def __init__(self, version_interface, number): + header_class_name = "{}.{}".format(PAGE_HEADER_MODULE, INTERIOR_PAGE_HEADER_CLASS) + cell_class_name = "{}.{}".format(CELL_MODULE, TABLE_INTERIOR_CELL_CLASS) + super(TableInteriorPage, self).__init__(version_interface, number, header_class_name, cell_class_name) + + """ + + Note: A table interior page can be updated without updating the right most pointer page in a version. + + """ + + if not self.header.right_most_pointer: + log_message = "The right most pointer is not set for b-tree table interior page: {} " \ + "in page version: {} for version: {}." 
+ log_message = log_message.format(self.number, self.page_version_number, self.version_number) + self._logger.error(log_message) + raise BTreePageParsingError(log_message) + + right_most_pointer_page_hex_type = self._version_interface.get_page_data(self.header.right_most_pointer, + 0, PAGE_TYPE_LENGTH) + + if right_most_pointer_page_hex_type == TABLE_INTERIOR_PAGE_HEX_ID: + self.right_most_page = TableInteriorPage(self._version_interface, self.header.right_most_pointer) + elif right_most_pointer_page_hex_type == TABLE_LEAF_PAGE_HEX_ID: + self.right_most_page = TableLeafPage(self._version_interface, self.header.right_most_pointer) + else: + log_message = "The right most pointer does not point to a table interior or leaf page but instead has " \ + "a hex type of: {} for b-tree table interior page: {} in page version: {} for version: {}." + log_message = log_message.format(hexlify(right_most_pointer_page_hex_type), self.number, + self.page_version_number, self.version_number) + self._logger.error(log_message) + raise BTreePageParsingError(log_message) + + def stringify(self, padding=""): + string = "\n" + padding + "Right Most Page:\n{}" + string = string.format(self.right_most_page.stringify(padding + "\t") if self.right_most_page else None) + return super(TableInteriorPage, self).stringify(padding) + string + + +class TableLeafPage(BTreePage): + + def __init__(self, version, number): + header_class_name = "{}.{}".format(PAGE_HEADER_MODULE, LEAF_PAGE_HEADER_CLASS) + cell_class_name = "{}.{}".format(CELL_MODULE, TABLE_LEAF_CELL_CLASS) + super(TableLeafPage, self).__init__(version, number, header_class_name, cell_class_name) + + +class IndexInteriorPage(BTreePage): + + def __init__(self, version, number): + + header_class_name = "{}.{}".format(PAGE_HEADER_MODULE, INTERIOR_PAGE_HEADER_CLASS) + cell_class_name = "{}.{}".format(CELL_MODULE, INDEX_INTERIOR_CELL_CLASS) + super(IndexInteriorPage, self).__init__(version, number, header_class_name, cell_class_name) + + """ + + Note: A index interior page can be updated without updating the right most pointer page in a version. + + """ + + if not self.header.right_most_pointer: + log_message = "The right most pointer is not set for b-tree index interior page: {} " \ + "in page version: {} for version: {}." + log_message = log_message.format(self.number, self.page_version_number, self.version_number) + self._logger.error(log_message) + raise BTreePageParsingError(log_message) + + right_most_pointer_page_hex_type = self._version_interface.get_page_data(self.header.right_most_pointer, + 0, PAGE_TYPE_LENGTH) + + if right_most_pointer_page_hex_type == INDEX_INTERIOR_PAGE_HEX_ID: + self.right_most_page = IndexInteriorPage(self._version_interface, self.header.right_most_pointer) + elif right_most_pointer_page_hex_type == INDEX_LEAF_PAGE_HEX_ID: + self.right_most_page = IndexLeafPage(self._version_interface, self.header.right_most_pointer) + else: + log_message = "The right most pointer does not point to a index interior or leaf page but instead has " \ + "a hex type of: {} for b-tree index interior page: {} in page version: {} for version: {}." 
+ log_message = log_message.format(hexlify(right_most_pointer_page_hex_type), self.number, + self.page_version_number, self.version_number) + self._logger.error(log_message) + raise BTreePageParsingError(log_message) + + def stringify(self, padding=""): + string = "\n" + padding + "Right Most Page:\n{}" + string = string.format(self.right_most_page.stringify(padding + "\t") if self.right_most_page else None) + return super(IndexInteriorPage, self).stringify(padding) + string + + +class IndexLeafPage(BTreePage): + + def __init__(self, version, number): + header_class_name = "{}.{}".format(PAGE_HEADER_MODULE, LEAF_PAGE_HEADER_CLASS) + cell_class_name = "{}.{}".format(CELL_MODULE, INDEX_LEAF_CELL_CLASS) + super(IndexLeafPage, self).__init__(version, number, header_class_name, cell_class_name) + + +class BTreeCell(object): + + __metaclass__ = ABCMeta + + def __init__(self, version_interface, page_version_number, file_offset, page_number, index, offset, + source=CELL_SOURCE.B_TREE, location=None): + + self._logger = getLogger(LOGGER_NAME) + + self._version_interface = version_interface + self._page_size = self._version_interface.page_size + self.version_number = self._version_interface.version_number + self.page_version_number = page_version_number + self.file_offset = file_offset + self.page_number = page_number + self.index = index + self.start_offset = offset + self.location = location if location else CELL_LOCATION.ALLOCATED_SPACE + self.source = source + self.end_offset = None + self.byte_size = None + self.md5_hex_digest = None + + def __repr__(self): + return self.__str__().encode("hex") + + def __str__(self): + return sub("\t", "", sub("\n", " ", self.stringify())) + + def stringify(self, padding=""): + string = padding + "Version Number: {}\n" \ + + padding + "Page Version Number: {}\n" \ + + padding + "File Offset: {}\n" \ + + padding + "Page Number: {}\n" \ + + padding + "Source: {}\n" \ + + padding + "Location: {}\n" \ + + padding + "Index: {}\n" \ + + padding + "Start Offset: {}\n" \ + + padding + "End Offset: {}\n" \ + + padding + "Byte Size: {}\n" \ + + padding + "MD5 Hex Digest: {}" + return string.format(self.version_number, + self.page_version_number, + self.file_offset, + self.page_number, + self.source, + self.location, + self.index, + self.start_offset, + self.end_offset, + self.byte_size, + self.md5_hex_digest) + + +class TableInteriorCell(BTreeCell): + + """ + + + + Note: B-Tree table interior cells never contain overflow. Therefore they have no payload (ie. record). This is + the only type of b-tree page that does not have a payload. + + """ + + def __init__(self, version_interface, page_version_number, file_offset, page_number, page, index, offset): + + super(TableInteriorCell, self).__init__(version_interface, page_version_number, file_offset, + page_number, index, offset) + left_child_pointer_end_offset = self.start_offset + LEFT_CHILD_POINTER_BYTE_LENGTH + self.left_child_pointer = unpack(b">I", page[self.start_offset:left_child_pointer_end_offset])[0] + self.row_id, self.row_id_varint_length = decode_varint(page, left_child_pointer_end_offset) + + self.byte_size = LEFT_CHILD_POINTER_BYTE_LENGTH + self.row_id_varint_length + self.end_offset = self.start_offset + self.byte_size + + self.md5_hex_digest = get_md5_hash(page[self.start_offset:self.end_offset]) + + """ + + Note: A table interior cell can be updated without updating the left child page in a version. 
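+
+        For reference, the on-disk layout parsed above is simply a 4 byte big-endian left child page
+        number followed by a single varint row id; a stand-alone sketch of that parse
+        (parse_table_interior_cell is illustrative only and assumes a decode_varint(data, offset) ->
+        (value, varint_length) helper like the one imported at the top of this module):
+
+            from struct import unpack
+
+            def parse_table_interior_cell(page, offset):
+                left_child_pointer = unpack(b">I", page[offset:offset + 4])[0]
+                row_id, row_id_varint_length = decode_varint(page, offset + 4)
+                # Total cell size is the 4 byte pointer plus however many bytes the row id varint used.
+                return left_child_pointer, row_id, 4 + row_id_varint_length
+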
+ + """ + + if not self.left_child_pointer: + log_message = "The left child pointer is not set for b-tree table interior cell index: {} " \ + "at offset: {} for page: {} in page version: {} for version: {}." + log_message = log_message.format(self.index, self.start_offset, self.page_number, + self.page_version_number, self.version_number) + self._logger.error(log_message) + raise CellParsingError(log_message) + + left_child_pointer_page_hex_type = self._version_interface.get_page_data(self.left_child_pointer, + 0, PAGE_TYPE_LENGTH) + + if left_child_pointer_page_hex_type == TABLE_INTERIOR_PAGE_HEX_ID: + self.left_child_page = TableInteriorPage(self._version_interface, self.left_child_pointer) + elif left_child_pointer_page_hex_type == TABLE_LEAF_PAGE_HEX_ID: + self.left_child_page = TableLeafPage(self._version_interface, self.left_child_pointer) + else: + log_message = "The left child pointer: {} does not point to a table interior or leaf page but instead " \ + "has a hex type of: {} for b-tree table interior cell index: {} at offset: {} for page: {} " \ + "in page version: {} for version: {}." + log_message = log_message.format(self.left_child_pointer, hexlify(left_child_pointer_page_hex_type), + self.index, self.start_offset, self.page_number, self.page_version_number, + self.version_number) + self._logger.error(log_message) + raise CellParsingError(log_message) + + def stringify(self, padding=""): + string = "\n" \ + + padding + "Left Child Pointer: {}\n" \ + + padding + "Row ID: {}\n" \ + + padding + "Row ID VARINT Length: {}" + string = string.format(self.left_child_pointer, + self.row_id, + self.row_id_varint_length) + string += "\n" + padding + "Left Child Page:\n{}" + string = string.format(self.left_child_page.stringify(padding + "\t") if self.left_child_page else None) + return super(TableInteriorCell, self).stringify(padding) + string + + +class TableLeafCell(BTreeCell): + + def __init__(self, version_interface, page_version_number, file_offset, page_number, page, index, offset): + + super(TableLeafCell, self).__init__(version_interface, page_version_number, file_offset, + page_number, index, offset) + + self.payload_byte_size, self.payload_byte_size_varint_length = decode_varint(page, self.start_offset) + row_id_offset = self.start_offset + self.payload_byte_size_varint_length + self.row_id, self.row_id_varint_length = decode_varint(page, row_id_offset) + self.payload_offset = self.start_offset + self.payload_byte_size_varint_length + self.row_id_varint_length + + self.has_overflow = False + self.overflow_pages = None + self.overflow_page_number_offset = None + self.overflow_page_number = None + self.overflow_page = None + self.last_overflow_page_content_size = 0 + + u = self._page_size + p = self.payload_byte_size + + """ + + Note: According to the SQLite documentation (as of version 3.9.2) table leaf cell overflow is calculated + by seeing if the payload size p is less than or equal to u - 35. If it is then there is no overflow. + If p is greater than u - 35, then there is overflow. At this point m = (((u - 12) * 32) / 255) - 23. + If p is greater than u - 35 then the number of bytes stored on the b-tree leaf page is the smaller of + m + ((p - m) % (u - 4)) and u - 35. The remaining bytes are then moved to overflow pages. + + The above was found to be wrong in the SQLite documentation. + + The documentation is incorrect that it is the smaller of m + ((p - m) % (u - 4)) and u - 35. 
After + a lot of testing and reviewing of the actual SQLite c code it was found out that the actual number of + bytes stored on the b-tree leaf page is m + ((p - m) % (u - 4)) unless m + ((p - m) % (u - 4)) > u - 35 + in which case the bytes stored on the b-tree table leaf page is m itself. + + Therefore let b be the bytes on the b-tree table leaf page: + u = page size + p = payload byte size + if p > u - 35 + m = (((u - 12) * 32) / 255) - 23 + b = m + ((p - m) % (u - 4)) + if b > u - 35 + b = m + + Additionally, the bytes stored on the b-tree table leaf page will always be greater to or equal to m + once calculated. + + """ + + self.bytes_on_first_page = p + if p > u - 35: + m = (((u - 12) * 32) / 255) - 23 + self.bytes_on_first_page = m + ((p - m) % (u - 4)) + if self.bytes_on_first_page > u - 35: + self.bytes_on_first_page = m + self.has_overflow = True + self.overflow_page_number_offset = self.payload_offset + self.bytes_on_first_page + overflow_page_number_end_offset = self.overflow_page_number_offset + FIRST_OVERFLOW_PAGE_NUMBER_LENGTH + self.overflow_page_number = unpack(b">I", page[self.overflow_page_number_offset: + overflow_page_number_end_offset])[0] + if self.bytes_on_first_page < m: + log_message = "When calculating overflow, the bytes on the first page: {} calculated are less than " \ + "m: {} for b-tree table leaf cell index: {} at offset: {} for page: {} in " \ + "page version: {} for version: {}." + log_message = log_message.format(self.bytes_on_first_page, m, self.index, self.start_offset, + self.page_number, self.page_version_number, self.version_number) + self._logger.error(log_message) + raise CellParsingError(log_message) + + self.byte_size = self.payload_byte_size_varint_length + self.row_id_varint_length + self.payload_byte_size + self.byte_size += FIRST_OVERFLOW_PAGE_NUMBER_LENGTH if self.has_overflow else 0 + self.end_offset = self.start_offset + self.byte_size - self.payload_byte_size + self.bytes_on_first_page + + self.overflow_byte_size = self.payload_byte_size - self.bytes_on_first_page + self.expected_number_of_overflow_pages, \ + self.expected_last_overflow_page_content_size = calculate_expected_overflow(self.overflow_byte_size, u) + + self.md5_hex_digest = get_md5_hash(page[self.start_offset:self.end_offset]) + + if self.has_overflow: + + """ + + The overflow pages are in a dictionary keyed off of their page number in the format: + overflow_page[OVERFLOW_PAGE_NUMBER] = OVERFLOW_PAGE + + Originally, the overflow pages were nested objects, ie. each overflow page had the following overflow + page within it, and so on. However, this lead to recursion depth problems with larger cell content. + It was changed to be a dictionary of pages here instead. + + Note: Although overflow pages have to be replaced when any overflow page in a chain is updated, the + overflow here may not be updated due to a different cell in this page being updated. Therefore, + we allow the first overflow page to be in a earlier version. However, the overflow pages still + check that all overflow versions in respect to the first overflow page and beyond in the linked + list are all equal. 
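+
+            Summarizing the corrected table leaf overflow rule described earlier in this class as a small
+            stand-alone helper (table_leaf_bytes_on_first_page is illustrative only; the class computes
+            the same values inline, and // mirrors the integer division used by the calculation above):
+
+                def table_leaf_bytes_on_first_page(u, p):
+                    # u: page size, p: payload byte size; returns the payload bytes kept on the leaf page.
+                    if p <= u - 35:
+                        return p
+                    m = (((u - 12) * 32) // 255) - 23
+                    b = m + ((p - m) % (u - 4))
+                    return b if b <= u - 35 else m
+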
+ + """ + + self.overflow_pages = {} + payload_remaining = self.overflow_byte_size + + overflow_page = OverflowPage(self._version_interface, self.overflow_page_number, self.page_number, + FIRST_OVERFLOW_PARENT_PAGE_NUMBER, FIRST_OVERFLOW_PAGE_INDEX, + payload_remaining) + + self.overflow_pages[overflow_page.number] = overflow_page + self.last_overflow_page_content_size = overflow_page.content_length + + while overflow_page.next_overflow_page_number: + payload_remaining = payload_remaining - overflow_page.size + OVERFLOW_HEADER_LENGTH + overflow_page = OverflowPage(self._version_interface, overflow_page.next_overflow_page_number, + self.page_number, overflow_page.number, overflow_page.index + 1, + payload_remaining) + self.overflow_pages[overflow_page.number] = overflow_page + self.last_overflow_page_content_size = overflow_page.content_length + + if self.expected_number_of_overflow_pages != self.number_of_overflow_pages: + log_message = "The number of expected overflow pages: {} was not the actual number of overflow pages " \ + "parsed: {} for b-tree table leaf cell index: {} at offset: {} for page: {} in " \ + "page version: {} for version: {}." + log_message = log_message.format(self.expected_number_of_overflow_pages, self.number_of_overflow_pages, + self.index, self.start_offset, self.page_number, self.page_version_number, + self.version_number) + self._logger.error(log_message) + raise CellParsingError(log_message) + + if self.expected_last_overflow_page_content_size != self.last_overflow_page_content_size: + log_message = "The expected last overflow page content size: {} was not the actual last overflow page " \ + "content size parsed: {} for b-tree table leaf cell index: {} at offset: {} for page: {} " \ + "in page version: {} for version: {}." 
+ log_message = log_message.format(self.expected_last_overflow_page_content_size, + self.last_overflow_page_content_size, self.index, self.start_offset, + self.page_number, self.page_version_number, self.version_number) + raise CellParsingError(log_message) + + self.payload = Record(page, self.payload_offset, self.payload_byte_size, + self.bytes_on_first_page, self.overflow) + + def stringify(self, padding=""): + string = "\n" \ + + padding + "Payload Byte Size: {}\n" \ + + padding + "Payload Byte Size VARINT Length: {}\n" \ + + padding + "Row ID: {}\n" \ + + padding + "Row ID VARINT Length: {}\n" \ + + padding + "Payload Offset: {}\n" \ + + padding + "Bytes on First Page: {}\n" \ + + padding + "Has Overflow: {}\n" \ + + padding + "Overflow Byte Size: {}\n" \ + + padding + "Expected Number of Overflow Pages: {}\n" \ + + padding + "Expected Last Overflow Page Content Size: {}\n" \ + + padding + "Number of Overflow Pages: {}\n" \ + + padding + "Overflow Page Number Offset: {}\n" \ + + padding + "Overflow Page Number: {}\n" \ + + padding + "Last Overflow Page Content Size: {}\n" \ + + padding + "Overflow (Hex): {}" + string = string.format(self.payload_byte_size, + self.payload_byte_size_varint_length, + self.row_id, + self.row_id_varint_length, + self.payload_offset, + self.bytes_on_first_page, + self.has_overflow, + self.overflow_byte_size, + self.expected_number_of_overflow_pages, + self.expected_last_overflow_page_content_size, + self.number_of_overflow_pages, + self.overflow_page_number_offset, + self.overflow_page_number, + self.last_overflow_page_content_size, + hexlify(self.overflow)) + string += "\n" + padding + "Payload:\n{}".format(self.payload.stringify(padding + "\t")) + if self.has_overflow: + overflow_page = self.overflow_pages[self.overflow_page_number] + string += "\n" + padding + "Overflow Page:\n{}".format(self.overflow_page.stringify(padding + "\t")) + while overflow_page.next_overflow_page_number: + overflow_page = self.overflow_pages[overflow_page.next_overflow_page_number] + string += "\n" + padding + "Overflow Page:\n{}".format(self.overflow_page.stringify(padding + "\t")) + return super(TableLeafCell, self).stringify(padding) + string + + @property + def number_of_overflow_pages(self): + return len(self.overflow_pages) if self.overflow_pages else 0 + + @property + def overflow(self): + + overflow = bytearray() + + if not self.has_overflow: + + return overflow + + else: + + overflow_page = self.overflow_pages[self.overflow_page_number] + overflow += overflow_page.content + while overflow_page.next_overflow_page_number: + overflow_page = self.overflow_pages[overflow_page.next_overflow_page_number] + overflow += overflow_page.content + + if len(overflow) != self.overflow_byte_size: + log_message = "The expected overflow size: {} did not match the overflow size parsed: {} " \ + "for b-tree table leaf cell index: {} at offset: {} for page: {} " \ + "in page version: {} for version: {}." 
+ log_message = log_message.format(self.overflow_byte_size, len(overflow), self.index, self.start_offset, + self.page_number, self.page_version_number, self.version_number) + raise CellParsingError(log_message) + + return overflow + + +class IndexInteriorCell(BTreeCell): + + def __init__(self, version_interface, page_version_number, file_offset, page_number, page, index, offset): + + super(IndexInteriorCell, self).__init__(version_interface, page_version_number, file_offset, + page_number, index, offset) + + left_child_pointer_end_offset = self.start_offset + LEFT_CHILD_POINTER_BYTE_LENGTH + self.left_child_pointer = unpack(b">I", page[self.start_offset:left_child_pointer_end_offset])[0] + self.payload_byte_size, self.payload_byte_size_varint_length = decode_varint(page, + left_child_pointer_end_offset) + self.payload_offset = left_child_pointer_end_offset + self.payload_byte_size_varint_length + + self.has_overflow = False + self.overflow_pages = None + self.overflow_page_number_offset = None + self.overflow_page_number = None + self.overflow_page = None + self.last_overflow_page_content_size = 0 + + u = self._page_size + p = self.payload_byte_size + x = (((u - 12) * 64) / 255) - 23 + + """ + + Note: According to the SQLite documentation (as of version 3.9.2) index interior and leaf cell overflow is + calculated by first calculating x as (((u - 12) * 64) / 255) - 23. If the payload size p is less than + or equal to x, then there is no overflow. If p is greater than x, than m = (((u - 12) * 32) / 255) - 23. + If p is greater than x then the number of bytes stored on the b-tree leaf page is the smaller of + m + ((p - m) % (u - 4)) and x. The remaining bytes are then moved to overflow pages. + + The above was found to be wrong in the SQLite documentation. + + The documentation is incorrect that it is the smaller of m + ((p - m) % (u - 4)) and x. After + a lot of testing and reviewing of the actual SQLite c code it was found out that the actual number of + bytes stored on the b-tree leaf page is m + ((p - m) % (u - 4)) unless m + ((p - m) % (u - 4)) > x + in which case the bytes stored on the b-tree index interior or index leaf page is m itself. + + Therefore let b be the bytes on the b-tree index interior or index leaf page: + u = page size + p = payload byte size + x = (((u - 12) * 64) / 255) - 23 + if p > x + m = (((u - 12) * 32) / 255) - 23 + b = m + ((p - m) % (u - 4)) + if b > x + b = m + + Additionally, the bytes stored on the b-tree index interior or index leaf page will always be greater + to or equal to m once calculated. + + """ + + self.bytes_on_first_page = p + if p > x: + m = (((u - 12) * 32) / 255) - 23 + self.bytes_on_first_page = m + ((p - m) % (u - 4)) + if self.bytes_on_first_page > x: + self.bytes_on_first_page = m + self.has_overflow = True + self.overflow_page_number_offset = self.payload_offset + self.bytes_on_first_page + overflow_page_number_end_offset = self.overflow_page_number_offset + FIRST_OVERFLOW_PAGE_NUMBER_LENGTH + self.overflow_page_number = unpack(b">I", page[self.overflow_page_number_offset: + overflow_page_number_end_offset])[0] + if self.bytes_on_first_page < m: + log_message = "When calculating overflow, the bytes on the first page: {} calculated are less than " \ + "m: {} for b-tree index interior cell index: {} at offset: {} for page: {} in " \ + "page version: {} for version: {}." 
+ log_message = log_message.format(self.bytes_on_first_page, m, self.index, self.start_offset, + self.page_number, self.page_version_number, self.version_number) + self._logger.error(log_message) + raise CellParsingError(log_message) + + self.byte_size = LEFT_CHILD_POINTER_BYTE_LENGTH + self.byte_size += self.payload_byte_size_varint_length + self.payload_byte_size + self.byte_size += FIRST_OVERFLOW_PAGE_NUMBER_LENGTH if self.has_overflow else 0 + self.end_offset = self.start_offset + self.byte_size - self.payload_byte_size + self.bytes_on_first_page + + self.overflow_byte_size = self.payload_byte_size - self.bytes_on_first_page + self.expected_number_of_overflow_pages, \ + self.expected_last_overflow_page_content_size = calculate_expected_overflow(self.overflow_byte_size, u) + + self.md5_hex_digest = get_md5_hash(page[self.start_offset:self.end_offset]) + + if self.has_overflow: + + """ + + The overflow pages are in a dictionary keyed off of their page number in the format: + overflow_page[OVERFLOW_PAGE_NUMBER] = OVERFLOW_PAGE + + Originally, the overflow pages were nested objects, ie. each overflow page had the following overflow + page within it, and so on. However, this lead to recursion depth problems with larger cell content. + It was changed to be a dictionary of pages here instead. + + Note: Although overflow pages have to be replaced when any overflow page in a chain is updated, the + overflow here may not be updated due to a different cell in this page being updated. Therefore, + we allow the first overflow page to be in a earlier version. However, the overflow pages still + check that all overflow versions in respect to the first overflow page and beyond in the linked + list are all equal. + + """ + + self.overflow_pages = {} + payload_remaining = self.overflow_byte_size + + overflow_page = OverflowPage(self._version_interface, self.overflow_page_number, self.page_number, + FIRST_OVERFLOW_PARENT_PAGE_NUMBER, FIRST_OVERFLOW_PAGE_INDEX, + payload_remaining) + + self.overflow_pages[overflow_page.number] = overflow_page + self.last_overflow_page_content_size = overflow_page.content_length + + while overflow_page.next_overflow_page_number: + payload_remaining = payload_remaining - overflow_page.size + OVERFLOW_HEADER_LENGTH + overflow_page = OverflowPage(self._version_interface, overflow_page.next_overflow_page_number, + self.page_number, overflow_page.number, overflow_page.index + 1, + payload_remaining) + self.overflow_pages[overflow_page.number] = overflow_page + self.last_overflow_page_content_size = overflow_page.content_length + + if self.expected_number_of_overflow_pages != self.number_of_overflow_pages: + log_message = "The number of expected overflow pages: {} was not the actual number of overflow pages " \ + "parsed: {} for b-tree index interior cell index: {} at offset: {} for page: {} in " \ + "page version: {} for version: {}." + log_message = log_message.format(self.expected_number_of_overflow_pages, self.number_of_overflow_pages, + self.index, self.start_offset, self.page_number, self.page_version_number, + self.version_number) + self._logger.error(log_message) + raise CellParsingError(log_message) + + if self.expected_last_overflow_page_content_size != self.last_overflow_page_content_size: + log_message = "The expected last overflow page content size: {} was not the actual last overflow page " \ + "content size parsed: {} for b-tree index interior cell index: {} at offset: {} for " \ + "page: {} in page version: {} for version: {}." 
+ log_message = log_message.format(self.expected_last_overflow_page_content_size, + self.last_overflow_page_content_size, self.index, self.start_offset, + self.page_number, self.page_version_number, self.version_number) + raise CellParsingError(log_message) + + self.payload = Record(page, self.payload_offset, self.payload_byte_size, + self.bytes_on_first_page, self.overflow) + + """ + + Note: An index interior cell can be updated without updating the left child page in a version. + + """ + + if not self.left_child_pointer: + log_message = "The left child pointer is not set for b-tree index interior cell index: {} " \ + "at offset: {} for page: {} in page version: {} for version: {}." + log_message = log_message.format(self.index, self.start_offset, self.page_number, + self.page_version_number, self.version_number) + self._logger.error(log_message) + raise CellParsingError(log_message) + + left_child_pointer_page_hex_type = self._version_interface.get_page_data(self.left_child_pointer, + 0, PAGE_TYPE_LENGTH) + + if left_child_pointer_page_hex_type == INDEX_INTERIOR_PAGE_HEX_ID: + self.left_child_page = IndexInteriorPage(self._version_interface, self.left_child_pointer) + elif left_child_pointer_page_hex_type == INDEX_LEAF_PAGE_HEX_ID: + self.left_child_page = IndexLeafPage(self._version_interface, self.left_child_pointer) + else: + log_message = "The left child pointer does not point to a index interior or index page but instead has " \ + "a hex type of: {} for b-tree index interior cell index: {} at offset: {} for page: {} " \ + "in page version: {} for version: {}." + log_message = log_message.format(hexlify(left_child_pointer_page_hex_type), self.index, self.start_offset, + self.page_number, self.page_version_number, self.version_number) + self._logger.error(log_message) + raise CellParsingError(log_message) + + def stringify(self, padding=""): + string = "\n" \ + + padding + "Left Child Pointer: {}\n" \ + + padding + "Payload Byte Size: {}\n" \ + + padding + "Payload Byte Size VARINT Length: {}\n" \ + + padding + "Payload Offset: {}\n" \ + + padding + "Bytes on First Page: {}\n" \ + + padding + "Has Overflow: {}\n" \ + + padding + "Overflow Byte Size: {}\n" \ + + padding + "Expected Number of Overflow Pages: {}\n" \ + + padding + "Expected Last Overflow Page Content Size: {}\n" \ + + padding + "Number of Overflow Pages: {}\n" \ + + padding + "Overflow Page Number Offset: {}\n" \ + + padding + "Overflow Page Number: {}\n" \ + + padding + "Last Overflow Page Content Size: {}\n" \ + + padding + "Overflow (Hex): {}" + string = string.format(self.left_child_pointer, + self.payload_byte_size, + self.payload_byte_size_varint_length, + self.payload_offset, + self.bytes_on_first_page, + self.has_overflow, + self.overflow_byte_size, + self.expected_number_of_overflow_pages, + self.expected_last_overflow_page_content_size, + self.number_of_overflow_pages, + self.overflow_page_number_offset, + self.overflow_page_number, + self.last_overflow_page_content_size, + hexlify(self.overflow)) + string += "\n" + padding + "Payload:\n{}".format(self.payload.stringify(padding + "\t")) + if self.has_overflow: + overflow_page = self.overflow_pages[self.overflow_page_number] + string += "\n" + padding + "Overflow Page:\n{}".format(self.overflow_page.stringify(padding + "\t")) + while overflow_page.next_overflow_page_number: + overflow_page = self.overflow_pages[overflow_page.next_overflow_page_number] + string += "\n" + padding + "Overflow Page:\n{}".format(self.overflow_page.stringify(padding + "\t")) + string 
+= "\n" + padding + "Left Child Page:\n{}" + string = string.format(self.left_child_page.stringify(padding + "\t") if self.left_child_page else None) + return super(IndexInteriorCell, self).stringify(padding) + string + + @property + def number_of_overflow_pages(self): + return len(self.overflow_pages) if self.overflow_pages else 0 + + @property + def overflow(self): + overflow = bytearray() + + if not self.has_overflow: + + return overflow + + else: + + overflow_page = self.overflow_pages[self.overflow_page_number] + overflow += overflow_page.content + while overflow_page.next_overflow_page_number: + overflow_page = self.overflow_pages[overflow_page.next_overflow_page_number] + overflow += overflow_page.content + + if len(overflow) != self.overflow_byte_size: + log_message = "The expected overflow size: {} did not match the overflow size parsed: {} " \ + "for b-tree table leaf cell index: {} at offset: {} for page: {} " \ + "in page version: {} for version: {}." + log_message = log_message.format(self.overflow_byte_size, len(overflow), self.index, self.start_offset, + self.page_number, self.page_version_number, self.version_number) + raise CellParsingError(log_message) + + return overflow + + +class IndexLeafCell(BTreeCell): + + def __init__(self, version_interface, page_version_number, file_offset, page_number, page, index, offset): + + super(IndexLeafCell, self).__init__(version_interface, page_version_number, file_offset, + page_number, index, offset) + + self.payload_byte_size, self.payload_byte_size_varint_length = decode_varint(page, self.start_offset) + self.payload_offset = self.start_offset + self.payload_byte_size_varint_length + + self.has_overflow = False + self.overflow_pages = 0 + self.overflow_page_number_offset = None + self.overflow_page_number = None + self.overflow_page = None + self.last_overflow_page_content_size = 0 + + u = self._page_size + p = self.payload_byte_size + x = (((u - 12) * 64) / 255) - 23 + + """ + + Note: According to the SQLite documentation (as of version 3.9.2) index interior and leaf cell overflow is + calculated by first calculating x as (((u - 12) * 64) / 255) - 23. If the payload size p is less than + or equal to x, then there is no overflow. If p is greater than x, than m = (((u - 12) * 32) / 255) - 23. + If p is greater than x then the number of bytes stored on the b-tree leaf page is the smaller of + m + ((p - m) % (u - 4)) and x. The remaining bytes are then moved to overflow pages. + + The above was found to be wrong in the SQLite documentation. + + The documentation is incorrect that it is the smaller of m + ((p - m) % (u - 4)) and x. After + a lot of testing and reviewing of the actual SQLite c code it was found out that the actual number of + bytes stored on the b-tree leaf page is m + ((p - m) % (u - 4)) unless m + ((p - m) % (u - 4)) > x + in which case the bytes stored on the b-tree index interior or index leaf page is m itself. + + Therefore let b be the bytes on the b-tree index interior or index leaf page: + u = page size + p = payload byte size + x = (((u - 12) * 64) / 255) - 23 + if p > x + m = (((u - 12) * 32) / 255) - 23 + b = m + ((p - m) % (u - 4)) + if b > x + b = m + + Additionally, the bytes stored on the b-tree index interior or index leaf page will always be greater + to or equal to m once calculated. 
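+
+              As a worked illustration with made-up numbers (not taken from the SQLite documentation): for a
+              page size of u = 4096 and a payload size of p = 5000, x = (((4096 - 12) * 64) / 255) - 23 = 1002
+              using integer division. Since p > x, m = (((4096 - 12) * 32) / 255) - 23 = 489 and
+              b = 489 + ((5000 - 489) % (4096 - 4)) = 908. Since 908 <= x, 908 bytes of the payload stay on
+              the b-tree page and the remaining 5000 - 908 = 4092 bytes are moved to overflow pages.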
+ + """ + + self.bytes_on_first_page = p + if p > x: + m = (((u - 12) * 32) / 255) - 23 + self.bytes_on_first_page = m + ((p - m) % (u - 4)) + if self.bytes_on_first_page > x: + self.bytes_on_first_page = m + self.has_overflow = True + self.overflow_page_number_offset = self.payload_offset + self.bytes_on_first_page + overflow_page_number_end_offset = self.overflow_page_number_offset + FIRST_OVERFLOW_PAGE_NUMBER_LENGTH + self.overflow_page_number = unpack(b">I", page[self.overflow_page_number_offset: + overflow_page_number_end_offset])[0] + if self.bytes_on_first_page < m: + log_message = "When calculating overflow, the bytes on the first page: {} calculated are less than " \ + "m: {} for b-tree leaf interior cell index: {} at offset: {} for page: {} in " \ + "page version: {} for version: {}." + log_message = log_message.format(self.bytes_on_first_page, m, self.index, self.start_offset, + self.page_number, self.page_version_number, self.version_number) + self._logger.error(log_message) + raise CellParsingError(log_message) + + self.byte_size = self.payload_byte_size_varint_length + self.payload_byte_size + self.byte_size += FIRST_OVERFLOW_PAGE_NUMBER_LENGTH if self.has_overflow else 0 + self.end_offset = self.start_offset + self.byte_size - self.payload_byte_size + self.bytes_on_first_page + + self.overflow_byte_size = self.payload_byte_size - self.bytes_on_first_page + self.expected_number_of_overflow_pages, \ + self.expected_last_overflow_page_content_size = calculate_expected_overflow(self.overflow_byte_size, u) + + self.md5_hex_digest = get_md5_hash(page[self.start_offset:self.end_offset]) + + if self.has_overflow: + + """ + + The overflow pages are in a dictionary keyed off of their page number in the format: + overflow_page[OVERFLOW_PAGE_NUMBER] = OVERFLOW_PAGE + + Originally, the overflow pages were nested objects, ie. each overflow page had the following overflow + page within it, and so on. However, this lead to recursion depth problems with larger cell content. + It was changed to be a dictionary of pages here instead. + + Note: Although overflow pages have to be replaced when any overflow page in a chain is updated, the + overflow here may not be updated due to a different cell in this page being updated. Therefore, + we allow the first overflow page to be in a earlier version. However, the overflow pages still + check that all overflow versions in respect to the first overflow page and beyond in the linked + list are all equal. 
+ + """ + + self.overflow_pages = {} + payload_remaining = self.overflow_byte_size + + overflow_page = OverflowPage(self._version_interface, self.overflow_page_number, self.page_number, + FIRST_OVERFLOW_PARENT_PAGE_NUMBER, FIRST_OVERFLOW_PAGE_INDEX, + payload_remaining) + + self.overflow_pages[overflow_page.number] = overflow_page + self.last_overflow_page_content_size = overflow_page.content_length + + while overflow_page.next_overflow_page_number: + payload_remaining = payload_remaining - overflow_page.size + OVERFLOW_HEADER_LENGTH + overflow_page = OverflowPage(self._version_interface, overflow_page.next_overflow_page_number, + self.page_number, overflow_page.number, overflow_page.index + 1, + payload_remaining) + self.overflow_pages[overflow_page.number] = overflow_page + self.last_overflow_page_content_size = overflow_page.content_length + + if self.expected_number_of_overflow_pages != self.number_of_overflow_pages: + log_message = "The number of expected overflow pages: {} was not the actual number of overflow pages " \ + "parsed: {} for b-tree index leaf cell index: {} at offset: {} for page: {} in " \ + "page version: {} for version: {}." + log_message = log_message.format(self.expected_number_of_overflow_pages, self.number_of_overflow_pages, + self.index, self.start_offset, self.page_number, self.page_version_number, + self.version_number) + self._logger.error(log_message) + raise CellParsingError(log_message) + + if self.expected_last_overflow_page_content_size != self.last_overflow_page_content_size: + log_message = "The expected last overflow page content size: {} was not the actual last overflow page " \ + "content size parsed: {} for b-tree index leaf cell index: {} at offset: {} for " \ + "page: {} in page version: {} for version: {}." 
+ log_message = log_message.format(self.expected_last_overflow_page_content_size, + self.last_overflow_page_content_size, self.index, self.start_offset, + self.page_number, self.page_version_number, self.version_number) + raise CellParsingError(log_message) + + self.payload = Record(page, self.payload_offset, self.payload_byte_size, + self.bytes_on_first_page, self.overflow) + + def stringify(self, padding=""): + string = "\n" \ + + padding + "Payload Byte Size: {}\n" \ + + padding + "Payload Byte Size VARINT Length: {}\n" \ + + padding + "Payload Offset: {}\n" \ + + padding + "Bytes on First Page: {}\n" \ + + padding + "Has Overflow: {}\n" \ + + padding + "Overflow Byte Size: {}\n" \ + + padding + "Expected Number of Overflow Pages: {}\n" \ + + padding + "Expected Last Overflow Page Content Size: {}\n" \ + + padding + "Number of Overflow Pages: {}\n" \ + + padding + "Overflow Page Number Offset: {}\n" \ + + padding + "Overflow Page Number: {}\n" \ + + padding + "Last Overflow Page Content Size: {}\n" \ + + padding + "Overflow (Hex): {}" + string = string.format(self.payload_byte_size, + self.payload_byte_size_varint_length, + self.payload_offset, + self.bytes_on_first_page, + self.has_overflow, + self.overflow_byte_size, + self.expected_number_of_overflow_pages, + self.expected_last_overflow_page_content_size, + self.number_of_overflow_pages, + self.overflow_page_number_offset, + self.overflow_page_number, + self.last_overflow_page_content_size, + hexlify(self.overflow)) + string += "\n" + padding + "Payload:\n{}".format(self.payload.stringify(padding + "\t")) + if self.has_overflow: + overflow_page = self.overflow_pages[self.overflow_page_number] + string += "\n" + padding + "Overflow Page:\n{}".format(self.overflow_page.stringify(padding + "\t")) + while overflow_page.next_overflow_page_number: + overflow_page = self.overflow_pages[overflow_page.next_overflow_page_number] + string += "\n" + padding + "Overflow Page:\n{}".format(self.overflow_page.stringify(padding + "\t")) + return super(IndexLeafCell, self).stringify(padding) + string + + @property + def number_of_overflow_pages(self): + return len(self.overflow_pages) if self.overflow_pages else 0 + + @property + def overflow(self): + overflow = bytearray() + + if not self.has_overflow: + + return overflow + + else: + + overflow_page = self.overflow_pages[self.overflow_page_number] + overflow += overflow_page.content + while overflow_page.next_overflow_page_number: + overflow_page = self.overflow_pages[overflow_page.next_overflow_page_number] + overflow += overflow_page.content + + if len(overflow) != self.overflow_byte_size: + log_message = "The expected overflow size: {} did not match the overflow size parsed: {} " \ + "for b-tree table leaf cell index: {} at offset: {} for page: {} " \ + "in page version: {} for version: {}." 
+ log_message = log_message.format(self.overflow_byte_size, len(overflow), self.index, self.start_offset, + self.page_number, self.page_version_number, self.version_number) + raise CellParsingError(log_message) + + return overflow + + +class Freeblock(BTreeCell): + + def __init__(self, version_interface, page_version_number, file_offset, page_number, page, index, offset): + + super(Freeblock, self).__init__(version_interface, page_version_number, file_offset, page_number, index, offset) + + next_freeblock_end_offset = self.start_offset + NEXT_FREEBLOCK_OFFSET_LENGTH + self.next_freeblock_offset = unpack(b">H", page[self.start_offset:next_freeblock_end_offset])[0] + self.content_start_offset = next_freeblock_end_offset + FREEBLOCK_BYTE_LENGTH + self.byte_size = unpack(b">H", page[next_freeblock_end_offset:self.content_start_offset])[0] + self.content_end_offset = self.start_offset + self.byte_size + self.end_offset = self.content_end_offset + + self.content_length = self.end_offset - self.content_start_offset + + self.md5_hex_digest = get_md5_hash(page[self.start_offset:self.end_offset]) + + def stringify(self, padding=""): + string = "\n" \ + + padding + "Next Freeblock Offset: {}\n" \ + + padding + "Content Start Offset: {}\n" \ + + padding + "Content End Offset: {}\n" \ + + padding + "Content Length: {}\n" \ + + padding + "Content (Hex): {}" + string = string.format(self.next_freeblock_offset, + self.content_start_offset, + self.content_end_offset, + self.content_length, + hexlify(self.content)) + return super(Freeblock, self).stringify(padding) + string + + @property + def content(self): + + """ + + This property returns the content inside this freeblock. This is only the body of the freeblock, unallocated + portion, and does not include the 4 byte freeblock header. + + :return: bytearray The byte array for freeblock content. + + """ + + if self.content_length == 0: + return bytearray() + else: + return self._version_interface.get_page_data(self.page_number, self.content_start_offset, + self.content_length) + + +class Fragment(BTreeCell): + + """ + + + + Note: It is important to note that fragments are three bytes in length or less. If four bytes or more become + unallocated within the cell area of the page, then a freeblock is created since four bytes are required. + (The first two bytes pointing to the offset of the next freeblock offset in the freeblock linked list + on the page and the second two bytes being the size of the freeblock in bytes including this 4 byte header.) + + However, fragments can be found with byte sizes greater than three. This occurs due to the fact that + multiple cells could be added and deleted next to each other creating fragments of size of 3 or less next + to each other. Since we cannot determine exactly where the break between these fragments are, we specify + the whole block as a fragment resulting in fragment sizes greater than the limit of 3 bytes. + + Therefore, if the fragment is greater than 3 bytes it is comprised of multiple fragments. Keep in mind + however that although this is true, the inverse is not true. If a fragment is three bytes or less, it could + still be an aggregate of multiple fragments such as a fragment of 1 byte and another fragment of 2 bytes. + + Note: Since the byte size is the size of the actual content, there is not content size. 
+ + """ + + def __init__(self, version_interface, page_version_number, file_offset, page_number, + page, index, start_offset, end_offset): + + super(Fragment, self).__init__(version_interface, page_version_number, file_offset, + page_number, index, start_offset) + + self.end_offset = end_offset + self.byte_size = self.end_offset - self.start_offset + + self.md5_hex_digest = get_md5_hash(page[self.start_offset:self.end_offset]) + + def stringify(self, padding=""): + string = "\n" + padding + "Content (Hex): {}" + string = string.format(hexlify(self.content)) + return super(Fragment, self).stringify(padding) + string + + @property + def content(self): + + """ + + This property returns the content inside this fragment. + + :return: bytearray The byte array for fragment content. + + """ + + return self._version_interface.get_page_data(self.page_number, self.start_offset, self.end_offset) diff --git a/sqlite_dissect/file/database/payload.py b/sqlite_dissect/file/database/payload.py new file mode 100644 index 0000000..4a71e65 --- /dev/null +++ b/sqlite_dissect/file/database/payload.py @@ -0,0 +1,221 @@ +from abc import ABCMeta +from binascii import hexlify +from logging import getLogger +from re import sub +from sqlite_dissect.constants import LOGGER_NAME +from sqlite_dissect.exception import RecordParsingError +from sqlite_dissect.utilities import decode_varint +from sqlite_dissect.utilities import get_md5_hash +from sqlite_dissect.utilities import get_record_content +from sqlite_dissect.utilities import get_serial_type_signature + +""" + +payload.py + +This script holds the objects used for parsing payloads from the cells in SQLite b-tree pages for +index leaf, index interior, and table leaf. (Table Interior pages do not have payloads in their cells.) + +This script holds the following object(s): +Payload(object) +Record(Payload) +RecordColumn(object) + +""" + + +class Payload(object): + + __metaclass__ = ABCMeta + + def __init__(self): + + self.start_offset = None + self.byte_size = None + self.end_offset = None + + self.has_overflow = False + self.bytes_on_first_page = None + self.overflow_byte_size = None + + self.header_byte_size = None + self.header_byte_size_varint_length = None + self.header_start_offset = None + self.header_end_offset = None + self.body_start_offset = None + self.body_end_offset = None + + self.md5_hex_digest = None + + self.record_columns = [] + self.serial_type_signature = "" + + def __repr__(self): + return self.__str__().encode("hex") + + def __str__(self): + return sub("\t", "", sub("\n", " ", self.stringify())) + + def stringify(self, padding="", print_record_columns=True): + string = padding + "Start Offset: {}\n" \ + + padding + "End Offset: {}\n" \ + + padding + "Byte Size: {}\n" \ + + padding + "MD5 Hex Digest: {}\n" \ + + padding + "Header Byte Size: {}\n" \ + + padding + "Header Byte Size VARINT Length: {}\n" \ + + padding + "Header Start Offset: {}\n" \ + + padding + "Header End Offset: {}\n" \ + + padding + "Body Start Offset: {}\n" \ + + padding + "Body End Offset: {}\n" \ + + padding + "Has Overflow: {}\n" \ + + padding + "Bytes on First Page: {}\n" \ + + padding + "Overflow Byte Size: {}\n" \ + + padding + "Serial Type Signature: {}" + string = string.format(self.start_offset, + self.end_offset, + self.byte_size, + self.md5_hex_digest, + self.header_byte_size, + self.header_byte_size_varint_length, + self.header_start_offset, + self.header_end_offset, + self.body_start_offset, + self.body_end_offset, + self.has_overflow, + self.bytes_on_first_page, + 
self.overflow_byte_size, + self.serial_type_signature) + if print_record_columns: + for record_column in self.record_columns: + string += "\n" + padding + "Record Column:\n{}".format(record_column.stringify(padding + "\t")) + return string + + +class Record(Payload): + + def __init__(self, page, payload_offset, payload_byte_size, bytes_on_first_page=None, overflow=bytearray()): + + super(Record, self).__init__() + + logger = getLogger(LOGGER_NAME) + + if bytes_on_first_page is None: + + bytes_on_first_page = payload_byte_size + + if overflow: + log_message = "Bytes on first page not specified on page in record when overflow was (hex): {}." + log_message = log_message.format(hexlify(overflow)) + logger.error(log_message) + raise RecordParsingError(log_message) + + if bytes_on_first_page < payload_byte_size and not overflow: + log_message = "Bytes on first page: {} less than payload byte size: {} on page with overflow not set." + log_message = log_message.format(bytes_on_first_page, payload_byte_size) + logger.error(log_message) + raise RecordParsingError(log_message) + + if bytes_on_first_page > payload_byte_size: + log_message = "Bytes on first page: {} greater than payload byte size: {} on page." + log_message = log_message.format(bytes_on_first_page, payload_byte_size) + logger.error(log_message) + raise RecordParsingError(log_message) + + self.start_offset = payload_offset + self.byte_size = payload_byte_size + self.end_offset = self.start_offset + bytes_on_first_page + + self.has_overflow = False if not overflow else True + self.bytes_on_first_page = bytes_on_first_page + self.overflow_byte_size = self.byte_size - self.bytes_on_first_page + + if self.overflow_byte_size == 0 and overflow: + log_message = "Overflow determined to exist with byte size: {} on page with overflow set: {}." + log_message = log_message.format(self.overflow_byte_size, hexlify(overflow)) + logger.error(log_message) + raise RecordParsingError(log_message) + + self.header_byte_size, self.header_byte_size_varint_length = decode_varint(page, self.start_offset) + self.header_start_offset = self.start_offset + self.header_end_offset = self.start_offset + self.header_byte_size + self.body_start_offset = self.header_end_offset + self.body_end_offset = self.end_offset + + current_page_record_content = page[self.start_offset:self.end_offset] + + total_record_content = current_page_record_content + overflow + + if len(total_record_content) != self.byte_size: + log_message = "The record content was found to be a different length of: {} than the specified byte " \ + "size: {} on page." 
+ log_message = log_message.format(len(total_record_content), self.byte_size) + logger.error(log_message) + raise RecordParsingError(log_message) + + self.md5_hex_digest = get_md5_hash(total_record_content) + + current_header_offset = self.header_byte_size_varint_length + current_body_offset = 0 + column_index = 0 + while current_header_offset < self.header_byte_size: + + serial_type, serial_type_varint_length = decode_varint(total_record_content, current_header_offset) + + self.serial_type_signature += str(get_serial_type_signature(serial_type)) + + record_column_md5_hash_string = total_record_content[current_header_offset: + current_header_offset + serial_type_varint_length] + + body_content = total_record_content[self.header_byte_size:self.byte_size] + + content_size, value = get_record_content(serial_type, body_content, current_body_offset) + + """ + + Note: If content_size == 0 then this will read out no data + + """ + + record_column_md5_hash_string += body_content[current_body_offset:current_body_offset + content_size] + + record_column_md5_hex_digest = get_md5_hash(record_column_md5_hash_string) + + record_column = RecordColumn(column_index, serial_type, serial_type_varint_length, + content_size, value, record_column_md5_hex_digest) + + self.record_columns.append(record_column) + + current_header_offset += serial_type_varint_length + current_body_offset += content_size + column_index += 1 + + +class RecordColumn(object): + + def __init__(self, index, serial_type, serial_type_varint_length, content_size, value, md5_hex_digest): + self.index = index + self.serial_type = serial_type + self.serial_type_varint_length = serial_type_varint_length + self.content_size = content_size + self.value = value + self.md5_hex_digest = md5_hex_digest + + def __repr__(self): + return self.__str__().encode("hex") + + def __str__(self): + return sub("\t", "", sub("\n", " ", self.stringify())) + + def stringify(self, padding=""): + string = padding + "Index: {}\n" \ + + padding + "Serial Type: {}\n" \ + + padding + "Serial Type VARINT Length: {}\n" \ + + padding + "Content Size: {}\n" \ + + padding + "Value: {}\n" \ + + padding + "MD5 Hex Digest: {}" + return string.format(self.index, + self.serial_type, + self.serial_type_varint_length, + self.content_size, + self.value, + self.md5_hex_digest) diff --git a/sqlite_dissect/file/database/utilities.py b/sqlite_dissect/file/database/utilities.py new file mode 100644 index 0000000..bae73e6 --- /dev/null +++ b/sqlite_dissect/file/database/utilities.py @@ -0,0 +1,268 @@ +from logging import getLogger +from math import floor +from sqlite_dissect.constants import LOGGER_NAME +from sqlite_dissect.constants import PAGE_TYPE +from sqlite_dissect.constants import POINTER_MAP_ENTRY_LENGTH +from sqlite_dissect.exception import ParsingError +from sqlite_dissect.file.database.page import IndexInteriorPage +from sqlite_dissect.file.database.page import IndexLeafPage +from sqlite_dissect.file.database.page import PointerMapPage +from sqlite_dissect.file.database.page import TableInteriorPage +from sqlite_dissect.file.database.page import TableLeafPage + +""" + +utilities.py + +This script holds utility functions for dealing with database specific objects such as pages rather than more general +utility methods. 
+ +This script holds the following function(s): +aggregate_leaf_cells(b_tree_page, accounted_for_cell_md5s=None, records_only=False) +create_pointer_map_pages(version, database_size_in_pages, page_size) +get_maximum_pointer_map_entries_per_page(page_size) +get_page_numbers_and_types_from_b_tree_page(b_tree_page) +get_pages_from_b_tree_page(b_tree_page) + +""" + + +def aggregate_leaf_cells(b_tree_page, accounted_for_cell_md5s=None, payloads_only=False): + + """ + + This function will parse through all records across all leaf pages in a b-tree recursively and return a total + number of cells found along with a dictionary of cells where the dictionary is in the form of: + cells[CELL_MD5_HEX_DIGEST] = cell. Therefore, without the accounted for cell md5s specified, + the number of cells will match the length of the records dictionary. + + If the accounted for cell md5s field is set with entries, then those entries will be ignored from the dictionary + but the number of cells will include the number of accounted for cell md5s in it. Therefore, with the accounted + for cell md5s specified, the number of cells will match the length of the records dictionary + the number of + accounted for cell md5s found. + + If the payloads only flag is specified, the dictionary will only contain payloads (ie. records) and not the cells: + cells[CELL_MD5_HEX_DIGEST] = cell.payload. + + Note: As this function name implies, this only parses through the leaf pages of table and index b-tree pages. + Cells of interior pages will be not be handled by this function. + + :param b_tree_page: + :param accounted_for_cell_md5s: + :param payloads_only: + + :return: tuple(number_of_records, records) + + :raise: + + """ + + accounted_for_cell_md5s = set() if accounted_for_cell_md5s is None else accounted_for_cell_md5s + + number_of_cells = 0 + cells = {} + + if isinstance(b_tree_page, TableLeafPage) or isinstance(b_tree_page, IndexLeafPage): + + number_of_cells += len(b_tree_page.cells) + + if payloads_only: + for cell in b_tree_page.cells: + if cell.md5_hex_digest not in accounted_for_cell_md5s: + accounted_for_cell_md5s.add(cell.md5_hex_digest) + cells[cell.md5_hex_digest] = cell.payload + else: + for cell in b_tree_page.cells: + if cell.md5_hex_digest not in accounted_for_cell_md5s: + accounted_for_cell_md5s.add(cell.md5_hex_digest) + cells[cell.md5_hex_digest] = cell + + elif isinstance(b_tree_page, TableInteriorPage) or isinstance(b_tree_page, IndexInteriorPage): + + right_most_page_number_of_records, right_most_page_records = aggregate_leaf_cells(b_tree_page.right_most_page, + accounted_for_cell_md5s, + payloads_only) + number_of_cells += right_most_page_number_of_records + cells.update(right_most_page_records) + + for cell in b_tree_page.cells: + + left_child_page_number_of_records, left_child_page_records = aggregate_leaf_cells(cell.left_child_page, + accounted_for_cell_md5s, + payloads_only) + number_of_cells += left_child_page_number_of_records + cells.update(left_child_page_records) + + else: + + log_message = "Invalid page type found: {} to aggregate cells on.".format(type(b_tree_page)) + getLogger(LOGGER_NAME).error(log_message) + raise ValueError(log_message) + + return number_of_cells, cells + + +def create_pointer_map_pages(version, database_size_in_pages, page_size): + + """ + + + + Note: When calling this function, the caller should have already determined if pointer map pages exist in the file + they are parsing or not. 
This can be done by checking the largest root b-tree page number exists in the + database header. If it does not exist, then pointer map pages are not enabled. This function does not + have any way nor need to check that field and solely computes what the pointer map pages would be off of + the database size in pages and page size. + + :param version: + :param database_size_in_pages: + :param page_size: + + :return: + + """ + + logger = getLogger(LOGGER_NAME) + + maximum_entries_per_page = get_maximum_pointer_map_entries_per_page(page_size) + + number_of_pointer_map_pages = 1 + if database_size_in_pages - 2 > maximum_entries_per_page: + database_pages_left = database_size_in_pages - 2 - maximum_entries_per_page + while database_pages_left > 0: + database_pages_left -= maximum_entries_per_page - 1 + number_of_pointer_map_pages += 1 + + pointer_map_pages = [] + pointer_map_page_number = 2 + number_of_pointer_map_pages = 0 + while pointer_map_page_number < database_size_in_pages: + + number_of_pointer_map_pages += 1 + entries = number_of_pointer_map_pages * maximum_entries_per_page + next_pointer_map_page_number = entries + 2 + number_of_pointer_map_pages + + number_of_entries = maximum_entries_per_page + if next_pointer_map_page_number > database_size_in_pages: + previous_entries = ((number_of_pointer_map_pages - 1) * maximum_entries_per_page) + number_of_entries = database_size_in_pages - previous_entries - number_of_pointer_map_pages - 1 + + pointer_map_pages.append(PointerMapPage(version, pointer_map_page_number, number_of_entries)) + pointer_map_page_number = next_pointer_map_page_number + + if pointer_map_page_number == database_size_in_pages: + log_message = "The next pointer map page number: {} is equal to the database size in pages: {} " \ + "for version: {} resulting in erroneous pointer map pages." + log_message = log_message.format(pointer_map_page_number, database_size_in_pages, version.version_number) + logger.error(log_message) + raise ParsingError(log_message) + + """ + + Iterate through the pointer map pages that were created and tally up all the pointer map pages along with their + pointer map entries. This total should match the total number of pages in the database. + + Note: The first pointer map page in the database is page 2 and therefore the root page always appears before the + first pointer map page at page 2. Below the calculated database pages starts at one to account for the root + database page. + + """ + + calculated_database_pages = 1 + for pointer_map_page in pointer_map_pages: + calculated_database_pages += 1 + calculated_database_pages += pointer_map_page.number_of_entries + + if calculated_database_pages != database_size_in_pages: + log_message = "The calculated number of database pages from the pointer map pages: {} does not equal the " \ + "database size in pages: {} for version: {}." 
+ log_message = log_message.format(calculated_database_pages, database_size_in_pages, version.version_number) + logger.error(log_message) + raise ParsingError(log_message) + + return pointer_map_pages + + +def get_maximum_pointer_map_entries_per_page(page_size): + return int(floor(float(page_size)/POINTER_MAP_ENTRY_LENGTH)) + + +def get_page_numbers_and_types_from_b_tree_page(b_tree_page): + + logger = getLogger(LOGGER_NAME) + + b_tree_page_numbers = {} + + if isinstance(b_tree_page, TableLeafPage): + b_tree_page_numbers[b_tree_page.number] = PAGE_TYPE.B_TREE_TABLE_LEAF + elif isinstance(b_tree_page, IndexLeafPage): + b_tree_page_numbers[b_tree_page.number] = PAGE_TYPE.B_TREE_INDEX_LEAF + elif isinstance(b_tree_page, TableInteriorPage): + b_tree_page_numbers[b_tree_page.number] = PAGE_TYPE.B_TREE_TABLE_INTERIOR + b_tree_page_numbers.update(get_page_numbers_and_types_from_b_tree_page(b_tree_page.right_most_page)) + for b_tree_cell in b_tree_page.cells: + b_tree_page_numbers.update(get_page_numbers_and_types_from_b_tree_page(b_tree_cell.left_child_page)) + elif isinstance(b_tree_page, IndexInteriorPage): + b_tree_page_numbers[b_tree_page.number] = PAGE_TYPE.B_TREE_INDEX_INTERIOR + b_tree_page_numbers.update(get_page_numbers_and_types_from_b_tree_page(b_tree_page.right_most_page)) + for b_tree_cell in b_tree_page.cells: + b_tree_page_numbers.update(get_page_numbers_and_types_from_b_tree_page(b_tree_cell.left_child_page)) + else: + log_message = "The b-tree page is not a BTreePage object but has a type of: {}." + log_message = log_message.format(type(b_tree_page)) + logger.error(log_message) + raise ValueError(log_message) + + if not isinstance(b_tree_page, TableInteriorPage): + for cell in b_tree_page.cells: + if cell.has_overflow: + overflow_page = cell.overflow_pages[cell.overflow_page_number] + b_tree_page_numbers[overflow_page.number] = PAGE_TYPE.OVERFLOW + while overflow_page.next_overflow_page_number: + overflow_page = cell.overflow_pages[overflow_page.next_overflow_page_number] + b_tree_page_numbers[overflow_page.number] = PAGE_TYPE.OVERFLOW + + return b_tree_page_numbers + + +def get_pages_from_b_tree_page(b_tree_page): + + """ + + + + Note: The b-tree page sent in is included in the return result. + + :param b_tree_page: + + :return: + + """ + + logger = getLogger(LOGGER_NAME) + + b_tree_pages = [] + + if isinstance(b_tree_page, TableLeafPage) or isinstance(b_tree_page, IndexLeafPage): + b_tree_pages.append(b_tree_page) + elif isinstance(b_tree_page, TableInteriorPage) or isinstance(b_tree_page, IndexInteriorPage): + b_tree_pages.append(b_tree_page) + b_tree_pages.extend(get_pages_from_b_tree_page(b_tree_page.right_most_page)) + for b_tree_cell in b_tree_page.cells: + b_tree_pages.extend(get_pages_from_b_tree_page(b_tree_cell.left_child_page)) + else: + log_message = "The b-tree page is not a BTreePage object but has a type of: {}." 
+ log_message = log_message.format(type(b_tree_page)) + logger.error(log_message) + raise ValueError(log_message) + + if not isinstance(b_tree_page, TableInteriorPage): + for cell in b_tree_page.cells: + if cell.has_overflow: + overflow_page = cell.overflow_pages[cell.overflow_page_number] + b_tree_pages.append(overflow_page) + while overflow_page.next_overflow_page_number: + overflow_page = cell.overflow_pages[overflow_page.next_overflow_page_number] + b_tree_pages.append(overflow_page) + + return b_tree_pages diff --git a/sqlite_dissect/file/file_handle.py b/sqlite_dissect/file/file_handle.py new file mode 100644 index 0000000..7791ceb --- /dev/null +++ b/sqlite_dissect/file/file_handle.py @@ -0,0 +1,262 @@ +import os +from logging import getLogger +from re import sub +from warnings import warn +from sqlite_dissect.constants import FILE_TYPE +from sqlite_dissect.constants import LOCK_BYTE_PAGE_START_OFFSET +from sqlite_dissect.constants import LOGGER_NAME +from sqlite_dissect.constants import ROLLBACK_JOURNAL_HEADER_LENGTH +from sqlite_dissect.constants import SQLITE_DATABASE_HEADER_LENGTH +from sqlite_dissect.constants import UTF_8 +from sqlite_dissect.constants import UTF_8_DATABASE_TEXT_ENCODING +from sqlite_dissect.constants import UTF_16BE +from sqlite_dissect.constants import UTF_16BE_DATABASE_TEXT_ENCODING +from sqlite_dissect.constants import UTF_16LE +from sqlite_dissect.constants import UTF_16LE_DATABASE_TEXT_ENCODING +from sqlite_dissect.constants import WAL_HEADER_LENGTH +from sqlite_dissect.constants import WAL_INDEX_HEADER_LENGTH +from sqlite_dissect.file.database.header import DatabaseHeader +from sqlite_dissect.file.journal.header import RollbackJournalHeader +from sqlite_dissect.file.wal.header import WriteAheadLogHeader +from sqlite_dissect.file.wal_index.header import WriteAheadLogIndexHeader + +""" + +file_handle.py + +This script holds the file handle for file objects to be worked with in relation to the database, wal, journal and other +supported file types specified in the FILE_TYPE file types list. + +This script holds the following object(s): +FileHandle(object) + +""" + + +class FileHandle(object): + + def __init__(self, file_type, file_identifier, database_text_encoding=None, file_size=None): + + """ + + Constructor. This constructor initializes this object. + + Note: Either the file name or the file object needs to be specified as the file_identifier. The file name + is derived from the file object in order to derive the file size of the object by calling getsize on + the file name as well as for informational and logging purposes. + + :param file_type: str The type of the file. Must be one of the file types in the FILE_TYPE list. + :param file_identifier: str or file The full file path to the file to be opened or the file object. + :param database_text_encoding: str The encoding of the text strings in the sqlite database file. + :param file_size: int Optional parameter to supply the file size. + + :raise: IOError If the file_name is specified and upon opening the file: + 1.) the file name specifies a file that does not exist, or + 2.) the file name specified a file that is not a file, or + 3.) the file name is unable to be opened in "rb" mode. + :raise: ValueError If: + 1.) both the file name and file are set, or + 2.) neither the file name or file are set, or + 3.) the file type is not a valid file type. 
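+
+        Example (a hypothetical illustration; the file path is made up and error handling is omitted):
+
+            file_handle = FileHandle(FILE_TYPE.DATABASE, "/path/to/example.sqlite3")
+            database_header = file_handle.header
+            file_handle.close()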
+ + """ + + self._logger = getLogger(LOGGER_NAME) + + self.file_type = file_type + self.file_object = None + self.file_externally_controlled = False + self._database_text_encoding = database_text_encoding + + if isinstance(file_identifier, basestring): + + """ + + Note: The file identifier is the name (full path) of the file if it is an instance of basestring. We check + to make sure the file exists and it is actually a file. + + """ + + if not os.path.exists(file_identifier): + log_message = "The file name specified does not exist: {}".format(file_identifier) + self._logger.error(log_message) + raise IOError(log_message) + + if not os.path.isfile(file_identifier): + log_message = "The file name specified is not a file: {}".format(file_identifier) + self._logger.error(log_message) + raise IOError(log_message) + + try: + self.file_object = open(file_identifier, "rb") + except IOError: + log_message = "Unable to open the file in \"rb\" mode with file name: {}.".format(file_identifier) + self._logger.error(log_message) + raise + + else: + self.file_object = file_identifier + self.file_externally_controlled = True + + if file_size: + self.file_size = file_size + else: + try: + self.file_size = os.fstat(self.file_object.fileno()).st_size + except AttributeError: + # If all else fails, use the seek to the end of the file trick. + self.file_object.seek(0, os.SEEK_END) + self.file_size = self.file_object.tell() + self.file_object.seek(0) + + if self.file_type == FILE_TYPE.DATABASE: + + if self.file_size > LOCK_BYTE_PAGE_START_OFFSET: + log_message = "The file size: {} is >= lock byte offset: {} and the lock byte page is not supported." + self._logger.error(log_message) + raise NotImplementedError(log_message) + + try: + + database_header = DatabaseHeader(self.file_object.read(SQLITE_DATABASE_HEADER_LENGTH)) + + if self._database_text_encoding: + log_message = "Database text encoding specified as: {} when should not be set." + self._logger.error(log_message) + raise ValueError(log_message) + + if database_header.database_text_encoding == UTF_8_DATABASE_TEXT_ENCODING: + self._database_text_encoding = UTF_8 + elif database_header.database_text_encoding == UTF_16LE_DATABASE_TEXT_ENCODING: + self._database_text_encoding = UTF_16LE + elif database_header.database_text_encoding == UTF_16BE_DATABASE_TEXT_ENCODING: + self._database_text_encoding = UTF_16BE + elif database_header.database_text_encoding: + log_message = "The database text encoding: {} is not recognized as a valid database text encoding." + log_message = log_message.format(database_header.database_text_encoding) + self._logger.error(log_message) + raise RuntimeError(log_message) + + self.header = database_header + + except: + log_message = "Failed to initialize the database header." + self._logger.error(log_message) + raise + + elif self.file_type == FILE_TYPE.WAL: + + try: + self.header = WriteAheadLogHeader(self.file_object.read(WAL_HEADER_LENGTH)) + except: + log_message = "Failed to initialize the write ahead log header." + self._logger.error(log_message) + raise + + elif self.file_type == FILE_TYPE.WAL_INDEX: + + try: + self.header = WriteAheadLogIndexHeader(self.file_object.read(WAL_INDEX_HEADER_LENGTH)) + except: + log_message = "Failed to initialize the write ahead log index header." 
+ self._logger.error(log_message) + raise + + elif self.file_type == FILE_TYPE.ROLLBACK_JOURNAL: + + try: + self.header = RollbackJournalHeader(self.file_object.read(ROLLBACK_JOURNAL_HEADER_LENGTH)) + except: + log_message = "Failed to initialize the rollback journal header." + self._logger.error(log_message) + raise + + else: + + log_message = "Invalid file type specified: {}.".format(self.file_type) + self._logger.error(log_message) + raise ValueError(log_message) + + def __repr__(self): + return self.__str__().encode("hex") + + def __str__(self): + return sub("\t", "", sub("\n", " ", self.stringify())) + + def stringify(self, padding="", print_header=True): + string = padding + "File Type: {}\n" \ + + padding + "File Size: {}\n" \ + + padding + "Database Text Encoding: {}" + string = string.format(self.file_type, + self.file_size, + self.database_text_encoding) + if print_header: + string += "\n" + padding + "Header:\n{}".format(self.header.stringify(padding + "\t")) + return string + + @property + def database_text_encoding(self): + return self._database_text_encoding + + @database_text_encoding.setter + def database_text_encoding(self, database_text_encoding): + + if self._database_text_encoding and self._database_text_encoding != database_text_encoding: + log_message = "Database text encoding is set to: {} and cannot be set differently to: {}. " \ + "Operation not permitted." + log_message = log_message.format(self._database_text_encoding, database_text_encoding) + self._logger.error(log_message) + raise TypeError(log_message) + + if database_text_encoding not in [UTF_8, UTF_16LE, UTF_16BE]: + log_message = "The database text encoding: {} is not recognized as a valid database text encoding." + log_message = log_message.format(database_text_encoding) + self._logger.error(log_message) + raise ValueError(log_message) + + self._database_text_encoding = database_text_encoding + + def close(self): + + if self.file_externally_controlled: + + log_message = "Ignored request to close externally controlled file." + self._logger.warn(log_message) + warn(log_message, RuntimeWarning) + + else: + + try: + + self.file_object.close() + + except IOError: + + log_message = "Unable to close the file object." + self._logger.exception(log_message) + raise + + def read_data(self, offset, number_of_bytes): + + if offset >= self.file_size: + log_message = "Requested offset: {} is >= the file size: {}." + log_message = log_message.format(offset, self.file_size) + self._logger.error(log_message) + raise EOFError(log_message) + + if offset + number_of_bytes > self.file_size: + log_message = "Requested length of data: {} at offset {} to {} is > than the file size: {}." + log_message = log_message.format(number_of_bytes, offset, number_of_bytes + offset, self.file_size) + self._logger.error(log_message) + raise EOFError(log_message) + + try: + + self.file_object.seek(offset) + return self.file_object.read(number_of_bytes) + + except ValueError: + log_message = "An error occurred while reading from the file at offset: {} for {} number of bytes." 
+ log_message = log_message.format(offset, number_of_bytes) + self._logger.error(log_message) + raise diff --git a/sqlite_dissect/file/header.py b/sqlite_dissect/file/header.py new file mode 100644 index 0000000..e68471e --- /dev/null +++ b/sqlite_dissect/file/header.py @@ -0,0 +1,42 @@ +from abc import ABCMeta +from abc import abstractmethod +from logging import getLogger +from re import sub +from sqlite_dissect.constants import LOGGER_NAME + +""" + +header.py + +This script holds an abstract class for file header objects to extend and inherit from. File headers such as that +of the wal, journal, and database file headers will extend this class. + +Note: The database file header is the same as the file header for the sqlite database. However, for cases like the wal + file, the file has a file header that is not related to the actual database information and then depending on how + many commits were done with the first page in them, could have many database headers. + +This script holds the following object(s): +SQLiteHeader(object) + +""" + + +class SQLiteHeader(object): + + __metaclass__ = ABCMeta + + def __init__(self): + self.page_size = None + self.md5_hex_digest = None + + def __repr__(self): + return self.__str__().encode("hex") + + def __str__(self): + return sub("\t", "", sub("\n", " ", self.stringify())) + + @abstractmethod + def stringify(self, padding=""): + log_message = "The abstract method stringify was called directly and is not implemented." + getLogger(LOGGER_NAME).error(log_message) + raise NotImplementedError(log_message) diff --git a/sqlite_dissect/file/journal/README.md b/sqlite_dissect/file/journal/README.md new file mode 100644 index 0000000..64d8d91 --- /dev/null +++ b/sqlite_dissect/file/journal/README.md @@ -0,0 +1,56 @@ + +# sqlite_dissect.file.journal + +This package will control parsing and access to the sqlite journal files. + +- header.py +- journal.py + +TODO items for the "journal" package: + +- [ ] Finish UML class diagrams. + +
+ +### header.py +This script holds the header objects for the rollback journal file and page record. + +This script holds the following object(s): +- RollbackJournalHeader(SQLiteHeader) +- RollbackJournalPageRecordHeader(object) +
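+
+A minimal usage sketch for the header object above (the journal file path is hypothetical and error handling
+is omitted):
+
+    from sqlite_dissect.constants import ROLLBACK_JOURNAL_HEADER_LENGTH
+    from sqlite_dissect.file.journal.header import RollbackJournalHeader
+
+    # Read the fixed-length rollback journal header from the start of the journal file and parse it.
+    with open("/path/to/example.sqlite-journal", "rb") as journal_file:
+        rollback_journal_header = RollbackJournalHeader(journal_file.read(ROLLBACK_JOURNAL_HEADER_LENGTH))
+
+    # Print the parsed header fields (header string, page count, nonce, sector size, and so on).
+    print(rollback_journal_header.stringify())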

+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+- [ ] Finish try/except exception handling for struct.error and ord in classes.
+    ##### RollbackJournalHeader Class:
+    - [ ] Investigate invalid rollback journal header strings (warning currently raised).
+    - [ ] Decide how to handle "zeroed-out" headers.
+    - [ ] Calling classes should check the auto-vacuum mode in the database header for validity.
+    - [ ] Investigate why most observed headers are not zero-padded as the SQLite documentation states.
+    - [ ] Check the SQLite documentation for use cases where journals use a different endianness.
+    ##### RollbackJournalPageRecordHeader Class:
+    - [ ] Needs to be implemented.
+
+
+ +### journal.py +This script holds the class to parse the rollback journal file. + +This script holds the following object(s): +- RollbackJournal(object) +
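+
+A minimal usage sketch for the class above (the journal file path is hypothetical; note that the module file
+is named jounal.py in this release):
+
+    from sqlite_dissect.file.journal.jounal import RollbackJournal
+
+    # Open and parse the rollback journal file, then print a summary of its file handle and header.
+    rollback_journal = RollbackJournal("/path/to/example.sqlite-journal")
+    print(rollback_journal.stringify())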

+ +TODO: +- [ ] Documentation improvements. +- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators. +- [ ] Finish try/except exception handling for struct.error in classes. +- [ ] Investigate if rollback journals can store data from multiple transactions. + ##### RollbackJournal Class: + - [ ] Account for the database text encoding in the file handle. + - [ ] This class needs to be fully implemented. + - [ ] Should this be incorporated with the version/version history somehow? + - [ ] The file_size arg may not be needed since it is in the file handle and may be removed + - [ ] Implement the stringify method correctly. + \ No newline at end of file diff --git a/sqlite_dissect/file/journal/__init__.py b/sqlite_dissect/file/journal/__init__.py new file mode 100644 index 0000000..26a0037 --- /dev/null +++ b/sqlite_dissect/file/journal/__init__.py @@ -0,0 +1,10 @@ + +""" + +__init__.py + +This init script will initialize any needed logic for this package. + +This package will control parsing and access to the sqlite journal files. + +""" diff --git a/sqlite_dissect/file/journal/header.py b/sqlite_dissect/file/journal/header.py new file mode 100644 index 0000000..e093c8b --- /dev/null +++ b/sqlite_dissect/file/journal/header.py @@ -0,0 +1,98 @@ +from binascii import hexlify +from logging import getLogger +from struct import unpack +from re import sub +from warnings import warn +from sqlite_dissect.constants import LOGGER_NAME +from sqlite_dissect.constants import ROLLBACK_JOURNAL_ALL_CONTENT_UNTIL_END_OF_FILE +from sqlite_dissect.constants import ROLLBACK_JOURNAL_HEADER_ALL_CONTENT +from sqlite_dissect.constants import ROLLBACK_JOURNAL_HEADER_HEX_STRING +from sqlite_dissect.constants import ROLLBACK_JOURNAL_HEADER_LENGTH +from sqlite_dissect.utilities import get_md5_hash +from sqlite_dissect.file.header import SQLiteHeader + +""" + +header.py + +This script holds the header objects for the rollback journal file and page record. + +This script holds the following object(s): +RollbackJournalHeader(SQLiteHeader) +RollbackJournalPageRecordHeader(object) + +""" + + +class RollbackJournalHeader(SQLiteHeader): + + def __init__(self, rollback_journal_header_byte_array): + + super(RollbackJournalHeader, self).__init__() + + logger = getLogger(LOGGER_NAME) + + if len(rollback_journal_header_byte_array) != ROLLBACK_JOURNAL_HEADER_LENGTH: + log_message = "The rollback journal header byte array of size: {} is not the expected size of: {}." + log_message = log_message.format(len(rollback_journal_header_byte_array), ROLLBACK_JOURNAL_HEADER_LENGTH) + logger.error(log_message) + raise ValueError(log_message) + + self.header_string = rollback_journal_header_byte_array[0:8] + + if self.header_string != ROLLBACK_JOURNAL_HEADER_HEX_STRING.decode("hex"): + + """ + + Instead of throwing an error here, a warning is thrown instead. This is due to the fact that the header + string was found in a few files that did not match the appropriate rollback journal header string. + Additional research needs to be done into what use cases this could lead to and if these are valid use + cases or not. + + """ + + log_message = "The header string is invalid." 
+ logger.warn(log_message) + warn(log_message, RuntimeWarning) + + self.page_count = unpack(b">I", rollback_journal_header_byte_array[8:12])[0] + + if rollback_journal_header_byte_array[8:12] == ROLLBACK_JOURNAL_HEADER_ALL_CONTENT.decode("hex"): + self.page_count = ROLLBACK_JOURNAL_ALL_CONTENT_UNTIL_END_OF_FILE + + self.random_nonce_for_checksum = unpack(b">I", rollback_journal_header_byte_array[12:16])[0] + self.initial_size_of_database_in_pages = unpack(b">I", rollback_journal_header_byte_array[16:20])[0] + self.disk_sector_size = unpack(b">I", rollback_journal_header_byte_array[20:24])[0] + self.size_of_pages_in_journal = unpack(b">I", rollback_journal_header_byte_array[24:28])[0] + + # The page size will be the same size as the "size of pages in journal" attribute of the header. + self.page_size = self.size_of_pages_in_journal + + self.md5_hex_digest = get_md5_hash(rollback_journal_header_byte_array) + + def stringify(self, padding=""): + string = padding + "Header String (Hex): {}\n" \ + + padding + "Page Count: {}\n" \ + + padding + "Random Nonce for Checksum: {}\n" \ + + padding + "Initial Size of Database in Pages: {}\n" \ + + padding + "Disk Sector Size: {}\n" \ + + padding + "Size of Pages in Journal: {}\n" \ + + padding + "MD5 Hex Digest: {}" + return string.format(hexlify(self.header_string), self.page_count, self.random_nonce_for_checksum, + self.initial_size_of_database_in_pages, self.disk_sector_size, + self.size_of_pages_in_journal, self.md5_hex_digest) + + +class RollbackJournalPageRecordHeader(object): + + def __init__(self): + pass + + def __repr__(self): + return self.__str__().encode("hex") + + def __str__(self): + return sub("\t", "", sub("\n", " ", self.stringify())) + + def stringify(self, padding=""): + pass diff --git a/sqlite_dissect/file/journal/jounal.py b/sqlite_dissect/file/journal/jounal.py new file mode 100644 index 0000000..97aa12a --- /dev/null +++ b/sqlite_dissect/file/journal/jounal.py @@ -0,0 +1,32 @@ +from re import sub +from sqlite_dissect.constants import FILE_TYPE +from sqlite_dissect.file.file_handle import FileHandle + +""" + +journal.py + +This script holds the class to parse the rollback journal file. + +This script holds the following object(s): +RollbackJournal(object) + +""" + + +class RollbackJournal(object): + + def __init__(self, file_identifier, file_size=None): + + self.file_handle = FileHandle(FILE_TYPE.ROLLBACK_JOURNAL, file_identifier, file_size=file_size) + + def __repr__(self): + return self.__str__().encode("hex") + + def __str__(self): + return sub("\t", "", sub("\n", " ", self.stringify())) + + def stringify(self, padding=""): + string = padding + "File Handle:\n{}" + string = string.format(self.file_handle.stringify(padding + "\t")) + return string diff --git a/sqlite_dissect/file/schema/README.md b/sqlite_dissect/file/schema/README.md new file mode 100644 index 0000000..688ef3c --- /dev/null +++ b/sqlite_dissect/file/schema/README.md @@ -0,0 +1,138 @@ + +# sqlite_dissect.file.schema + +This package will control parsing and access to the sqlite master schema files. + +- column.py +- master.py +- table.py +- utilities.py + +TODO items for the "schema" package: + +- [ ] Finish UML class diagrams. + +
+ +### column.py +This script holds the objects needed for parsing column related objects to the master schema. + +This script holds the following object(s): +- ColumnDefinition(object) +
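+
+A minimal usage sketch for the class above (the column text is a made-up segment from a hypothetical
+CREATE TABLE statement):
+
+    from sqlite_dissect.file.schema.column import ColumnDefinition
+
+    # Parse the first column definition segment of a CREATE TABLE statement.
+    column_definition = ColumnDefinition(0, "id INTEGER PRIMARY KEY")
+
+    # The normalized column text is available on the parsed object.
+    print(column_definition.column_text)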

+ +TODO: +- [ ] Documentation improvements. +- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators. +- [ ] Create variables/constants for regular expressions used? + ##### ColumnDefinition Class: + - [ ] Improve the handling of finding and skipping comments. + - [ ] Handle column constraints correctly. + - [ ] Address the "(one/\*comment\*/two)" comment use case where sqlite allows this but ignores "two". + - [ ] Decide if static methods should be moved to a utility class (ie. do they have a reuse need). + - [ ] When getting the next segment index FOREIGN KEY constraints will cause issues when implemented. + - [ ] Test where the trim replaced all whitespace removed for segment in else for data types. + - [ ] Add additional documentation on the "NOT SPECIFIED" being a data type in addition to "INVALID". + - [ ] Address additional token use cases possibly. + - [ ] _get_next_segment_ending_index: The specific data type checking is not needed. + - [ ] _get_next_segment_ending_index: Document that the string should be trimmed. + - [ ] _get_next_segment_ending_index: Check on constraint strings such as "DEFAULT 0". + - [ ] _get_column_affinity: Check if this has duplicate functionality to other utility methods. + ##### ColumnConstraint Class: + - [ ] Implement comments. + - [ ] Needs to be implemented. + +
+ +### master.py +This script holds the main objects used for parsing the master schema and master schema entries (ie. rows). + +This script holds the following object(s): +- MasterSchema(object) +- MasterSchemaRow(object) +- TableRow(MasterSchemaRow) +- OrdinaryTableRow(TableRow) +- VirtualTableRow(TableRow) +- IndexRow(MasterSchemaRow) +- ViewRow(MasterSchemaRow) +- TriggerRow(MasterSchemaRow) +
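+
+The construction of these objects is not shown in this summary, but as a rough sketch of how a parsed schema
+might be inspected (this assumes a MasterSchema instance named master_schema is already available and that
+master_schema_entries iterates over the parsed row objects):
+
+    # Walk the parsed master schema rows and report the b-tree root page number each row points to.
+    for master_schema_entry in master_schema.master_schema_entries:
+        print(master_schema_entry.root_page)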

+ +TODO: +- [ ] Documentation improvements. +- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators. +- [ ] Investigate use cases quotes may be used in sql outside index, table, and column names. +- [ ] Investigate if trigger statements can be used in regards to modifying the master schema. +- [ ] Does it make more sense to have a WithoutRowIdTableRow instead of the OrdinaryTableRow with a flag? +- [ ] All table and index rows should have column definitions of some sort. +- [ ] Create variables/constants for regular expressions used? + ##### MasterSchema Class: + - [ ] Rename master_schema_entries just entries? + - [ ] Check if indexes are created on virtual tables for validation. + - [ ] Check to make sure every index has an associated table. + - [ ] Check to make sure every view has associated tables. + - [ ] Check to make sure trigger has associated tables and/or views. + - [ ] Validation on the master schema entries such as if indexes exist without any tables defined. + - [ ] When adding entries to the master schema entries, check if they already exist or not. + - [ ] Change the master schema entries to be better defined (for example a type keyed dictionary). + - [ ] Additional validation for the 0 root page use case in master_schema_b_tree_root_page_numbers. + - [ ] Remove the "master schema" in front of class attributes? + ##### MasterSchemaRow Class: + - [ ] Validate use cases of 0 or None for root page in rows (see root_page property). + - [ ] Implement comments in virtual tables, index, trigger and view rows once implemented. + - [ ] Address the "(one/*comment*/two)" comment use case where sqlite allows this but ignores "two". + - [ ] Investigate removal of the sql_has_comments flag. + - [ ] The row id is incorporated in the identifier and may be able to change upon alter statements. + - [ ] The root page nomenclature can be confusing since there is a master schema root and b-tree root. + - [ ] master_schema_b_tree_root_page_numbers: Test with a empty schema. + ##### TableRow Class: + - [ ] If a virtual table is found, the database version must be >= 3.8.2. + ##### OrdinaryTableRow Class: + - [ ] Provide better "sqlite_" internal schema object support (may not be needed). + - [ ] Implement parsing of the "AS" use case in the create table statement. + - [ ] The sql parsing is a bit complicated. This should be able to be done easier. + - [ ] During sql parsing use the size of the constraints array to check against instead of a boolean. + ##### VirtualTableRow Class: + - [ ] Provide better support for modules and a ModuleArgument class. Currently a warning is given. + - [ ] Virtual tables are assumed to always have a root page of 0. Investigate and enforce this. + ##### IndexRow Class: + - [ ] Handle the use case of indexes on table rows that have "without rowid" specified on them. + - [ ] Implement "sqlite_autoindex_TABLE_N" index internal schema objects. Currently a warning is given. + - [ ] Implement parsing of index columns. + - [ ] Implement partial indexes. + ##### ViewRow Class: + - [ ] Implement. + - [ ] Check tables exist for view information and validation. + ##### TriggerRow Class: + - [ ] Implement. + - [ ] Check tables and views exist for trigger information and validation. + +
+ +### table.py +This script holds the objects needed for parsing table-related objects belonging to the master schema. + +This script holds the following object(s): +- TableConstraint(object) +<br>
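`TableConstraint` is still a stub (see the TODO below). As a rough idea only, a placeholder mirroring the `ColumnConstraint` pattern used in `column.py` within this changeset could look like the sketch below; the field names here are assumptions, not the final design.

```python
# Hypothetical placeholder only -- TableConstraint is not implemented yet.
# The index/constraint shape mirrors ColumnConstraint in column.py.
class TableConstraint(object):

    def __init__(self, index, constraint):
        # Position of the constraint within the create statement and its raw text.
        self.index = index
        self.constraint = constraint

    def stringify(self, padding=""):
        string = padding + "Index: {}\n" + padding + "Constraint: {}"
        return string.format(self.index, self.constraint)
```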

+ +TODO: +- [ ] Documentation improvements. +- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators. + ##### TableConstraint Class: + - [ ] Needs to be implemented. + +
+ +### utilities.py +This script holds utility functions for dealing with schema-specific objects, such as parsing comments from SQL, +rather than more general utility methods. + +This script holds the following function(s): +- get_index_of_closing_parenthesis(string, opening_parenthesis_offset=0) +- parse_comment_from_sql_segment(sql_segment) +<br>
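To make the first helper above more concrete, the sketch below shows the matching-parenthesis idea it implements: walk forward from an opening `(` while tracking nesting depth so that inner parentheses (for example in `VARCHAR(20)`) are skipped. This is an illustration only, not the library code, and the function name is deliberately different from the real `get_index_of_closing_parenthesis`.

```python
# Illustrative sketch only -- not the library's get_index_of_closing_parenthesis.
def find_closing_parenthesis(text, opening_offset=0):
    depth = 0
    for index in range(opening_offset, len(text)):
        if text[index] == "(":
            depth += 1
        elif text[index] == ")":
            depth -= 1
            if depth == 0:
                # This ")" balances the "(" found at (or after) opening_offset.
                return index
    raise ValueError("No matching closing parenthesis found.")

sql = "CREATE TABLE example (a INTEGER, b VARCHAR(20))"
print(find_closing_parenthesis(sql, sql.index("(")))  # -> 46, the outermost ")"
```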

+ +TODO: +- [ ] Documentation improvements. + \ No newline at end of file diff --git a/sqlite_dissect/file/schema/__init__.py b/sqlite_dissect/file/schema/__init__.py new file mode 100644 index 0000000..4fef9ce --- /dev/null +++ b/sqlite_dissect/file/schema/__init__.py @@ -0,0 +1,10 @@ + +""" + +__init__.py + +This init script will initialize any needed logic for this package. + +This package will control parsing and access to the sqlite master schema files. + +""" diff --git a/sqlite_dissect/file/schema/column.py b/sqlite_dissect/file/schema/column.py new file mode 100644 index 0000000..d38b3e0 --- /dev/null +++ b/sqlite_dissect/file/schema/column.py @@ -0,0 +1,604 @@ +from logging import getLogger +from re import match +from re import sub +from sqlite_dissect.constants import COLUMN_CONSTRAINT_PREFACES +from sqlite_dissect.constants import DATA_TYPE +from sqlite_dissect.constants import LOGGER_NAME +from sqlite_dissect.constants import TYPE_AFFINITY +from sqlite_dissect.exception import MasterSchemaRowParsingError +from sqlite_dissect.file.schema.utilities import get_index_of_closing_parenthesis + +""" + +column.py + +This script holds the objects needed for parsing column related objects to the master schema. + +This script holds the following object(s): +ColumnDefinition(object) + +""" + + +class ColumnDefinition(object): + + def __init__(self, index, column_text, comments=None): + + logger = getLogger(LOGGER_NAME) + + self.index = index + self.column_text = sub("\s\s+", " ", column_text.strip()) + + """ + + When the column text is sent in, the column text starts from the first column name until the "," in the + following form: + "COLUMN_NAME ... ," + + Any comments that may appear before the COLUMN_NAME or after the "," should already be parsed and sent in + through the constructor as the comments field. However, there may still be comments in the column text + itself, where the "...." appear above. These are parsed out here and removing them from the column text. + After the column text has all the comments removed, all multiple whitespace character segments including + newlines, etc. are replaced by single whitespace characters and then the column text is stripped. Comments + are only stripped since the "-- ... \n" comment form cannot have more than the terminating "\n" character + in it and the "/* ... */ segment may have "\n" characters in it for a reason, such as length of the comment. + + The way the comments are parsed out here is done by character and skipping ahead instead of pattern matches + since technically a comment may have another comment form in it. + + Any comment pulled out from any place in the column definition is considered on the column definition level, + and not tied to specific constraints, data types, etc. + + Note: The self.column_text field will be set to the column text sent into this class with only whitespace + modifications to strip the text and replace multiple whitespace characters with a single space, " ". + + """ + + # Setup the field to parse the column text and comments + parsed_column_text = "" + parsed_comments = [] + parsed_comments_total_length = 0 + + # Define an index for the parsing the column text + character_index = 0 + + # Iterate through all of the characters in the column text + while character_index < len(column_text): + + # Get the current indexed character + character = column_text[character_index] + + # Check for the "/* ... 
*/" comment form + if character is "/": + last_comment_character_index = column_text.index("*/", character_index) + 1 + parsed_comment = column_text[character_index:last_comment_character_index + 1] + parsed_comments_total_length += len(parsed_comment) + parsed_comments.append(parsed_comment) + character_index = last_comment_character_index + + # Check for the "-- ... \n" comment form + elif character is "-" and column_text[character_index + 1] == "-": + + """ + + Above, we check to make sure we are encountering a comment by checking the next character as well + for the "-- ... \n" comment. + + Note: A single "-" is allowed since it can be before a negative default value for example in the + create statement. + + """ + + last_comment_character_index = column_text.index("\n", character_index) + parsed_comment = column_text[character_index:last_comment_character_index + 1] + parsed_comments_total_length += len(parsed_comment) + parsed_comments.append(parsed_comment) + character_index = last_comment_character_index + + else: + parsed_column_text += character + + # Increment the character index + character_index += 1 + + # Make sure the parsed lengths add up correctly to the original length + if parsed_comments_total_length + len(parsed_column_text) != len(column_text): + log_message = "Column index: {} with column text: {} of length: {} was not parsed correctly. The length " \ + "of the parsed comments total length was: {} with the following comments: {} and the " \ + "length of the parsed column text was: {} as: {}." + log_message = log_message.format(self.index, column_text, len(column_text), parsed_comments_total_length, + parsed_comments, len(parsed_column_text), parsed_column_text) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + # Update the parsed column text replacing any whitespace with a single " " character and stripping it + parsed_column_text = sub("\s\s+", " ", parsed_column_text.strip()) + + # Check the comments sent in for validity + if comments: + for comment in comments: + if not comment.startswith("--") and not comment.startswith("/*"): + log_message = "Comment specified does not start with the schema comment prefix: {}.".format(comment) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + # Below we strip the comments but if a "\n" happens to be in a "/* ... */", we leave it alone. + self.comments = [comment.strip() for comment in comments] if comments else [] + self.comments += [comment.strip() for comment in parsed_comments] + + # Retrieve the column name and remaining column text after the column name is removed + self.column_name, \ + remaining_column_text = ColumnDefinition._get_column_name_and_remaining_sql(index, parsed_column_text) + + # Setup default values for the column definition fields + self.derived_data_type_name = None + self.data_type = DATA_TYPE.NOT_SPECIFIED + self.column_constraints = [] + + """ + + If there is a remaining column text then we parse through it since there is either at least one data type + or column constraint defined. + + There are 0..1 data types and 0...* column constraints if there is remaining column text. + + Note: The following statements are valid column definitions: + 1.) field previous_field TEXT + 2.) field TEXT INTEGER BLOB + + This was noticed in a database that had a create table statement that had multiple field names but did + not throw an error in SQLite. 
This is because SQLite pulls the first field as the column name and then + takes the string until it hits a column constraint as the whole data type field. In the above examples, + the derived data types would be: + 1.) previous_field TEXT + 2.) TEXT INTEGER BLOB + + SQLite checks for the data type seeing if certain patterns are in this string in a certain order (see + the _get_column_affinity function for more information). Therefore, the affinities of the two examples + above would be: + 1.) TEXT + 2.) INTEGER + + Due to this, we parse out the data type the same way as SQLite. We move through the file until we find + a column constraint or the end of the column definition and then take that as the data type segment to + check on. Keep in mind there are more use cases that are tokenized during this process in SQLite. For + instance, if the column definition "field previous_field TEXT as BLOB" was specified, it would fail in + SQLite since "as" is a word that is identified as a particular use case in addition to column + constraints. This will not be worried about here since this will address all use cases allowed by SQLite + and be a superset of all of the use cases allowed for better compatibility instead of trying to handle + all of the same token use cases in the SQLite library. + + """ + + while len(remaining_column_text): + + # Get the next column definition segment + segment_index = ColumnDefinition._get_next_segment_ending_index(self.index, self.column_name, + remaining_column_text) + + # Make sure an error did not occur retrieving the segment index + if segment_index <= 0 or segment_index > len(remaining_column_text): + log_message = "Column name: {} with index: {} has a segment out of bounds with index: {} when the " \ + "remaining column text is: {} with length: {} from full column text: {}." + log_message = log_message.format(self.column_name, self.index, segment_index, remaining_column_text, + len(remaining_column_text), self.column_text) + logger.error(log_message) + raise IndexError(log_message) + + # Get the next segment + segment = remaining_column_text[:segment_index + 1] + + if (len(segment) == len(remaining_column_text) or match("\w", remaining_column_text[segment_index + 1])) \ + and ColumnDefinition._is_column_constraint_preface(segment): + + """ + + Here we set the column constraints to the rest of the remaining text. + + """ + + # Set the column constraints + self.column_constraints = [remaining_column_text] + + # Set the remaining column text (This will be an empty string but needed to exit from while.) + + """ + + The next step here is to parse the table constraints: + remaining_column_text = remaining_column_text[len(self.column_constraints):] + ... + + """ + + break + + else: + + """ + + The data type may have "(" and ")" characters in it to specify size (size of which is ignored by SQLite + as a side note) and needs to be correctly accounted for. Here we get rid of any whitespace around the + parenthesis and then any leading or trailing whitespace. 
+ + """ + + segment = sub("\s*\(\s*", "(", segment) + segment = sub("\s*\)\s*", ")", segment) + segment = segment.strip() + + # Convert it to all uppercase for the derived data type name + self.derived_data_type_name = segment.upper() + + # Obtain the data type (if possible, otherwise it will be INVALID) from the derived data type name + self.data_type = self._get_data_type(self.derived_data_type_name) + + # Set the remaining column text accounting for the white space character after + remaining_column_text = remaining_column_text[segment_index + 1:] + + self.type_affinity = self._get_column_affinity(self.data_type, self.derived_data_type_name) + + @staticmethod + def _get_column_affinity(data_type, derived_data_type): + + column_type = data_type + + """ + + Below we check if the data type was invalid. If the data type is invalid, it means the original + type statement was not a predefined type. However, SQLite does not check against predefined types. + The SQLite codes does string matches on what was defined to determine affinity. For instance when + defining a table: "CREATE TABLE example (a CHAR, b CHARACTER)", both a and b will be determined to have + both TEXT affinity according to the rules below. Due to this, we set the type to check on back to the + derived data type since that has the original text in it with only some spacing modifications which is + negligible. Since the patterns are matched on case sensitivity, we call upper() on the derived data type. + + """ + + if column_type == DATA_TYPE.INVALID: + column_type = derived_data_type.upper() + + """ + + In order to determine the column affinity from the declared column data type we have to follow the + set of rules from the SQLite Data Type Documentation below in order: + + 1.) If the declared type contains the string "INT" then it is assigned INTEGER affinity. + 2.) If the declared type of the column contains any of the strings "CHAR", "CLOB", or "TEXT" + then that column has TEXT affinity. Notice that the type VARCHAR contains the string "CHAR" and is + thus assigned TEXT affinity. + 3.) If the declared type for a column contains the string "BLOB" or if no type is specified then the column + has affinity BLOB. + 4.) If the declared type for a column contains any of the strings "REAL", "FLOA", or "DOUB" then the column + has REAL affinity. + 5.) Otherwise, the affinity is NUMERIC. + + """ + + if "INT" in column_type: + return TYPE_AFFINITY.INTEGER + elif "CHAR" in column_type or "CLOB" in column_type or "TEXT" in column_type: + return TYPE_AFFINITY.TEXT + elif "BLOB" in column_type or column_type == DATA_TYPE.NOT_SPECIFIED: + return TYPE_AFFINITY.BLOB + elif "REAL" in column_type or "FLOA" in column_type or "DOUB" in column_type: + return TYPE_AFFINITY.REAL + else: + return TYPE_AFFINITY.NUMERIC + + @staticmethod + def _get_column_name_and_remaining_sql(index, column_text): + + # Initialize the logger + logger = getLogger(LOGGER_NAME) + + """ + + Since the column name can be in brackets, backticks, single quotes, or double quotes, we check to make sure + the column name is not in brackets, backticks, single quotes, or double quotes. If it is, our job is fairly + simple, otherwise we parse it normally. + + Note: SQLite allows backticks for compatibility with MySQL and allows brackets for compatibility with + Microsoft databases. 
+ + """ + + if column_text[0] == "`": + + # The column name is surrounded by backticks + match_object = match("^`(.*?)`", column_text) + + if not match_object: + log_message = "No backtick match found for sql column definition: {} with text: {}." + log_message = log_message.format(index, column_text) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + # Set the column name and strip the backticks + column_name = column_text[match_object.start():match_object.end()].strip("`") + + # Set the remaining column text + remaining_column_text = column_text[match_object.end():] + + # Return the column name and remaining column text stripped of whitespace + return column_name, remaining_column_text.strip() + + elif column_text[0] == "[": + + # The column name is surrounded by brackets + match_object = match("^\[(.*?)\]", column_text) + + if not match_object: + log_message = "No bracket match found for sql column definition: {} with text: {}." + log_message = log_message.format(index, column_text) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + # Set the column name and strip the brackets + column_name = column_text[match_object.start():match_object.end()].strip("[]") + + # Set the remaining column text + remaining_column_text = column_text[match_object.end():] + + # Return the column name and remaining column text stripped of whitespace + return column_name, remaining_column_text.strip() + + elif column_text[0] == "\'": + + # The column name is surrounded by single quotes + match_object = match("^\'(.*?)\'", column_text) + + if not match_object: + log_message = "No single quote match found for sql column definition: {} with text: {}." + log_message = log_message.format(index, column_text) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + # Set the column name and strip the single quotes + column_name = column_text[match_object.start():match_object.end()].strip("\'") + + # Set the remaining column text + remaining_column_text = column_text[match_object.end():] + + # Return the column name and remaining column text stripped of whitespace + return column_name, remaining_column_text.strip() + + elif column_text[0] == "\"": + + # The column name is surrounded by double quotes + match_object = match("^\"(.*?)\"", column_text) + + if not match_object: + log_message = "No double quote match found for sql column definition: {} with text: {}." + log_message = log_message.format(index, column_text) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + # Set the column name and strip the double quotes + column_name = column_text[match_object.start():match_object.end()].strip("\"") + + # Set the remaining column text + remaining_column_text = column_text[match_object.end():] + + # Return the column name and remaining column text stripped of whitespace + return column_name, remaining_column_text.strip() + + else: + + """ + + We know now that either the space character is used to separate the column name or the column name + makes up the entirety of the column text if there is no space. 
+ + """ + + if column_text.find(" ") != -1: + + # There is whitespace delimiting the column name + column_name = column_text[:column_text.index(" ")] + + # Parse the remaining column text + remaining_column_text = column_text[column_text.index(" ") + 1:] + + # Return the column name and remaining column text stripped of whitespace + return column_name, remaining_column_text.strip() + + else: + + # The whole column text is just the column name + column_name = column_text + + # The remaining column text should be an empty string but we return it for better interoperability + remaining_column_text = column_text[len(column_text):] + + if remaining_column_text: + log_message = "Column text remaining when none expected for column name: {} with text: {} " \ + "and remaining: {} for index: {}." + log_message = log_message.format(column_name, column_text, remaining_column_text, index) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + # Return the column name and remaining column text stripped of whitespace + return column_name, remaining_column_text.strip() + + @staticmethod + def _get_data_type(derived_data_type): + + # Convert the derived data type to uppercase + derived_data_type = derived_data_type.upper() + + # Remove any parenthesis along with numerical values + derived_data_type = sub("\(.*\)$", "", derived_data_type) + + # Replace spaces with underscores + derived_data_type = sub(" ", "_", derived_data_type) + + for data_type in DATA_TYPE: + + # We remove any numerical values from the end since sqlite does not recognize them in the data types + if sub("_\d+.*$", "", data_type) == derived_data_type: + return data_type + + # If no data type was found we return an invalid data type + return DATA_TYPE.INVALID + + @staticmethod + def _get_next_segment_ending_index(index, column_name, remaining_column_text): + + # Initialize the logger + logger = getLogger(LOGGER_NAME) + + if len(remaining_column_text) == 0: + log_message = "Invalid remaining column text of 0 length found for column index: {} with name: {}: {}." + log_message = log_message.format(index, column_name, remaining_column_text) + logger.error(log_message) + raise ValueError(log_message) + + """ + + Note: We do not want to trim the string ourselves here since we are parsing text and do not know what the + calling logic is doing outside this function. + + """ + + # Make sure all space is trimmed from the front of the remaining column text as it should be + if remaining_column_text[0].isspace(): + log_message = "Invalid remaining column text beginning with a space found for column " \ + "index: {} with name: {}: {}." + log_message = log_message.format(index, column_name, remaining_column_text) + logger.error(log_message) + raise ValueError(log_message) + + # Iterate through the remaining column text to find the next segment + next_segment_ending_index = 0 + while next_segment_ending_index < len(remaining_column_text): + + """ + + Note: Since column constraints are not properly implemented at the moment the following will work for + column data types but in the future, when this is expanded for column constraints, the + constraints will all work the same way according to the documentation except for the FOREIGN KEY + constraint which has content following the closing parenthesis. 
+ + """ + + if remaining_column_text[next_segment_ending_index] == "(": + + # If we find a "(", we return the index of the closing ")" accounting for the following whitespace + return get_index_of_closing_parenthesis(remaining_column_text, next_segment_ending_index) + 1 + + elif remaining_column_text[next_segment_ending_index].isspace(): + + if remaining_column_text[next_segment_ending_index + 1] == "(": + + # If we find a "(", return the index of the closing one accounting for the following whitespace + return get_index_of_closing_parenthesis(remaining_column_text, next_segment_ending_index + 1) + 1 + + """ + + We do not have to worry about checking the length of the remaining column text since that is already + done above. However, this function does not properly check for constraint segments such as "DEFAULT 0" + where there still may be content following the initial constraint. However, constraints are not fully + implemented at this time, and when this is returned it will be detected within this class, and the rest + of the string will be used. A TODO has been put at the top of this script in regards to this. + + Note: We know that if there is a space, than there must be characters following that space since + all whitespace was replaced with single whitespaces and the string was trimmed. + + """ + + if ColumnDefinition._is_column_constraint_preface( + remaining_column_text[next_segment_ending_index + 1:]): + + return next_segment_ending_index + + else: + next_segment_ending_index += 1 + + else: + + # Check if this segment index equals the end of the remaining column text and if so, return it + + if next_segment_ending_index + 1 == len(remaining_column_text): + return next_segment_ending_index + + next_segment_ending_index += 1 + + """ + + The next segment was unable to be found + + """ + + log_message = "Was unable to find the next segment for column index: {} with name: {} on {}." + log_message = log_message.format(index, column_name, remaining_column_text) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + @staticmethod + def _is_column_constraint_preface(segment): + + for column_constraint_preface in COLUMN_CONSTRAINT_PREFACES: + + """ + + Note: When the check is done on the segment, we check the next character is not one of the allowed + characters in a column name, data type, etc. to make sure the constraint preface is not the + beginning of a longer name where it is not actually a constraint preface (example: primaryEmail). + The "\w" regular expression when no LOCALE and UNICODE flags are set will be equivalent to the set: + [a-zA-Z0-9_]. 
+ + """ + + # Check to see if the segment starts with the column constraint preface + if segment.upper().startswith(column_constraint_preface): + if not (len(column_constraint_preface) + 1 <= len(segment) + and match("\w", segment[len(column_constraint_preface)])): + return True + + return False + + def __repr__(self): + return self.__str__().encode("hex") + + def __str__(self): + return sub("\t", "", sub("\n", " ", self.stringify())) + + def stringify(self, padding="", print_column_constraints=True): + string = padding + "Column Text: {}\n" \ + + padding + "Index: {}\n" \ + + padding + "Column Name: {}\n" \ + + padding + "Derived Data Type Name: {}\n" \ + + padding + "Data Type: {}\n" \ + + padding + "Type Affinity: {}\n" \ + + padding + "Number of Comments: {}" + string = string.format(self.column_text, + self.index, + self.column_name, + self.derived_data_type_name, + self.data_type, + self.type_affinity, + len(self.comments)) + for comment in self.comments: + string += "\n" + padding + "Comment: {}".format(comment) + if print_column_constraints: + string += "\n" + padding + "Column Constraints: {}".format(self.column_constraints) + return string + + +class ColumnConstraint(object): + + def __init__(self, index, constraint): + + self.index = index + self.constraint = constraint + + def __repr__(self): + return self.__str__().encode("hex") + + def __str__(self): + return sub("\t", "", sub("\n", " ", self.stringify())) + + def stringify(self, padding=""): + string = padding + "Index: {}\n" \ + + padding + "Constraint: {}" + return string.format(self.index, self.constraint) diff --git a/sqlite_dissect/file/schema/master.py b/sqlite_dissect/file/schema/master.py new file mode 100644 index 0000000..f830478 --- /dev/null +++ b/sqlite_dissect/file/schema/master.py @@ -0,0 +1,2327 @@ +from abc import ABCMeta +from abc import abstractmethod +from binascii import hexlify +from collections import namedtuple +from logging import getLogger +from re import match +from re import sub +from warnings import warn +from sqlite_dissect.constants import CREATE_TABLE_CLAUSE +from sqlite_dissect.constants import CREATE_VIRTUAL_TABLE_CLAUSE +from sqlite_dissect.constants import CREATE_INDEX_CLAUSE +from sqlite_dissect.constants import CREATE_UNIQUE_INDEX_CLAUSE +from sqlite_dissect.constants import INDEX_ON_COMMAND +from sqlite_dissect.constants import INDEX_WHERE_CLAUSE +from sqlite_dissect.constants import INTERNAL_SCHEMA_OBJECT_INDEX_PREFIX +from sqlite_dissect.constants import INTERNAL_SCHEMA_OBJECT_PREFIX +from sqlite_dissect.constants import LOGGER_NAME +from sqlite_dissect.constants import MASTER_PAGE_HEX_ID +from sqlite_dissect.constants import MASTER_SCHEMA_COLUMN +from sqlite_dissect.constants import MASTER_SCHEMA_NUMBER_OF_COLUMNS +from sqlite_dissect.constants import MASTER_SCHEMA_ROW_TYPE +from sqlite_dissect.constants import ORDINARY_TABLE_AS_CLAUSE +from sqlite_dissect.constants import SQLITE_MASTER_SCHEMA_ROOT_PAGE +from sqlite_dissect.constants import TABLE_CONSTRAINT_PREFACES +from sqlite_dissect.constants import VIRTUAL_TABLE_USING_CLAUSE +from sqlite_dissect.exception import MasterSchemaParsingError +from sqlite_dissect.exception import MasterSchemaRowParsingError +from sqlite_dissect.file.database.header import InteriorPageHeader +from sqlite_dissect.file.database.page import TableInteriorPage +from sqlite_dissect.file.database.page import TableLeafCell +from sqlite_dissect.file.database.page import TableLeafPage +from sqlite_dissect.file.database.utilities import get_pages_from_b_tree_page 
+from sqlite_dissect.file.schema.column import ColumnDefinition +from sqlite_dissect.file.schema.utilities import parse_comment_from_sql_segment +from sqlite_dissect.file.schema.table import TableConstraint +from sqlite_dissect.file.schema.utilities import get_index_of_closing_parenthesis +from sqlite_dissect.utilities import get_md5_hash + +""" + +master.py + +This script holds the main objects used for parsing the master schema and master schema entries (ie. rows). + +This script holds the following object(s): +MasterSchema(object) +MasterSchemaRow(object) +TableRow(MasterSchemaRow) +OrdinaryTableRow(TableRow) +VirtualTableRow(TableRow) +IndexRow(MasterSchemaRow) +ViewRow(MasterSchemaRow) +TriggerRow(MasterSchemaRow) + +""" + + +class MasterSchema(object): + + MasterSchemaEntryData = namedtuple("MasterSchemaEntryData", + "record_columns row_type sql b_tree_table_leaf_page_number cell") + + def __init__(self, version_interface, root_page): + + logger = getLogger(LOGGER_NAME) + + if root_page.number != SQLITE_MASTER_SCHEMA_ROOT_PAGE: + log_message = "The root page number: {} is not the expected sqlite master schema root page number: {}." + log_message = log_message.format(root_page.number, SQLITE_MASTER_SCHEMA_ROOT_PAGE) + logger.error(log_message) + raise ValueError(log_message) + + if root_page.hex_type != MASTER_PAGE_HEX_ID: + log_message = "The root page hex type: {} is not the expected master page hex: {}." + log_message = log_message.format(hexlify(root_page.hex_type), hexlify(MASTER_PAGE_HEX_ID)) + logger.error(log_message) + raise ValueError(log_message) + + self._version_interface = version_interface + + self.version_number = self._version_interface.version_number + self.page_version_number = self._version_interface.get_page_version(root_page.number) + self.root_page = root_page + self.master_schema_entries = [] + + """ + + The master schema entry data attribute below is a dictionary with up to four keys in it representing each of + the four types of master schema entries: index, table, trigger, and view pointing to an array of row data + where each entry is a MasterSchemaEntryData object describing an entry of that type. + + """ + + database_text_encoding = self._version_interface.database_text_encoding + + if isinstance(self.root_page, TableInteriorPage): + + master_schema_entry_data = MasterSchema._parse_table_interior(self.root_page, database_text_encoding) + + elif isinstance(self.root_page, TableLeafPage): + + master_schema_entry_data = MasterSchema._parse_table_leaf(self.root_page, database_text_encoding) + + else: + + """ + + Note: This case should never occur since we checked above that the root page needs to start with the + master page hex id and a ValueError would have already been thrown if this was not true. This + check is still done just in case. + + """ + + log_message = "The root page is not a table page but is a: {}.".format(type(self.root_page)) + logger.error(log_message) + raise ValueError(log_message) + + if not master_schema_entry_data: + + """ + + There is the use case that no master schema entry data was found (ie. empty/no defined schema). + + Double check this use case by making sure the root page b-tree header has: + 1.) The number of cells on the page set to zero. + 2.) The cell content offset is equal to the page size (meaning there is no page content). + 3.) The b-tree table page is not an interior page (referring that it would have subpages with information). 
+ + """ + + b_tree_root_page_header = self.root_page.header + + if b_tree_root_page_header.number_of_cells_on_page != 0: + log_message = "The b-tree root page header has a cell count of: {} where the master schema entry " \ + "data was not set in version: {}." + log_message = log_message.format(b_tree_root_page_header.number_of_cells_on_page, self.version_number) + logger.error(log_message) + raise MasterSchemaParsingError(log_message) + + if b_tree_root_page_header.cell_content_offset != self._version_interface.page_size: + log_message = "The b-tree root page cell content offset is: {} when it should match the page " \ + "size: {} where the master schema entry data was not set in version: {}." + log_message = log_message.format(b_tree_root_page_header.cell_content_offset, + self._version_interface.page_size, self.version_number) + logger.error(log_message) + raise MasterSchemaParsingError(log_message) + + if isinstance(b_tree_root_page_header, InteriorPageHeader): + log_message = "The b-tree root page is an interior table page where the master schema entry data " \ + "was not set in version: {}." + log_message = log_message.format(self.version_number) + logger.error(log_message) + raise MasterSchemaParsingError(log_message) + + """ + + Next we create dictionaries for both tables and views. + + Table names are unique across both tables and views, however: + 1.) indexes can only be created on tables (not virtual tables or views) + 2.) views are built off of the tables + 3.) triggers can be built off of either tables and/or views but it is helpful to know which + + Therefore, we work with two dictionaries instead of one general table dictionary in the form of: + dictionary[TABLE_NAME] = [MasterSchemaRow] where MasterSchemaRow will be either a TableRow or IndexRow + depending on the respective dictionary. + + Note: Virtual tables will be checked in a different manner to ensure no indexes have been created from it + for validation purposes. + + """ + + master_schema_tables = {} + master_schema_views = {} + + if master_schema_entry_data: + + # Make sure the database text encoding is set. + if not self._version_interface.database_text_encoding: + log_message = "Master schema entries were found, however no database text encoding as been set yet " \ + "as expected in version: {}." + log_message = log_message.format(self.version_number) + logger.error(log_message) + raise MasterSchemaParsingError(log_message) + + """ + + Due to the way each type is built off of each other, we create the entries in the following order: + 1.) Tables + 2.) Indexes + 3.) Views + 4.) Triggers + + Since information from tables in creating indexes is helpful (especially in generating signatures), tables + are created first and then sent into the IndexRow class. The specific table that belongs to the index being + created is then pulled out and check in the IndexRow constructor. This table is not pulled out ahead of + time and sent in by itself since we don't have a good way to get to the index table name until the IndexRow + is created itself. + + Next, all tables are sent into the ViewRow since a view can be made of multiple tables. + + Last, all tables and views are sent into the TriggerRow since a trigger can be across multiple tables + and views. Triggers can be defined on views. Although INSERT, UPDATE, DELETE operations will not work + on views, triggers will cause associated triggers to fire. 
+ + """ + + # Account for table master schema rows + if MASTER_SCHEMA_ROW_TYPE.TABLE in master_schema_entry_data: + for row_type_data in master_schema_entry_data[MASTER_SCHEMA_ROW_TYPE.TABLE]: + + """ + + For tables, we have the choice of two types of tables. The ordinary table and a virtual table. + There are two classes for these: OrdinaryTableRow and VirtualTableRow. Both of these classes + extend the TableRow class but need to be specified differently since they both are parsed + differently. We figure out what type of table we have by checking the beginning of the command. + If the command starts with CREATE_TABLE_COMMAND then the table is a create [ordinary] table + command and if it starts with CREATE_VIRTUAL_TABLE_COMMAND then the table is a virtual table. + + Note: Due to the way the rules work (documented in the table row classes themselves), the + create command at the beginning is always a set static command. All capitals with single + spaces until the table name. Therefore, we can be assured that these checks will work. + + """ + + if row_type_data.sql.startswith(CREATE_TABLE_CLAUSE): + table_row = OrdinaryTableRow(self._version_interface, + row_type_data.b_tree_table_leaf_page_number, + row_type_data.cell, row_type_data.record_columns) + elif row_type_data.sql.startswith(CREATE_VIRTUAL_TABLE_CLAUSE): + table_row = VirtualTableRow(self._version_interface, + row_type_data.b_tree_table_leaf_page_number, + row_type_data.cell, row_type_data.record_columns) + else: + log_message = "Master schema table row with table name: {} has invalid sql: {}." + log_message = log_message.format(row_type_data.sql) + logger.error(log_message) + raise MasterSchemaParsingError(log_message) + + if not table_row: + log_message = "Master schema table row was not set." + logger.error(log_message) + raise MasterSchemaParsingError(log_message) + + self.master_schema_entries.append(table_row) + if table_row.table_name in master_schema_tables: + log_message = "Master schema table row with table name: {} was already specified in table rows." + log_message = log_message.format(table_row.table_name) + logger.error(log_message) + raise MasterSchemaParsingError(log_message) + master_schema_tables[table_row.table_name] = table_row + + # Account for index master schema rows + if MASTER_SCHEMA_ROW_TYPE.INDEX in master_schema_entry_data: + for row_type_data in master_schema_entry_data[MASTER_SCHEMA_ROW_TYPE.INDEX]: + index_row = IndexRow(self._version_interface, row_type_data.b_tree_table_leaf_page_number, + row_type_data.cell, row_type_data.record_columns, master_schema_tables) + self.master_schema_entries.append(index_row) + + # Account for view master schema rows + if MASTER_SCHEMA_ROW_TYPE.VIEW in master_schema_entry_data: + for row_type_data in master_schema_entry_data[MASTER_SCHEMA_ROW_TYPE.VIEW]: + view_row = ViewRow(self._version_interface, + row_type_data.b_tree_table_leaf_page_number, + row_type_data.cell, + row_type_data.record_columns, + master_schema_tables) + self.master_schema_entries.append(view_row) + if view_row.table_name in master_schema_tables: + log_message = "Master schema view row with table name: {} was already specified in table rows." + log_message = log_message.format(view_row.table_name) + logger.error(log_message) + raise MasterSchemaParsingError(log_message) + if view_row.table_name in master_schema_views: + log_message = "Master schema view row with table name: {} was already specified in view rows." 
+ log_message = log_message.format(view_row.table_name) + logger.error(log_message) + raise MasterSchemaParsingError(log_message) + master_schema_views[view_row.table_name] = view_row + + # Account for trigger master schema rows + if MASTER_SCHEMA_ROW_TYPE.TRIGGER in master_schema_entry_data: + for row_type_data in master_schema_entry_data[MASTER_SCHEMA_ROW_TYPE.TRIGGER]: + trigger_row = TriggerRow(self._version_interface, row_type_data.b_tree_table_leaf_page_number, + row_type_data.cell, row_type_data.record_columns, master_schema_tables, + master_schema_views) + self.master_schema_entries.append(trigger_row) + + self.master_schema_pages = get_pages_from_b_tree_page(self.root_page) + self.master_schema_page_numbers = [master_schema_page.number for master_schema_page in self.master_schema_pages] + + def __repr__(self): + return self.__str__().encode("hex") + + def __str__(self): + return sub("\t", "", sub("\n", " ", self.stringify())) + + def stringify(self, padding="", print_master_schema_root_page=True, + print_master_schema_entries=True, print_b_tree_root_pages=True): + string = padding + "Version Number: {}\n" \ + + padding + "Page Version Number: {}\n" \ + + padding + "Master Schema Page Numbers: {}\n" \ + + padding + "Master Schema Entries Length: {}\n" \ + + padding + "Master Schema B-Tree Root Page Numbers: {}" + string = string.format(self.version_number, + self.page_version_number, + self.master_schema_page_numbers, + len(self.master_schema_entries), + self.master_schema_b_tree_root_page_numbers) + if print_master_schema_root_page: + string += "\n" + padding + "Master Schema Root Page:\n{}" + string = string.format(self.root_page.stringify(padding + "\t")) + if print_master_schema_entries: + for master_schema_entry in self.master_schema_entries: + string += "\n" + padding + "Master Schema Entry:\n{}" + string = string.format(master_schema_entry.stringify(padding + "\t"), print_b_tree_root_pages) + return string + + @property + def master_schema_b_tree_root_page_numbers(self): + + """ + + This property will return a list of all of the root page numbers obtained from all master schema entries but + only in the following cases: + 1.) The entry has a root page number and is not None. + 2.) The root page number is not 0. + + Therefore, if the entries were called manually and inspected, master schema entries that are not in + this returned page number list may either be 0 or none. + + Note: Originally there was a method to retrieve root b-tree pages directly from the master schema. This was + changed by just having the master schema report the root page numbers and then have the client retrieve + them as needed from the version interface itself. In regards to pulling out the root pages the following + note was made that still applies: + + Additional investigation needs to be done here to see and confirm exactly where the root page can be + 0 or None. Right now we know of that according to the documentation "rows that define views, + triggers, and virtual tables, the rootpage column is 0 or NULL". We have seen that: + 1.) None seems to be used for triggers and views. + 2.) The root page number 0 seems to be used for virtual tables. + + Again additional investigation needs to be done here but these should be documented and checked. It + may be better to check this in the subclasses themselves instead of here (or both). + + :return: list A list of int data types representing the root page numbers from the master schema entries. 
+ + """ + + return [entry.root_page_number for entry in self.master_schema_entries if entry.root_page_number] + + @staticmethod + def _create_master_schema_entry_data_named_tuple(b_tree_table_leaf_page_number, cell, database_text_encoding): + + logger = getLogger(LOGGER_NAME) + + record_columns = dict(map(lambda x: [x.index, x], cell.payload.record_columns)) + + if MASTER_SCHEMA_COLUMN.TYPE not in record_columns: + log_message = "No type column found in record columns for cell index: {}.".format(cell.index) + logger.error(log_message) + raise MasterSchemaParsingError(log_message) + + if not record_columns[MASTER_SCHEMA_COLUMN.TYPE].value: + log_message = "No type value set in type record column index: {} for cell index: {}." + log_message = log_message.format(record_columns[MASTER_SCHEMA_COLUMN.TYPE].index, cell.index) + logger.error(log_message) + raise MasterSchemaParsingError(log_message) + + row_type = record_columns[MASTER_SCHEMA_COLUMN.TYPE].value.decode(database_text_encoding) + + if MASTER_SCHEMA_COLUMN.SQL not in record_columns: + log_message = "No sql column found in record columns for cell index: {}.".format(cell.index) + logger.error(log_message) + raise MasterSchemaParsingError(log_message) + + """ + + Note: The value in the SQL record column may be None if there is a index internal schema object found. + + """ + + sql_value = record_columns[MASTER_SCHEMA_COLUMN.SQL].value + sql = sql_value.decode(database_text_encoding) if sql_value else None + + return MasterSchema.MasterSchemaEntryData(record_columns, row_type, sql, b_tree_table_leaf_page_number, cell) + + @staticmethod + def _parse_table_interior(b_tree_table_interior_page, database_text_encoding): + + logger = getLogger(LOGGER_NAME) + + pages = [b_tree_table_interior_page.right_most_page] + for b_tree_table_interior_cell in b_tree_table_interior_page.cells: + pages.append(b_tree_table_interior_cell.left_child_page) + + """ + + The master schema entry data attribute below is a dictionary with up to four keys in it representing each of + the four types of master schema entries: index, table, trigger, and view pointing to an array of row data + where each entry is a MasterSchemaEntryData object describing an entry of that type. + + """ + + master_schema_entry_data = {} + + for page in pages: + + if isinstance(page, TableInteriorPage): + returned_master_schema_entry_data = MasterSchema._parse_table_interior(page, database_text_encoding) + elif isinstance(page, TableLeafPage): + returned_master_schema_entry_data = MasterSchema._parse_table_leaf(page, database_text_encoding) + else: + log_message = "Invalid page type found: {} when expecting TableInteriorPage or TableLeafPage." + log_message = log_message.format(type(page)) + logger.error(log_message) + raise MasterSchemaParsingError(log_message) + + if not returned_master_schema_entry_data: + log_message = "Returned master schema entry data was not set." + logger.error(log_message) + raise MasterSchemaParsingError(log_message) + + for row_type, row_type_data in returned_master_schema_entry_data.iteritems(): + if row_type in master_schema_entry_data: + master_schema_entry_data[row_type].extend(row_type_data) + else: + master_schema_entry_data[row_type] = row_type_data + + return master_schema_entry_data + + @staticmethod + def _parse_table_leaf(b_tree_table_leaf_page, database_text_encoding): + + logger = getLogger(LOGGER_NAME) + + """ + + All leaf pages should have at least one cell entry in them unless they are the root page. 
If the leaf page + is the root page, it can have 0 cells indicating no schema. + + """ + + if len(b_tree_table_leaf_page.cells) == 0 and b_tree_table_leaf_page.number != SQLITE_MASTER_SCHEMA_ROOT_PAGE: + log_message = "Length of cells on leaf page is 0 and page number is: {}." + log_message = log_message.format(b_tree_table_leaf_page.number) + logger.error(log_message) + raise MasterSchemaParsingError(log_message) + + """ + + The master schema entry data attribute below is a dictionary with up to four keys in it representing each of + the four types of master schema entries: index, table, trigger, and view pointing to an array of row data + where each entry is a MasterSchemaEntryData object describing an entry of that type. + + """ + + master_schema_entry_data = {} + + for cell in b_tree_table_leaf_page.cells: + entry_data = MasterSchema._create_master_schema_entry_data_named_tuple(b_tree_table_leaf_page.number, cell, + database_text_encoding) + if entry_data.row_type not in master_schema_entry_data: + master_schema_entry_data[entry_data.row_type] = [entry_data] + else: + master_schema_entry_data[entry_data.row_type].append(entry_data) + + return master_schema_entry_data + + +class MasterSchemaRow(object): + + __metaclass__ = ABCMeta + + @abstractmethod + def __init__(self, version_interface, b_tree_table_leaf_page_number, b_tree_table_leaf_cell, record_columns): + + logger = getLogger(LOGGER_NAME) + + if not isinstance(b_tree_table_leaf_cell, TableLeafCell): + log_message = "Invalid cell type found: {} when expecting TableLeafCell." + log_message = log_message.format(type(b_tree_table_leaf_cell)) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + self._version_interface = version_interface + + self.b_tree_table_leaf_page_number = b_tree_table_leaf_page_number + self.version_number = self._version_interface.version_number + self.page_version_number = self._version_interface.get_page_version(self.b_tree_table_leaf_page_number) + + self.row_id = b_tree_table_leaf_cell.row_id + self.row_md5_hex_digest = b_tree_table_leaf_cell.md5_hex_digest + self.record_md5_hex_digest = b_tree_table_leaf_cell.payload.md5_hex_digest + self.record_columns = record_columns + + if len(self.record_columns) != MASTER_SCHEMA_NUMBER_OF_COLUMNS: + log_message = "Invalid number of columns: {} when expected {} for row id: {} of row type: {} on page: {}." + log_message = log_message.format(len(self.record_columns), MASTER_SCHEMA_NUMBER_OF_COLUMNS, + self.row_id, self.row_type, self.b_tree_table_leaf_page_number) + logger.error(log_message) + MasterSchemaRowParsingError(log_message) + + if not self.record_columns[MASTER_SCHEMA_COLUMN.TYPE].value: + log_message = "No master schema column row type value found for row id: {} of row type: {} on page: {}." + log_message = log_message.format(self.row_id, self.row_type, self.b_tree_table_leaf_page_number) + logger.error(log_message) + MasterSchemaRowParsingError(log_message) + + if not self.record_columns[MASTER_SCHEMA_COLUMN.NAME].value: + log_message = "No master schema column name value found for row id: {} of row type: {} on page: {}." + log_message = log_message.format(self.row_id, self.row_type, self.b_tree_table_leaf_page_number) + logger.error(log_message) + MasterSchemaRowParsingError(log_message) + + if not self.record_columns[MASTER_SCHEMA_COLUMN.TABLE_NAME].value: + log_message = "No master schema column table name value found for row id: {} of row type: {} on page: {}." 
+ log_message = log_message.format(self.row_id, self.row_type, self.b_tree_table_leaf_page_number) + logger.error(log_message) + MasterSchemaRowParsingError(log_message) + + # Get the database text encoding + database_text_encoding = version_interface.database_text_encoding + + # The fields are read out as strings for better incorporation with calling classes when hashing since + # if this is not done they are bytearray types and will be unhashable possibly throwing an exception. + self.row_type = self.record_columns[MASTER_SCHEMA_COLUMN.TYPE].value.decode(database_text_encoding) + self.name = self.record_columns[MASTER_SCHEMA_COLUMN.NAME].value.decode(database_text_encoding) + self.table_name = self.record_columns[MASTER_SCHEMA_COLUMN.TABLE_NAME].value.decode(database_text_encoding) + self.root_page_number = self.record_columns[MASTER_SCHEMA_COLUMN.ROOT_PAGE].value + + sql_value = self.record_columns[MASTER_SCHEMA_COLUMN.SQL].value + self.sql = sql_value.decode(database_text_encoding) if sql_value else None + + self.sql_has_comments = False + + self.comments = [] + + if self.sql: + + """ + + Below describes the documentation and assumptions that have been made while parsing the schema. + It is important to keep in mind that these may change in the future or might be different for + older SQLite files. Most of the files being test with are in the range of SQLite version 3.6 to 3.9. + + For the SQLITE_MASTER_TABLE_TYPE the table type could be a normal table or virtual table. + The two SQL commands this would account for would be CREATE TABLE and CREATE VIRTUAL TABLE. + + According to the SQLite File Format Documentation, the following modifications are done to the + SQL commands before storing them into the SQLite master table SQL column: + 1.) The CREATE, TABLE, VIEW, TRIGGER, and INDEX keywords at the beginning of the statement are + converted to all upper case letters. + 2.) The TEMP or TEMPORARY keyword is removed if it occurs after the initial CREATE keyword. + 3.) Any database name qualifier that occurs prior to the name of the object being created is removed. + 4.) Leading spaces are removed. + 5.) All spaces following the first two keywords are converted into a single space. + + To note, number 5 above does not work as exactly worded. The spaces are removed throughout all of + main keywords to the table name. After the table name, all spaces and capitalization are kept as + entered. + + Due to this we don't have to check for the TEMP, TEMPORARY, or database name qualifier such as + main.[DB NAME], temp.[DB NAME], etc. These qualifiers only place the table into the corresponding + opened database (schema name) and then removes this portion of the statement. As a side note, + temporary database files are stored in the temp directory of the user along with any additional files + such as a rollback journal or WAL file. + + Also, virtual tables were not incorporated until SQLite version 3.8.2 and therefore will not appear + in earlier version of SQLite. + + The statement "IF NOT EXISTS" is also removed but not documented in the above for table creation. + Therefore, we do not need to check for this use case. + + """ + + # Check if comments exist + if self.sql.find("--") != -1 or self.sql.find("/*"): + self.sql_has_comments = True + + """ + + Below we make a unique identifier for this master schema entry. This is build from all of the fields in the + master schema entry except for the root page. + + Note: All fields will have a value except for the SQL. 
This could be None but "None" will just be used in + the creation of the identifier. + + """ + + master_schema_entry_identifier_string = "{}{}{}{}".format(self.row_id, self.row_type, self.name, + self.table_name, self.sql) + self.md5_hash_identifier = get_md5_hash(master_schema_entry_identifier_string) + + def __repr__(self): + return self.__str__().encode("hex") + + def __str__(self): + return sub("\t", "", sub("\n", " ", self.stringify())) + + def stringify(self, padding="", print_record_columns=True): + string = padding + "Version Number: {}\n" \ + + padding + "Page Version Number: {}\n" \ + + padding + "B-Tree Table Leaf Page Number: {}\n" \ + + padding + "Row ID: {}\n" \ + + padding + "Row MD5 Hex Digest: {}\n" \ + + padding + "Record MD5 Hex Digest: {}\n" \ + + padding + "Row Type: {}\n" \ + + padding + "Name: {}\n" \ + + padding + "Table Name: {}\n" \ + + padding + "Root Page Number: {}\n" \ + + padding + "SQL: {}\n" \ + + padding + "SQL Has Comments: {}\n" \ + + padding + "MD5 Hash Identifier: {}" + string = string.format(self.version_number, + self.page_version_number, + self.b_tree_table_leaf_page_number, + self.row_id, + self.row_md5_hex_digest, + self.record_md5_hex_digest, + self.row_type, + self.name, + self.table_name, + self.root_page_number, + self.sql, + self.sql_has_comments, + self.md5_hash_identifier) + for comment in self.comments: + string += "\n" + padding + "Comment: {}".format(comment) + if print_record_columns: + for index, record_column in self.record_columns.iteritems(): + string += "\n" \ + + padding + "Record Column {}:\n{}:".format(index, record_column.stringify(padding + "\t")) + return string + + @staticmethod + def _get_master_schema_row_name_and_remaining_sql(row_type, name, sql, remaining_sql_command): + + # Initialize the logger + logger = getLogger(LOGGER_NAME) + + """ + + This method can only be called on table or index types. + + """ + + if row_type not in [MASTER_SCHEMA_ROW_TYPE.TABLE, MASTER_SCHEMA_ROW_TYPE.INDEX]: + log_message = "Invalid row type: {} defined when parsing master schema row name: {} from sql: {} when " \ + "type {} or {} was expected." + log_message = log_message.format(row_type, name, sql, + MASTER_SCHEMA_ROW_TYPE.TABLE, MASTER_SCHEMA_ROW_TYPE.INDEX) + logger.error(log_message) + raise ValueError(log_message) + + """ + + Since the table or index name can be in brackets, backticks, single quotes, or double quotes, we check to + make sure the table or index name is not in single or double quotes. If it is, our job is fairly simple, + otherwise we parse it normally. + + Note: Characters like the '.' character are not allowed since it implies a schema. However, if it is in + brackets, backticks, or quotes (single or double), it is allowed. + + Note: There may be comments following the table name preceding the column definitions, ie. "(...)", portion + of the SQL. If the table name has brackets, backticks, or quotes (single or double) around it, + then this use case is handled in the way the table name is pulled out. However, if there are not + brackets, backticks, or quotes around the table name, the table name and remaining SQL have to be + accounted for differently in the case that there are comments. + + Note: SQLite allows backticks for compatibility with MySQL and allows brackets for compatibility with + Microsoft databases. 
+ + """ + + if remaining_sql_command[0] == "[": + + # The table name or index name is surrounded by brackets + match_object = match("^\[(.*?)\]", remaining_sql_command) + + if not match_object: + log_message = "No bracket match found for {} name in sql for {} row name: {} and sql: {}." + log_message = log_message.format(row_type, row_type, name, sql) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + # Set the parsed name and strip the brackets + parsed_name = remaining_sql_command[match_object.start():match_object.end()].strip("[]") + + # Set the remaining sql + remaining_sql_command = remaining_sql_command[match_object.end():] + + # Return the parsed name and remaining sql command + return parsed_name, remaining_sql_command + + elif remaining_sql_command[0] == "`": + + # The table name or index name is surrounded by backticks + match_object = match("^`(.*?)`", remaining_sql_command) + + if not match_object: + log_message = "No backtick match found for {} name in sql for {} row name: {} and sql: {}." + log_message = log_message.format(row_type, row_type, name, sql) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + # Set the parsed name and strip the backticks + parsed_name = remaining_sql_command[match_object.start():match_object.end()].strip("`") + + # Set the remaining sql + remaining_sql_command = remaining_sql_command[match_object.end():] + + # Return the parsed name and remaining sql command + return parsed_name, remaining_sql_command + + elif remaining_sql_command[0] == "\'": + + # The table name or index name is surrounded by single quotes + match_object = match("^\'(.*?)\'", remaining_sql_command) + + if not match_object: + log_message = "No single quote match found for {} name in sql for {} row name: {} and sql: {}." + log_message = log_message.format(row_type, row_type, name, sql) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + # Set the parsed name and strip the single quotes + parsed_name = remaining_sql_command[match_object.start():match_object.end()].strip("\'") + + # Set the remaining sql + remaining_sql_command = remaining_sql_command[match_object.end():] + + # Return the parsed name and remaining sql command + return parsed_name, remaining_sql_command + + elif remaining_sql_command[0] == "\"": + + # The table name or index name is surrounded by double quotes + match_object = match("^\"(.*?)\"", remaining_sql_command) + + if not match_object: + log_message = "No double quote match found for {} name in sql for {} row name: {} and sql: {}." + log_message = log_message.format(row_type, row_type, name, sql) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + # Set the parsed name and strip the double quotes + parsed_name = remaining_sql_command[match_object.start():match_object.end()].strip("\"") + + # Set the remaining sql + remaining_sql_command = remaining_sql_command[match_object.end():] + + # Return the parsed name and remaining sql command + return parsed_name, remaining_sql_command + + else: + + # Iterate through the characters in the remaining sql command + for index, character in enumerate(remaining_sql_command): + + """ + + This works for both table and index since with indexes: + 1.) Indexes: Following the index name there has to be a newline, space or a comment indicator. + There is no use case for it to be anything else such as the opening parenthesis. + 2.) 
Tables: Following the table name, there may or may not be a space between the table name and + opening parenthesis. There may also be a comment (with or without a space) directly + after the table name. Here will only care in the case it is a comment indicator directly + after the table name without a space. We also check for newlines. + + Note: This may be a bit more time consuming for virtual table module names since at this point you + could just parse out the name by finding the next " " character index as the ending index for + the name. + + Note: A single "-" character is not allowed here as it is within the column definitions such as + default negative integer values, etc. + + """ + + # See if the character is a single space or an opening parenthesis, or comment indicator + if character == '\n' or character == ' ' or character == '(' or character == '-' or character == '/': + + # Check to make sure the full comment indicators were found for "--" and "/*" + if (character == '-' and remaining_sql_command[index + 1] != '-') or \ + (character == '/' and remaining_sql_command[index + 1] != '*'): + + log_message = "Comment indicator '{}' found followed by an invalid secondary comment " \ + "indicator: {} found in {} name in sql for {} row name: {} and sql: {}." + log_message = log_message.format(character, remaining_sql_command[index + 1], + row_type, row_type, name, sql) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + # Set the table name or index name + parsed_name = remaining_sql_command[:index] + + # Set the remaining sql + remaining_sql_command_start_offset = remaining_sql_command.index(parsed_name) + len(parsed_name) + remaining_sql_command = remaining_sql_command[remaining_sql_command_start_offset:] + + # Return the parsed name and remaining sql command + return parsed_name, remaining_sql_command + + # See if the character is a "." since this would apply a schema name which we know shouldn't exist. + elif character == '.': + log_message = "Invalid \'.\' character found in {} name in sql for " \ + "{} row name: {} and sql: {}." + log_message = log_message.format(row_type, row_type, name, sql) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + """ + + Note: The index method could throw an exception if the table name or index name is not found but this + use case is ignored here since we just retrieved it from the remaining SQL command itself. + + """ + + log_message = "No {} name found in sql for {} row name: {} and sql: {}." + log_message = log_message.format(row_type, row_type, name, sql) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + +class TableRow(MasterSchemaRow): + + def __init__(self, version, b_tree_table_leaf_page_number, b_tree_table_leaf_cell, record_columns): + + # Call the superclass to initialize this object + super(TableRow, self).__init__(version, b_tree_table_leaf_page_number, b_tree_table_leaf_cell, record_columns) + + # Initialize the logger + logger = getLogger(LOGGER_NAME) + + # Make sure this is the table row type after initialized by it's superclass + if self.row_type != MASTER_SCHEMA_ROW_TYPE.TABLE: + log_message = "Invalid row type: {} when expecting: {} with name: {}." + log_message = log_message.format(self.row_type, MASTER_SCHEMA_ROW_TYPE.TABLE, self.name) + logger.error(log_message) + raise ValueError(log_message) + + """ + + The SQL is always specified for tables (as well as triggers and views). The majority of indexes also have + the SQL specified. 
However, "internal indexes" created by "unique" or "primary key" constraints on ordinary + tables do not have SQL. + + """ + + # The sql statement must exist for table rows + if not self.sql: + log_message = "SQL does not exist for table row with name: {}." + log_message = log_message.format(self.name) + logger.error(log_message) + raise ValueError(log_message) + + def stringify(self, padding="", print_record_columns=True): + return super(TableRow, self).stringify(padding, print_record_columns) + + @staticmethod + def _get_module_name_and_remaining_sql(name, sql, remaining_sql_command): + return MasterSchemaRow._get_master_schema_row_name_and_remaining_sql(MASTER_SCHEMA_ROW_TYPE.TABLE, name, sql, + remaining_sql_command) + + +class OrdinaryTableRow(TableRow): + + def __init__(self, version, b_tree_table_leaf_page_number, b_tree_table_leaf_cell, record_columns): + + # Call the superclass to initialize this object + super(OrdinaryTableRow, self).__init__(version, b_tree_table_leaf_page_number, + b_tree_table_leaf_cell, record_columns) + + # Initialize the logger + logger = getLogger(LOGGER_NAME) + + # Make sure this is a create table statement + if not self.sql.startswith(CREATE_TABLE_CLAUSE): + log_message = "Invalid sql for create ordinary table statement: {} with name: {}." + log_message = log_message.format(self.sql, self.name) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + # Declare the column definitions and table constraints + self.column_definitions = [] + self.table_constraints = [] + + """ + + Note: The "without rowid" option can not be used in virtual tables. + + Note: Virtual tables do not have any "internal schema objects". + + """ + + self.without_row_id = False + self.internal_schema_object = False + + # Retrieve the sql command to this table and replace all multiple spaces with a single space + sql_command = sub("[\t\r\f\v ][\t\r\f\v ]+", " ", self.sql) + + # Set the create command offset to point to the end of the "create table" statement + create_command_offset = len(CREATE_TABLE_CLAUSE) + + """ + + We take off the "create table" beginning portion of the command here leaving the table name followed by + the column definitions and table constraints with an optional "without rowid" at the end. + + Note: The schema names are never included in the statements themselves since they just redirect which file + the data will be stored in. Schemas act more as file handles to open SQLite files in the driver. + + """ + + # Left strip the "create table" command from the beginning of the create table statement removing any whitespace + remaining_sql_command = str(sql_command[create_command_offset:]).lstrip() + + """ + + We now parse through the remaining SQL command to find the table name. Once we find the table name and set it, + we remove the table name from the remaining SQL command. + + Note: The table and/or column names may be in single or double quotes. For example, quotes need to be used + if a table name has spaces. This is only seen in the SQL statement. These quotes are removed in the + name and table name fields. + + Note: It was observed that there may be or may not be a space between the table name and opening parenthesis. + + Note: There may also be a comment directly following the table name (with or without a space character) before + the column definitions. The SQL function checks for this use + case but does not remove the comment from the returned string. Therefore, it needs to be checked here + for comments. 
+ + Note: The above was noticed with one of the sequence tables automatically created by SQLite in some use cases + was parsed. The following tables are examples of this in the documentation: + 1.) CREATE TABLE sqlite_sequence(name,seq); + 2.) CREATE TABLE sqlite_stat1(tbl,idx,stat); + 3.) CREATE TABLE sqlite_stat2(tbl,idx,sampleno,sample) + 4.) CREATE TABLE sqlite_stat3(tbl,idx,nEq,nLt,nDLt,sample) + 5.) CREATE TABLE sqlite_stat4(tbl,idx,nEq,nLt,nDLt,sample); + + These use cases are "internal schema objects" and any master schema objects with the name beginning + with "sqlite_" these types of objects. The prefix "sqlite_" used in the name of SQLite master schema + rows is reserved for use by SQLite. + + Note: There is no current use case of having "internal schema objects" for virtual tables and therefore + no virtual table name will start with "sqlite_". + + """ + + # Retrieve the table name and remaining sql after the table name is removed + table_name, remaining_sql_command = \ + MasterSchemaRow._get_master_schema_row_name_and_remaining_sql(self.row_type, self.name, self.sql, + remaining_sql_command) + + # Left strip the remaining sql command + remaining_sql_command = remaining_sql_command.lstrip() + + # Make sure the table name was set which may not have if for some reason the remaining sql command + # did not contain a single space character which would not be an acceptable create table statement + if not table_name: + log_message = "The table name was not set while parsing sql for table row name: {} and sql: {}." + log_message = log_message.format(self.name, self.sql) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + # Check the table name is equal to the name as specified in the sqlite documentation + if table_name.lower() != self.name.lower(): + log_message = "For table master schema row: {}, the derived table name: {} from the sql: {} " \ + "does not match the name: {}," + log_message = log_message.format(self.row_id, table_name, self.sql, self.name) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + # Check the table name is equal to the table name as specified in the sqlite documentation + if table_name.lower() != self.table_name.lower(): + log_message = "For table master schema row: {}, the derived table name: {} from the sql: {} " \ + "does not match the table name: {}," + log_message = log_message.format(self.row_id, table_name, self.sql, self.table_name) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + """ + + Check the table name to see if it is a internal schema object starting with "sqlite_". More investigation is + needed for these objects if there are any different use cases that may apply to them. It appears that these + can be parsed just as normal tables. Therefore, we only throw an info message to the logging framework and + continue on. + + """ + + if self.table_name.startswith(INTERNAL_SCHEMA_OBJECT_PREFIX): + self.internal_schema_object = True + + log_message = "Master schema ordinary table row found as internal schema object with name: {}, " \ + "table name: {} and sql: {} and may have use cases that still need to be addressed." + log_message = log_message.format(self.name, self.table_name, self.sql) + logger.info(log_message) + + """ + + The remaining SQL command must now either start with an opening parenthesis "(", a comment indicator, or "AS". + Comment indicators would be either the "--" or "/*" character sequences. 
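+
+        Example (illustrative): after the table name is removed, "CREATE TABLE example(id INTEGER)" leaves
+        "(id INTEGER)" as the remaining sql command, "CREATE TABLE example /* info */ (id INTEGER)" leaves
+        "/* info */ (id INTEGER)", and "CREATE TABLE example AS SELECT * FROM other" leaves
+        "AS SELECT * FROM other".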
+ + Note: At this moment the "as [select-stmt]" is not addressed and if detected, a NotImplementedError + will be thrown. + + Note: Comments are parsed differently for each row. In the case of a normal table row comments can be + anywhere in the create table statement following the name. Therefore the beginning statement: + "CREATE TABLE [NAME]" cannot include any comments, but comments can directly follow the name, + with or without a space. It was also noted that comments will not appear after the ending ")" + parenthesis after the column definitions unless "WITHOUT ROWID" is specified in which case they + will occur even after the "WITHOUT ROWID" SQL. + + """ + + # Check for comments after the table name, before the column definitions + while remaining_sql_command.startswith(("--", "/*")): + comment, remaining_sql_command = parse_comment_from_sql_segment(remaining_sql_command) + self.comments.append(comment.rstrip()) + remaining_sql_command = remaining_sql_command.lstrip() + + # See if the opening parenthesis is not the first character + if remaining_sql_command.find("(") != 0: + + # Check if this remaining sql statement starts with "AS" + if remaining_sql_command[:len(ORDINARY_TABLE_AS_CLAUSE)].upper() == ORDINARY_TABLE_AS_CLAUSE: + log_message = "Create table statement has an \"AS\" clause for master schema table row with " \ + "name: {} and sql: {} and is not implemented." + log_message = log_message.format(self.name, self.sql) + logger.error(log_message) + raise NotImplementedError(log_message) + + # If the remaining sql statement does not hit the above two use cases then this is an erroneous statement + else: + log_message = "Create table statement has an unknown clause for master schema table row with " \ + "name: {} and sql: {}." + log_message = log_message.format(self.name, self.sql) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + """ + + Due to the checks above and the fact that the "AS" use case is not handled yet, we can be assured that + this create statement remaining SQL command is now in the form of: "(...) ...". + + Next we will parse out the column definitions and table constraints between the "(" and "). After this is done, + we will investigate the trailing portion of the create statement past the closing parenthesis if it exists. + + Note: If the "AS" statement was used instead of the opening parenthesis here, the create table statement + would be needed to be parsed differently and not in the form of: "(...) ...". Due to this, there + is not the same concept of a trailing portion of the create statement past the closing parenthesis. + Instead the remaining statement following the "AS" would be a select statement and need to be parsed + as such. + + """ + + # The first thing is to get the closing parenthesis index to the column definitions and table constraints + closing_parenthesis_index = get_index_of_closing_parenthesis(remaining_sql_command) + + # Declare the definitions to be the "(...)" section of the "(...) ..." explained above + definitions = remaining_sql_command[:closing_parenthesis_index + 1] + + # Double check the definitions has a beginning opening parenthesis and ends with a closing parenthesis + if definitions.find("(") != 0 or definitions.rfind(")") != len(definitions) - 1: + log_message = "The definitions are not surrounded by parenthesis as expected for table row with name: {}" \ + "and sql: {} with definitions: {}." 
+ log_message = log_message.format(self.name, self.sql, definitions) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + # Remove the beginning and ending parenthesis and left strip the string in case single whitespace characters + # appear directly after the opening parenthesis and set it back to the definitions. The characters before + # the ending parenthesis are allowed since there could be a "\n" character corresponding to a "--" comment. + definitions = definitions[1:len(definitions) - 1].lstrip() + + """ + + At this point the column definitions, column constraints, and table constraints should be in the format of: + ( column-name [[type-name] [column constraint]] [, ...,] [, table constraint] [, ...,] ) + where the brackets [] represent optional declaration and [, ...,] represents repeats of the previous argument. + + A definition can be a column definition or table constraint: + 1.) A column definition is in the form of: column-name [[type-name] [column constraint]] + column-name [type-name] [COLUMN-CONSTRAINT ....] + 2.) A table constraint is in the form of: [table-constraint] + [TABLE-CONSTRAINT ...] + + In order to parse the column definitions and table constraints we need to break them up in their respective + segments. Since parentheses and commas exist in their respective segments, we cannot simply do a split on + a comma to divide up the sections. In order to break up the sections correctly, we iterate through the + definitions string looking for the commas but if we find an opening parenthesis, skip to the closing + parenthesis ignoring commas if they exist as well as other characters in that portion of the string. Also, + if we find a quote character such as " or ', we need to skip to the following " or ' character. + + According to the documentation it appears that commas separate each segment defining the column definitions + and table constraints and only appear within a pair of opening/closing parenthesis within the segment + otherwise. Therefore we do not make an assumption here, but raise an exception. + + As we move along parsing the individual segments, we check the beginning of each new section (minus leading + whitespace that is removed) if it begins with one of the table constraint prefaces. If it does, we know + that is the end of the column definitions and the start of the table constraints. From the first (if any) + segment matches one of the table constraint prefaces, than that and any following definitions should all be + table constraints and no more column definitions should show up. If any of the following from the first table + constraint here does not begin with a table constraint preface, than an exception will be thrown. + + To note, if the first definition found is a table constraint, than an exception will be thrown as well. Also, + at least one column definition must be present in the definitions in order to be a proper create statement. + According to the documentation, this appears true and therefore if this use case is detected, an exception is + thrown. + + Note: When a table is created it must have at least one column. + + Note: The above documentation does not account for comments. Comments may be found anywhere within the + definitions. However, if quotes are used to define a default value, data type, etc. the comment is + ignored. 
+ + Example: CREATE TABLE example (text_field "TEXT -- I am a text field") + In the above example, the data type is "TEXT -- I am a text field" which resolves to a TEXT + storage class and from SQLite's perspective, there is no comment. + + Note: The above also gives merit to the following use case: + + Example: CREATE TABLE example (text_field "TEXT -- maintenance information") + In the above example, the storage class IS NOT TEXT. It is INTEGER since "int" appears + in the string and is checked for first by SQLite when checking the storage class. + + Note: If a value, or other field has a "," in it, it also gets ignored in the same manner if inside single or + double quotes. As an example usage, this was first noticed in the DEFAULT clause of a column definition + which contained "," characters in the default text string. + + """ + + # Make sure the definitions is not an empty string + if not definitions: + log_message = "No definitions parsed for the table row name: {} and sql: {}." + log_message = log_message.format(self.name, self.sql) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + # Define a index for the column definitions and table constraints + definition_index = 0 + + # Define an index for the parsing the definitions and the beginning definition index + character_index = 0 + beginning_definition_index = 0 + + # Define a boolean for when the table constraints + table_constraints_found = False + + # Initialize comments + column_definition_comments = [] + + # Iterate through all of the characters in the definitions + while character_index < len(definitions): + + # Get the current indexed character + character = definitions[character_index] + + """ + + Check to make sure we are not encountering a comment. + + Note: A single "-" is allowed since it can be before a negative default value for example in the create + statement. + + """ + + if character is "-": + + # Check to make sure the full comment indicator was found for "--" + if definitions[character_index + 1] == "-": + character_index = definitions.index("\n", character_index) + + elif character is "/": + + # Check to make sure the full comment indicator was found for "/*" + if definitions[character_index + 1] != "*": + log_message = "Comment indicator '{}' found followed by an invalid secondary comment " \ + "indicator: {} found in {}." + log_message = log_message.format(character, definitions[character_index + 1], definitions) + logger.error(log_message) + raise MasterSchemaParsingError(log_message) + + character_index = definitions.index("*/", character_index) + 1 + + """ + + Below, we account for column definition comments that may have commas or parenthesis in them in order to + make sure a particular portion of a comment doesn't cause the column definition to be parsed incorrectly. + + This is also done with backticks, single, and double quotes. + + Note: SQLite allows backticks for compatibility with MySQL and allows brackets for compatibility with + Microsoft databases. + + """ + + # Check if the character is an opening bracket, `, and skip to the closing single quote if so + if character == "[": + + try: + + # Set the character index to the closing bracket to this opening one + character_index = definitions.index("]", character_index + 1) + + except ValueError: + + log_message = "No ending \"]\" character found in the definitions: {} starting from index: {} " \ + "while parsing the remaining sql: {} for the table row name: {}." 
+ log_message = log_message.format(definitions, character_index + 1, remaining_sql_command, self.name) + logger.error(log_message) + raise + + # Check if the character is an opening backtick, `, and skip to the closing single quote if so + if character == "`": + + try: + + # Set the character index to the closing backtick to this opening one + character_index = definitions.index("`", character_index + 1) + + except ValueError: + + log_message = "No ending \"`\" character found in the definitions: {} starting from index: {} " \ + "while parsing the remaining sql: {} for the table row name: {}." + log_message = log_message.format(definitions, character_index + 1, remaining_sql_command, self.name) + logger.error(log_message) + raise + + # Check if the character is an opening single quote, ', and skip to the closing single quote if so + if character == "'": + + try: + + # Set the character index to the closing single quote to this opening one + character_index = definitions.index("'", character_index + 1) + + except ValueError: + + log_message = "No ending \"'\" character found in the definitions: {} starting from index: {} " \ + "while parsing the remaining sql: {} for the table row name: {}." + log_message = log_message.format(definitions, character_index + 1, remaining_sql_command, self.name) + logger.error(log_message) + raise + + # Check if the character is an opening double quote, ", and skip to the closing double quote if so + if character == "\"": + + try: + + # Set the character index to the closing double quote to this opening one + character_index = definitions.index("\"", character_index + 1) + + except ValueError: + + log_message = "No ending \"\"\" character found in the definitions: {} starting from index: {} " \ + "while parsing the remaining sql: {} for the table row name: {}." + log_message = log_message.format(definitions, character_index + 1, remaining_sql_command, self.name) + logger.error(log_message) + raise + + # Check if the character is an opening parenthesis and skip to the closing parenthesis if so + if character == "(": + + # Set the character index to the closing parenthesis to this opening one and increment the index + character_index = get_index_of_closing_parenthesis(definitions, character_index) + + # If we find a closing parenthesis character than something went wrong and an exception is thrown + elif character == ")": + log_message = "An error occurred while parsing the remaining sql: {} for the table row name: {}." + log_message = log_message.format(remaining_sql_command, self.name) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + """ + + Above we update the index in the case that we find a opening parenthesis to the closing parenthesis index. + Below we check if the character is a comma or at the end of the definition string in order to make the next + and/or possibly final definition. + + 1.) If the character is a comma then we know we reached that end of the portion of the definition in the + definition string and there are more that follow. + 2.) If the character index + 1 == len(definitions) which means the character index is pointing to the last + element in the array and on the while loop will break on the next iteration. In this case we make the + remaining segment into a definition. + + """ + + # Check if we find a comma character and if so we know we reached the end of the current definition or + # if the character index is either at the end of the definitions string. 
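+            # Illustrative example (not taken from the sqlite documentation): for the definitions string
+            # "id INTEGER, name TEXT DEFAULT 'a,b', FOREIGN KEY (id, name) REFERENCES other (id, name)" the comma
+            # inside the single quoted default value and the commas inside the foreign key parenthesis are skipped
+            # by the checks above, so the definitions are split into three segments: two column definitions
+            # followed by one table constraint.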
+ if character == "," or character_index + 1 == len(definitions): + + # Initialize a variable to add to the character index if comments are found after the comma + ending_comments_length = 0 + + # If the character index is one length than the length of the definitions (at the end of the + # definitions string) then we want to increment it one in order to pick up the last character. + # This is due to the array for strings being exclusive to the last index specified. + if character_index + 1 == len(definitions): + character_index += 1 + + # Check if there are comments if there was a comma + else: + + """ + + For column definitions and table constraints, we will only parse out the comments and send them + into the constructor if they start out the definition or directly follow a ",". Any other comments, + will not be parsed here and will instead be sent into the column definition or table constraint + class for parsing. This was decided to be the best way to associate comments based on location in + the create table statement based on location. + + Example 1: CREATE TABLE example_1 ( -- field for text + text_field) + Here the comment will be parsed out and sent in to the column definition constructor. + + Example 2: CREATE TABLE example_2 ( text_field, /* text field */ integer_field, + /* integer field */ ) + Here the "/* text field */" comment will be sent in as a comment to the text_field + column definition. The same will be true for the "/* integer field */" comment for the + integer_field. + + Example 3: CREATE TABLE example_3 ( text_field + -- field for text) + Here the comment will be included in the column definition string and not parsed as a + separate comment since there is no "," character even though it's on the next line. + + Example 4: CREATE TABLE example_4 ( text_field, + -- field for text + integer_field + -- field for integer) + Here, both comments will be sent in the column definition string for the integer_field + and not parsed separate since the first comment is after the "," and the second comment + is before (although no following fields are specified here) the next ",". Even though + it can be seen that this may not be correct, the pattern above does not follow a + consistent pattern and is against what was considered the best way to parse schema + comments. + + Example 5: CREATE TABLE example_5 (text_field, -- field for text + /* this is a field for text */ + integer_field -- field for integer) + Here, the "-- field for text" comment on the first line will be parsed and sent into the + column definition for the text_field. However the "/* this is a field for text */" + comment will be parsed and sent into the second column definition. The final comment + "-- field for integer" will be sent in along with the integer_field as part of the + column definition string. + + In summation, comments right in the beginning or directly following a "," in the definitions will be + parsed separate and sent in through the constructor of the corresponding column definition or table + constraint. Otherwise, the comment will be send in as part of the definition string to the + appropriate class and leave that class up to parse the inner comments to that definition. + + Note: The reason why comments preceding the definition had to be parsed was to pull out extra + content from the beginning of the column definition or table constraint in order to be able + to detect if it was a table constraint or not. 
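+
+                    For instance (illustrative), a definition segment of "/* keys */ PRIMARY KEY (id)" has the
+                    leading comment pulled off first so that the table constraint preface check below runs
+                    against "PRIMARY KEY (id)" rather than against the comment text.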
+ + Note: This means in the above form of parsing comments that there can be many "/* ... */" comments + as long as a "\n" does not appear following the ",". This means that as soon as there is a + "-- ... \n" comment, the parsing will end. This also means that there will always be at most + one "-- ... \n" comment and the end of the statement following the ",". + + """ + + # Get the remaining definition past the comma + remaining_definition = definitions[character_index + 1:] + left_stripped_character_length = len(remaining_definition) + remaining_definition = sub("^[\t\r\f\v ]+", "", remaining_definition) + left_stripped_character_length -= len(remaining_definition) + + # See if any comments in the form "/* ... */" exist and remove them if so (there may be 0 ... *) + while remaining_definition.startswith("/*"): + comment, remaining_definition = parse_comment_from_sql_segment(remaining_definition) + left_stripped_character_length += len(remaining_definition) + remaining_definition = remaining_definition.lstrip(" ") + left_stripped_character_length -= len(remaining_definition) + ending_comments_length += len(comment) + left_stripped_character_length + column_definition_comments.append(comment) + + # See if any comments in the form "-- ... \n" exist and remove them if so (there may be 0 ... 1) + if remaining_definition.startswith("--"): + comment, remaining_definition = parse_comment_from_sql_segment(remaining_definition) + left_stripped_character_length += len(remaining_definition) + remaining_definition = remaining_definition.lstrip(" ") + left_stripped_character_length -= len(remaining_definition) + ending_comments_length += len(comment) + left_stripped_character_length + column_definition_comments.append(comment) + + # Initialize a current definition index to validate against later + current_definition_index = definition_index + + # Get the definition string and strip the beginning characters since we do not need any + # default whitespace characters there, but may need them at the end (for example in the case + # of a "--" comment that ends in "\n". + definition = definitions[beginning_definition_index:character_index].lstrip() + + # Check for comments after the beginning of the definition + while definition.startswith(("--", "/*")): + comment, remaining_definition = parse_comment_from_sql_segment(definition) + column_definition_comments.append(comment.rstrip()) + definition = remaining_definition.lstrip() + + # Iterate through the table constraint prefaces and make sure none of them start off the definition + for table_constraint_preface in TABLE_CONSTRAINT_PREFACES: + + # Make sure the length of the definition is at least as long as the table constraint preface + if len(definition) >= len(table_constraint_preface): + + """ + + Note: Even though the column and table constraint share some of the same prefaces for + their constraints, this check is safe since the column definitions will never + start out directly with a column constraint preface name that could be confused with + a table constraint preface name. + + Note: When the check is done on the definition, we check the next character is not one of the + allowed characters in a column name to make sure the constraint preface is not the + beginning of a longer column name where it is not actually a constraint preface + (example: primaryEmail). The "\w" regular expression when no LOCALE and UNICODE flags + are set will be equivalent to the set: [a-zA-Z0-9_]. 
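+
+                        For instance (illustrative, assuming "PRIMARY" is one of the table constraint prefaces):
+                        for the definition "PRIMARY KEY (id)" the character following the preface is a space, so
+                        the "\w" match fails and the definition is treated as a table constraint, whereas for the
+                        definition "primaryEmail TEXT" the character following the preface is "E", so the "\w"
+                        match succeeds and the definition is treated as a column definition.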
+ + """ + + # Check to see if the definition starts with the table constraint preface + if definition[:len(table_constraint_preface)].upper() == table_constraint_preface: + + if not (len(table_constraint_preface) + 1 <= len(definition) + and match("\w", definition[len(table_constraint_preface)])): + + # We have found a table constraint here and make sure this is not the first definition + if definition_index == 0: + + # The first definition is a table constraint which should not occur + log_message = "First definition found: {} in table row with name: {} and sql: {} " \ + "is a table constraint." + log_message = log_message.format(definition[:len(table_constraint_preface)], + self.name, self.sql) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + # The definition is a table constraint and not the first definition + else: + + """ + + Note: Since we are here we assume the first column definition has already been made + because at least one of them had to be parsed successfully before reaching + this portion of the code. Therefore no additional checks need to be done + for checking at least one column definition existing. + + """ + + # Create the table constraint + self.table_constraints.append(TableConstraint(definition_index, definition, + column_definition_comments)) + + # Set the table constraints found variable to true now + table_constraints_found = True + + # Reinitialize the comments + column_definition_comments = [] + + # Increment the definition index + definition_index += 1 + + """ + + After each parsing of the definition we check if that was a table constraint. If it was we make sure + that the first table constraint and all ones following it are. If this iteration is not a table + constraint, that means no table constraints should have been found yet and it is a normal column + definition. + + """ + + # Check if table constraint has not been found yet (previously or on this iteration) + if not table_constraints_found: + + """ + + This definition is a column definition. + + Make sure the index was not incremented since no table constraint was made. + + """ + + # Make sure the definition index has not changed + if current_definition_index != definition_index: + log_message = "The definition index: {} was updated indicating a table constraint was " \ + "made when it should be: {} for a column definition in table row with " \ + "name: {} and sql: {}." + log_message = log_message.format(definition_index, current_definition_index, + self.name, self.sql) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + # Create the column definition + self.column_definitions.append(ColumnDefinition(definition_index, definition, + column_definition_comments)) + + # Reinitialize the comments to the next segments columns + column_definition_comments = [] + + # Increment the definition index + definition_index += 1 + + # Make sure the table constraint was made + else: + + """ + + This definition is a table constraint. + + Make sure the index was incremented since the table constraint was made. + + """ + + # Check that the definition index was incremented meaning a table constraint was made + if current_definition_index + 1 != definition_index: + log_message = "The definition index: {} was not updated indicating a column definition was " \ + "made when it should be: {} for a table constraint in table row with " \ + "name: {} and sql: {}." 
+ log_message = log_message.format(definition_index, current_definition_index + 1, + self.name, self.sql) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + # Update the beginning definition and character indexes + character_index += ending_comments_length + 1 + beginning_definition_index = character_index + + # The character is just a normal character + else: + + # Increment the character index + character_index += 1 + + """ + + Lastly, if there is remaining SQL, we check to make sure it is the "without rowid" statement. If it is not, + then an exception will be thrown since that is the only use case allowed here according to the SQLite + documentation. + + """ + + # Last get the remaining sql command to check for the "without rowid" use case + remaining_sql_command = remaining_sql_command[closing_parenthesis_index + 1:].lstrip() + + # See if the remaining sql command has any content left + if len(remaining_sql_command) != 0: + + """ + + Note: Below we check for comments before, in between and after the "without rowid" statement. We only + check for comments assuming we have the "without rowid" specified. This is due to the fact that + if the "without rowid" is not specified, any comments following the end of the column definitions + are ignored in the create table statement by SQLite. Only when "without rowid" is specified, are + comments recognized. + + """ + + # Check for comments after the end of the column definitions before the "without rowid" + while remaining_sql_command.startswith(("--", "/*")): + comment, remaining_sql_command = parse_comment_from_sql_segment(remaining_sql_command) + self.comments.append(comment.rstrip()) + remaining_sql_command = remaining_sql_command.lstrip() + + # If there is content left, check if it is the "without rowid" string by seeing if it starts with "without" + if remaining_sql_command.upper().startswith("WITHOUT"): + + remaining_sql_command = remaining_sql_command[len("WITHOUT"):].lstrip() + + # Check for comments after the end of the column definitions before the "without rowid" + while remaining_sql_command.startswith(("--", "/*")): + comment, remaining_sql_command = parse_comment_from_sql_segment(remaining_sql_command) + self.comments.append(comment.rstrip()) + remaining_sql_command = remaining_sql_command.lstrip() + + if remaining_sql_command.upper().startswith("ROWID"): + + remaining_sql_command = remaining_sql_command[len("ROWID"):].lstrip() + + # Set the without row id variable to true + self.without_row_id = True + + # Check for comments at the end + while remaining_sql_command.startswith(("--", "/*")): + comment, remaining_sql_command = parse_comment_from_sql_segment(remaining_sql_command) + self.comments.append(comment.rstrip()) + remaining_sql_command = remaining_sql_command.lstrip() + + # Make sure we are at the end + if len(remaining_sql_command) != 0: + log_message = "Invalid sql ending: {} found when nothing more expected in " \ + "table row with name: {} and sql: {}." + log_message = log_message.format(remaining_sql_command, self.name, self.sql) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + else: + log_message = "Invalid sql ending: {} found after \"WITHOUT\" when \"ROWID\" expected in " \ + "table row with name: {} and sql: {}." 
+ log_message = log_message.format(remaining_sql_command, self.name, self.sql) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + # The remaining string is not the "without rowid" string which, according to sqlite documentation, + # should not occur + else: + log_message = "Invalid sql ending: {} found in table row with name: {} and sql: {}." + log_message = log_message.format(remaining_sql_command, self.name, self.sql) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + """ + + Until the "without rowid" is fully implemented, we will throw a warning here. Tables without a row id have + all of their data stored in index b-tree pages rather than table b-tree pages. Also, the ordering of the + columns are switched around depending on what field(s) the primary key is comprised of and where those fields + are in the column definitions. + + """ + + if self.without_row_id: + log_message = "A table specified without a row id was found in table row with name: {} and sql: {}. " \ + "This use case is not fully implemented." + log_message = log_message.format(self.name, self.sql) + logger.warn(log_message) + warn(log_message, RuntimeWarning) + + def stringify(self, padding="", print_record_columns=True, + print_column_definitions=True, print_table_constraints=True): + string = "\n" \ + + padding + "Without Row ID: {}\n" \ + + padding + "Internal Schema Object: {}\n" \ + + padding + "Column Definitions Length: {}\n" \ + + padding + "Table Constraints Length: {}" + string = string.format(self.without_row_id, + self.internal_schema_object, + len(self.column_definitions), + len(self.table_constraints)) + string = super(OrdinaryTableRow, self).stringify(padding, print_record_columns) + string + if print_column_definitions: + for column_definition in self.column_definitions: + string += "\n" \ + + padding + "Column Definition:\n{}".format(column_definition.stringify(padding + "\t")) + if print_table_constraints: + for table_constraint in self.table_constraints: + string += "\n" \ + + padding + "Table Constraint:\n{}".format(table_constraint.stringify(padding + "\t")) + return string + + +class VirtualTableRow(TableRow): + + def __init__(self, version, b_tree_table_leaf_page_number, b_tree_table_leaf_cell, record_columns): + + # Call the superclass to initialize this object + super(VirtualTableRow, self).__init__(version, b_tree_table_leaf_page_number, + b_tree_table_leaf_cell, record_columns) + + # Initialize the logger + logger = getLogger(LOGGER_NAME) + + # Make sure this is a create virtual table statement + if not self.sql.startswith(CREATE_VIRTUAL_TABLE_CLAUSE): + log_message = "Invalid sql for create virtual table statement: {} with name: {}." + log_message = log_message.format(self.sql, self.name) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + """ + + Note: The "without rowid" option can not be used in virtual tables. + + Note: Virtual tables do not have any "internal schema objects". + + """ + + # Retrieve the sql command to this table and replace all multiple spaces with a single space + sql_command = sub("[\t\r\f\v ][\t\r\f\v ]+", " ", self.sql) + + # Set the create command offset to point to the end of the "create virtual table" statement + create_command_offset = len(CREATE_VIRTUAL_TABLE_CLAUSE) + + """ + + We take off the "create virtual table" beginning portion of the command here leaving the table name followed by + the "using" statement and then the module arguments. 
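+
+        Example (illustrative): for the statement "CREATE VIRTUAL TABLE lookup USING fts4(content TEXT)" the
+        remaining sql command at this point is "lookup USING fts4(content TEXT)", leaving the table name, the
+        "using" clause with the module name, and the module arguments still to be parsed below.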
+ + Note: The schema names are never included in the statements themselves since they just redirect which file + the data will be stored in. Schemas act more as file handles to open sqlite files in the driver. + + """ + + # Left strip the "create table" command from the beginning of the create table statement removing any whitespace + remaining_sql_command = str(sql_command[create_command_offset:]).lstrip() + + """ + + We now parse through the remaining SQL command to find the table name. Once we find the table name and set it, + we remove the table name from the remaining SQL command. + + Note: The table and/or column names may be in single or double quotes. For example, quotes need to be used + if a table name has spaces. This is only seen in the SQL statement. These quotes are removed in the + name and table name fields. + + Note: It was observed that there may be or may not be a space between the table name and opening parenthesis. + + Note: The above was noticed with one of the sequence tables automatically created by SQLite in some use cases + was parsed. The following tables are examples of this in the documentation: + 1.) CREATE TABLE sqlite_sequence(name,seq); + 2.) CREATE TABLE sqlite_stat1(tbl,idx,stat); + 3.) CREATE TABLE sqlite_stat2(tbl,idx,sampleno,sample) + 4.) CREATE TABLE sqlite_stat3(tbl,idx,nEq,nLt,nDLt,sample) + 5.) CREATE TABLE sqlite_stat4(tbl,idx,nEq,nLt,nDLt,sample); + + These use cases are "internal schema objects" and any master schema objects with the name beginning + with "sqlite_" these types of objects. The prefix "sqlite_" used in the name of SQLite master schema + rows is reserved for use by SQLite. + + Note: There is no current use case of having "internal schema objects" for virtual tables and therefore + no virtual table name will start with "sqlite_". + + """ + + # Retrieve the table name and remaining sql after the table name is removed + table_name, remaining_sql_command = \ + MasterSchemaRow._get_master_schema_row_name_and_remaining_sql(self.row_type, self.name, self.sql, + remaining_sql_command) + + # Left strip the remaining sql command + remaining_sql_command = remaining_sql_command.lstrip() + + # Check the table name is equal to the name as specified in the sqlite documentation + if table_name.lower() != self.name.lower(): + log_message = "For virtual table master schema row: {}, the derived table name: {} from the sql: {} " \ + "does not match the name: {}," + log_message = log_message.format(self.row_id, table_name, self.sql, self.name) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + # Check the table name is equal to the table name as specified in the sqlite documentation + if table_name.lower() != self.table_name.lower(): + log_message = "For virtual table master schema row: {}, the derived table name: {} from the sql: {} " \ + "does not match the table name: {}," + log_message = log_message.format(self.row_id, table_name, self.sql, self.table_name) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + """ + + Check the virtual table name to see if it is a internal schema object starting with "sqlite_". Since this + is not expected for a virtual table, a error will be raised if detected. + + """ + + if self.table_name.startswith(INTERNAL_SCHEMA_OBJECT_PREFIX): + log_message = "Master schema virtual table row found as internal schema object with name: {}, " \ + "table name: {} and sql: {} which should not occur." 
+ log_message = log_message.format(self.name, self.table_name, self.sql) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + """ + + The remaining SQL command must now either start with "using" which may be mixed-case or an opening + parenthesis "(", or a comment indicator. Comment indicators would be either the "--" or "/*" character + sequences. + + """ + + # Check for comments after the virtual table name, before the using clause + while remaining_sql_command.startswith(("--", "/*")): + comment, remaining_sql_command = parse_comment_from_sql_segment(remaining_sql_command) + self.comments.append(comment.rstrip()) + remaining_sql_command = remaining_sql_command.lstrip() + + # Declare the module arguments + self.module_arguments = [] + + # Check if this remaining sql statement starts with "AS" + if remaining_sql_command[:len(VIRTUAL_TABLE_USING_CLAUSE)].upper() != VIRTUAL_TABLE_USING_CLAUSE: + log_message = "Create virtual table statement does not have a \"USING\" clause for master schema " \ + "table row with name: {} and sql: {}." + log_message = log_message.format(self.name, self.sql) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + # Remove the using prefix and left strip any whitespace + remaining_sql_command = remaining_sql_command[len(VIRTUAL_TABLE_USING_CLAUSE):].lstrip() + + # Check for comments after the using clause, before the module name + while remaining_sql_command.startswith(("--", "/*")): + comment, remaining_sql_command = parse_comment_from_sql_segment(remaining_sql_command) + self.comments.append(comment.rstrip()) + remaining_sql_command = remaining_sql_command.lstrip() + + # Declare the module arguments + self.module_arguments = [] + + # Retrieve the module name and remaining sql after the module name is removed + self.module_name, remaining_sql_command = TableRow._get_module_name_and_remaining_sql(self.name, self.sql, + remaining_sql_command) + + # Left strip the remaining sql command + remaining_sql_command = remaining_sql_command.lstrip() + + # Check for comments after the module name, before the module arguments + while remaining_sql_command.startswith(("--", "/*")): + comment, remaining_sql_command = parse_comment_from_sql_segment(remaining_sql_command) + self.comments.append(comment.rstrip()) + remaining_sql_command = remaining_sql_command.lstrip() + + # Declare the module arguments + self.module_arguments = [] + + """ + + At this point the remaining portion of the SQL command should be in the form of "( module-argument, ... )". + + """ + + # The first thing is to get the closing parenthesis index to the module arguments + closing_parenthesis_index = get_index_of_closing_parenthesis(remaining_sql_command) + + # Declare the arguments to be the "(...)" section + arguments = remaining_sql_command[:closing_parenthesis_index + 1] + + # Double check the module arguments has a beginning opening parenthesis and ends with a closing parenthesis + if arguments.find("(") != 0 or arguments.rfind(")") != len(arguments) - 1: + log_message = "The arguments are not surrounded by parenthesis as expected for table row with name: {}" \ + "and sql: {} with arguments: {}." + log_message = log_message.format(self.name, self.sql, arguments) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + # Remove the beginning and ending parenthesis and left strip the string in case single whitespace characters + # appear directly after the opening parenthesis and set it back to the definitions. 
The characters before + # the ending parenthesis are allowed since there could be a "\n" character corresponding to a "--" comment. + + """ + + The next step here is to parse and strip the module arguments and continue parsing: + The next step here is the strip the arguments of the parenthesis and continue parsing the module arguments: + arguments = arguments[1:len(arguments) - 1].lstrip() + ... + + Support for virtual table modules and module arguments is not yet implemented. + + """ + + """ + + At this point we have the SQL down to the remaining module arguments. Since the module arguments are different + depending on the module, many use cases will need to be investigated and addressed. For now a warning is + thrown that a virtual table was found. + + """ + + log_message = "Virtual table name: {} was found with module name: {} and sql: {}. Virtual table modules are " \ + "not fully implemented." + log_message = log_message.format(self.name, self.module_name, self.sql) + logger.warn(log_message) + warn(log_message, RuntimeWarning) + + """ + + The last thing to do is make sure there is nothing remaining in the SQL after the closing parenthesis of the + module arguments. + + Note: Similarly, like the create table statement, any comments placed after the module name (when there are no + module arguments), or the module arguments, are ignored by SQLite. + + """ + + # Last get the remaining sql command to check for the "without rowid" use case + remaining_sql_command = remaining_sql_command[closing_parenthesis_index + 1:].lstrip() + + # See if the remaining sql command has any content left + if len(remaining_sql_command) != 0: + log_message = "Additional content found in virtual table sql after module arguments in table row" \ + "with name: {} found with module name: {} and sql: {}." + log_message = log_message.format(self.name, self.module_name, self.sql) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + def stringify(self, padding="", print_record_columns=True, print_module_arguments=True): + string = "\n" \ + + padding + "Module Name: {}\n" \ + + padding + "Module Arguments Length: {}" + string = string.format(self.module_name, + len(self.module_arguments)) + string = super(VirtualTableRow, self).stringify(padding, print_record_columns) + string + if print_module_arguments: + for module_argument in self.module_arguments: + string += "\n" \ + + padding + "Module Argument:\n{}".format(module_argument.stringify(padding + "\t")) + return string + + +class IndexRow(MasterSchemaRow): + + def __init__(self, version_interface, b_tree_table_leaf_page_number, + b_tree_table_leaf_cell, record_columns, tables): + + super(IndexRow, self).__init__(version_interface, b_tree_table_leaf_page_number, + b_tree_table_leaf_cell, record_columns) + + # Initialize the logger + logger = getLogger(LOGGER_NAME) + + # Make sure this is the index row type after initialized by it's superclass + if self.row_type != MASTER_SCHEMA_ROW_TYPE.INDEX: + log_message = "Invalid row type: {} when expecting: {} with name: {}." + log_message = log_message.format(self.row_type, MASTER_SCHEMA_ROW_TYPE.INDEX, self.name) + logger.error(log_message) + raise ValueError(log_message) + + """ + + Three boolean fields are declared below: + + 1.) internal_schema_object: + An internal schema object is if the index is created by SQLite implicitly through the create table statement + such as a primary key or unique constraint. + + 2.) 
unique + If the index is not an internal schema object then it is either a regular index or a unique index. The unique + index only enforces that duplicates are not allowed. + + Note: NULL values are considered unique to each other in SQLite, therefore there may be multiple NULL values + in any index including unique indexes. + + 3.) partial_index: + An index where the WHERE clause is found is a partial index. In ordinary indexes, there is exactly one entry + in the index for every row in the table but in partial indexes only some subset of the rows in the table have + corresponding index entries. For example where a value is not null resulting in a index where only non-null + values have the index over them. + + """ + + self.internal_schema_object = False + self.unique = False + self.partial_index = False + + # Check if this index is an internal schema object + if self.name.startswith(INTERNAL_SCHEMA_OBJECT_PREFIX): + self.internal_schema_object = True + + """ + + Note: Currently the only internal schema objects for indexes begin with "sqlite_autoindex_" according + to SQLite documentation from version 3.9.2. Therefore, if any index starts with "sqlite_" but + without the following "autoindex_" portion, an error will be raised. + + """ + + if self.internal_schema_object and not self.name.startswith(INTERNAL_SCHEMA_OBJECT_INDEX_PREFIX): + log_message = "Internal schema object detected but invalid prefix for index row with name: {}." + log_message = log_message.format(self.name) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + """ + + If this index is an internal schema object index, then it will have no SQL. + + """ + + if self.internal_schema_object and self.sql: + log_message = "Internal schema object detected for index row with name: {} but found sql: {}." + log_message = log_message.format(self.name, self.sql) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + if not self.internal_schema_object and not self.sql: + log_message = "Index row with name: {} found with no sql and is not an internal schema object." + log_message = log_message.format(self.name) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + # Make sure the table name this index refers to is in the tables and retrieve that table row. + if self.table_name not in tables: + log_message = "Index row with name: {} and table name: {} has not correlating table in the tables." + log_message = log_message.format(self.name, self.table_name) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + table_row = tables[self.table_name] + + if table_row.without_row_id: + log_message = "Index row with name: {} and table name: {} was found to rely on a table without a row id." + log_message = log_message.format(self.name, self.table_name) + logger.warn(log_message) + warn(log_message, RuntimeWarning) + + """ + + Since internal schema object do not have SQL, we need to handle internal schema object differently. + Internal schema objects need to have their names parsed rather than SQL. For index internal schema objects, + the name are of the form "sqlite_autoindex_TABLE_N" where table is the table name they refer to (this should + also match the table name) and N is the index of the primary or unique constraint as defined in the schema. + + Note: The INTEGER PRIMARY KEY does not get an index. 
For older versions of SQLite it would get a + "sqlite_sequence" table created for it if it did not already exist, but this is no longer done unless + the AUTOINCREMENT clause is added which is not recommended per the SQLite documentation. However, + it has been noticed that there are cases where there may be a INTEGER PRIMARY KEY UNIQUE clause on a + column which would cause a unique index internal schema object to be made. This could be confusing + since the naming nomenclature would be the same for either primary key or unique and may at first + appear to be a created primary key index internal schema object. + + Note: Index internal schema objects are created as side affects to create table statements. A index internal + schema object can not be created outside the create table statement. + + """ + + if self.internal_schema_object: + + """ + + Note: An index internal schema object will not be a partial index but may be unique depending on the + clause that created it from the create table statement. + + """ + + """ + + Until the index internal schema objects are fully implemented, we will throw a warning here. The index + internal schema objects are only made on primary key or unique constraints created in the table according + to current documentation as of SQLite 3.9.2. These names are in teh form of "sqlite_autoindex_TABLE_N" + where TABLE is the table name the auto index belongs to (which should also be mirrored in the table name) + and N is the counter for where it appears in the create statement. + + """ + + log_message = "A index internal schema object found in index row with name: {} " \ + "and sql: {}. This is not fully implemented and may cause issues with index pages." + log_message = log_message.format(self.name, self.sql) + logger.warn(log_message) + warn(log_message, RuntimeWarning) + + else: + + # Retrieve the sql command to this table and replace all multiple spaces with a single space + sql_command = sub("[\t\r\f\v ][\t\r\f\v ]+", " ", self.sql) + + """ + + At the beginning of the create index statement there can be two use cases to account for: + 1.) CREATE INDEX [INDEX_NAME] ... + 2.) CREATE UNIQUE INDEX [INDEX_NAME] ... + + The spacing and capitalization will always match one of the two the create [...] index statements above due + to the way SQLite works with the SQL. (Also, see documentation in the MasterSchemaRow class.) + + The unique only means that the index is unique and there may not be more than one index in this set that + is equivalent. Keep in mind NULL values considered unique to each other in SQLite. This use case does + not concern us since we are merely parsing the data, creating signatures, carving data, etc. We are not + adding to the index here and therefore this is nothing more than informative for us. However, it may be + helpful to keep in mind in the future for trying to rebuild carved entries in some way. + + """ + + if sql_command.startswith(CREATE_INDEX_CLAUSE): + + # Set the create command offset to point to the end of the "create index" statement + create_command_offset = len(CREATE_INDEX_CLAUSE) + + elif sql_command.startswith(CREATE_UNIQUE_INDEX_CLAUSE): + + self.unique = True + + # Set the create command offset to point to the end of the "create unique index" statement + create_command_offset = len(CREATE_UNIQUE_INDEX_CLAUSE) + + else: + log_message = "Invalid sql for create index statement: {} with name: {}." 
+ log_message = log_message.format(self.sql, self.name) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + if not create_command_offset: + log_message = "The create command offset was not set while parsing sql for index row name: {} " \ + "and sql: {}." + log_message = log_message.format(self.name, self.sql) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + """ + + We take off the "create [unique] index" beginning portion of the command here leaving the index name next. + At this point we have the create index command in the following structure: + + [INDEX_NAME] ON [TABLE_NAME] ( [INDEXED_COLUMN], ... ) [WHERE [EXPR]] + + Note: An INDEXED_COLUMN (specified above) can be either a column-name or expr that may be followed by + either a COLLATE command or ASC/DESC command (or both). + + Note: Capitalization of commands does not matter and checks on exact string commands need to take into + account case insensitivity. + + Note: Following the index name, comments may appear from that point after in the index SQL. + + """ + + # Strip off the "create [unique] index" command from the beginning of the create index statement + remaining_sql_command = str(sql_command[create_command_offset + 1:]) + + # Get the index name and remaining sql + index_name, remaining_sql_command = \ + MasterSchemaRow._get_master_schema_row_name_and_remaining_sql(self.row_type, self.name, + self.sql, remaining_sql_command) + + # Left strip the remaining sql command + remaining_sql_command = remaining_sql_command.lstrip() + + # Check if this remaining sql statement starts with "ON" + if remaining_sql_command[:len(INDEX_ON_COMMAND)].upper() != INDEX_ON_COMMAND: + log_message = "Create index statement does not have a \"ON\" clause for master schema " \ + "index row with name: {} and sql: {}." + log_message = log_message.format(self.name, self.sql) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + # Remove the using prefix and strip any whitespace from the beginning + remaining_sql_command = remaining_sql_command[len(INDEX_ON_COMMAND):].lstrip() + + # Get the table name and remaining sql + table_name, remaining_sql_command = \ + MasterSchemaRow._get_master_schema_row_name_and_remaining_sql(self.row_type, self.name, + self.sql, remaining_sql_command) + + # Left strip the remaining sql command + remaining_sql_command = remaining_sql_command.lstrip() + + # Check the index name is equal to the name as specified in the sqlite documentation + if index_name.lower() != self.name.lower(): + log_message = "For index master schema row: {}, the index name: {} does not match the derived index" \ + "name: {} from the sql: {}." + log_message = log_message.format(self.row_id, self.name, index_name, self.sql) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + # Check the table name is equal to the index table name as specified in the sqlite documentation + if table_name.lower() != self.table_name.lower(): + log_message = "For index master schema row: {}, the table name: {} does not match the derived table " \ + "name: {} from the sql: {}." + log_message = log_message.format(self.row_id, self.table_name, table_name, self.sql) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + """ + + Note: Since we already checked above that the table name was in the table master schema entries sent in, + we do not check again here. 
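
            Note: As an illustration only (a hypothetical schema, not taken from any real database), an index row
                  whose sql is "CREATE UNIQUE INDEX example_idx ON example_table (value)" would at this point have
                  been parsed into:

                      index_name = "example_idx"
                      table_name = "example_table"
                      remaining_sql_command = "(value)"

                  with the unique flag set to True.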
+ + """ + + """ + + The remaining SQL command must now either start with an opening parenthesis "(", or a comment indicator. + Comment indicators would be either the "--" or "/*" character sequences. + + """ + + # Check for comments after the index name, before the indexed columns + while remaining_sql_command.startswith(("--", "/*")): + comment, remaining_sql_command = parse_comment_from_sql_segment(remaining_sql_command) + self.comments.append(comment.rstrip()) + remaining_sql_command = remaining_sql_command.lstrip() + + # The first thing to be done is get the closing parenthesis index to the indexed columns + closing_parenthesis_index = get_index_of_closing_parenthesis(remaining_sql_command) + + # Declare the indexed columns to be the "( [INDEXED_COLUMN], ... )" explained above + indexed_columns = remaining_sql_command[:closing_parenthesis_index + 1] + + # Double check the indexed columns has a beginning opening parenthesis and ends with a closing parenthesis. + if indexed_columns.find("(") != 0 or indexed_columns.rfind(")") != len(indexed_columns) - 1: + log_message = "The indexed columns are not surrounded by parenthesis as expected for index row with" \ + "name: {} and sql: {} with definitions: {}." + log_message = log_message.format(self.name, self.sql, indexed_columns) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + # Remove the beginning and ending parenthesis and left strip the string in case single whitespace characters + # appear directly after the opening parenthesis and set it back to the index columns. The characters before + # the ending parenthesis are allowed since there could be a "\n" character corresponding to a "--" comment. + + """ + + The next step here is to parse and left strip the indexed columns and continue parsing: + indexed_columns = indexed_columns[1:len(indexed_columns) - 1].lstrip() + ... + + Support for indexed columns has not been implemented yet. + + """ + + """ + + Lastly, if there is remaining SQL, we check to make sure it is a "WHERE" statement. If it is not, + then an exception will be thrown since that is the only use case allowed here according to the SQLite + documentation. + + """ + + # Last get the remaining sql command to check for the "where" use case + remaining_sql_command = remaining_sql_command[closing_parenthesis_index + 1:].lstrip() + + """ + + The create index statements work differently than the create table statements in respect to comments + and the clauses after the column definitions/indexed columns. In a create table statement, any comments + after the end of the column definitions is ignored by SQLite unless the "without rowid" clause is + stated which then recognizes comments before, in between, and after the clause. + + For create index statements, comments are not ignored by SQLite no matter if the "where" clause + is specified after the indexed columns or not. Therefore, if the remaining SQL command has any + more content, it may either be a comment, a "where" clause, or both. 
+ + """ + + # Check for comments after the end of the index columns + while remaining_sql_command.startswith(("--", "/*")): + comment, remaining_sql_command = parse_comment_from_sql_segment(remaining_sql_command) + self.comments.append(comment.rstrip()) + remaining_sql_command = remaining_sql_command.lstrip() + + # See if the remaining sql command has any content left + if len(remaining_sql_command) != 0: + + """ + + Since we removed any previous comments above, if we still have content at this point, we know that the + only allowed use case in this scenario is to have the "where" statement next in the SQL. + + Note: The "where" clause may be mixed-case. + + """ + + # Check if this remaining sql statement starts with "WHERE" + if remaining_sql_command[:len(INDEX_WHERE_CLAUSE)].upper() != INDEX_WHERE_CLAUSE: + log_message = "Create virtual table statement does not have a \"WHERE\" clause for master schema " \ + "index row with name: {} and sql: {} when expected." + log_message = log_message.format(self.name, self.sql) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + # Set the partial index flag and the where expression + self.partial_index = True + + # Check for comments after the where clause + while remaining_sql_command.startswith(("--", "/*")): + comment, remaining_sql_command = parse_comment_from_sql_segment(remaining_sql_command) + self.comments.append(comment.rstrip()) + remaining_sql_command = remaining_sql_command.lstrip() + + """ + + The next step here is to parse the "WHERE" clause: + remaining_sql_command = remaining_sql_command[len(INDEX_WHERE_CLAUSE):].lstrip() + ... + + Support for partial indexes has not been implemented yet. + + """ + + """ + + Until the partial index is fully implemented, we will throw a warning here. Partial indexes are only + made on a subset of rows depending on the "WHERE" clause which would need to be parsed to be exact. + + """ + + if self.partial_index: + log_message = "A index specified as a partial index was found in index row with name: {} " \ + "and sql: {}. This use case is not fully implemented." + log_message = log_message.format(self.name, self.sql) + logger.warn(log_message) + warn(log_message, RuntimeWarning) + + def stringify(self, padding="", print_record_columns=True): + string = "\n" \ + + padding + "Internal Schema Object: {}\n" \ + + padding + "Unique: {}\n" \ + + padding + "Partial Index: {}" + string = string.format(self.internal_schema_object, + self.unique, + self.partial_index) + string = super(IndexRow, self).stringify(padding, print_record_columns) + string + return string + + +class ViewRow(MasterSchemaRow): + + def __init__(self, version_interface, b_tree_table_leaf_page_number, + b_tree_table_leaf_cell, record_columns, tables): + + super(ViewRow, self).__init__(version_interface, b_tree_table_leaf_page_number, + b_tree_table_leaf_cell, record_columns) + + logger = getLogger(LOGGER_NAME) + + if self.row_type != MASTER_SCHEMA_ROW_TYPE.VIEW: + log_message = "Invalid row type: {} when expecting: {} with name: {}." 
+ log_message = log_message.format(self.row_type, MASTER_SCHEMA_ROW_TYPE.VIEW, self.name) + logger.error(log_message) + raise ValueError(log_message) + + +class TriggerRow(MasterSchemaRow): + + def __init__(self, version_interface, b_tree_table_leaf_page_number, + b_tree_table_leaf_cell, record_columns, tables, views): + + super(TriggerRow, self).__init__(version_interface, b_tree_table_leaf_page_number, + b_tree_table_leaf_cell, record_columns) + + logger = getLogger(LOGGER_NAME) + + if self.row_type != MASTER_SCHEMA_ROW_TYPE.TRIGGER: + log_message = "Invalid row type: {} when expecting: {} with name: {}." + log_message = log_message.format(self.row_type, MASTER_SCHEMA_ROW_TYPE.TRIGGER, self.name) + logger.error(log_message) + raise ValueError(log_message) diff --git a/sqlite_dissect/file/schema/table.py b/sqlite_dissect/file/schema/table.py new file mode 100644 index 0000000..ba5b1c1 --- /dev/null +++ b/sqlite_dissect/file/schema/table.py @@ -0,0 +1,47 @@ +from logging import getLogger +from re import sub +from sqlite_dissect.constants import LOGGER_NAME +from sqlite_dissect.exception import MasterSchemaRowParsingError + +""" + +table.py + +This script holds the objects needed for parsing table related objects to the master schema. + +This script holds the following object(s): +TableConstraint(object) + +""" + + +class TableConstraint(object): + + def __init__(self, index, constraint, comments=None): + + logger = getLogger(LOGGER_NAME) + + self.index = index + self.constraint = constraint + + if comments: + for comment in comments: + if not comment.startswith("--") or not comment.startswith("/*"): + log_message = "Comment specified does not start with the schema comment prefix: {}.".format(comment) + logger.error(log_message) + raise MasterSchemaRowParsingError(log_message) + + self.comments = [comment.strip() for comment in comments] if comments else [] + + def __repr__(self): + return self.__str__().encode("hex") + + def __str__(self): + return sub("\t", "", sub("\n", " ", self.stringify())) + + def stringify(self, padding=""): + string = padding + "Index: {}\n" \ + + padding + "Constraint: {}" + for comment in self.comments: + string += "\n" + padding + "Comment: {}".format(comment) + return string.format(self.index, self.constraint) diff --git a/sqlite_dissect/file/schema/utilities.py b/sqlite_dissect/file/schema/utilities.py new file mode 100644 index 0000000..78d1401 --- /dev/null +++ b/sqlite_dissect/file/schema/utilities.py @@ -0,0 +1,187 @@ +from logging import getLogger +from sqlite_dissect.constants import LOGGER_NAME +from sqlite_dissect.exception import MasterSchemaParsingError + +""" + +utilities.py + +This script holds utility functions for dealing with schema specific objects such as parsing comments from sql rather +than more general utility methods. + +This script holds the following function(s): +get_index_of_closing_parenthesis(string, opening_parenthesis_offset=0) +parse_comment_from_sql_segment(sql_segment) + +""" + + +def get_index_of_closing_parenthesis(string, opening_parenthesis_offset=0): + + """ + + + Note: Comments are skipped. + + Note: The string to find the index of the closing parenthesis in requires there to be the opening parenthesis + at the index of the opening parenthesis offset. This can be 0 by default representing the opening + parenthesis at the beginning of the string or a specified index. + + :param string: str The string to find the index of the closing parenthesis in. 
+ :param opening_parenthesis_offset: int The index of the first opening parenthesis. + + :return: + + :raise: + + """ + + logger = getLogger(LOGGER_NAME) + + if string[opening_parenthesis_offset] != "(": + log_message = "The opening parenthesis offset specifies a \"{}\" character and not the " \ + "expected \"(\" in {} with opening parenthesis offset: {}." + log_message = log_message.format(string[opening_parenthesis_offset], string, opening_parenthesis_offset) + logger.error(log_message) + raise ValueError(log_message) + + """ + + We need to find the matching ")" character to the opening of the column definition area. To + do this we search looking for the ")" character but skip one for every matching "(" we find from the + first occurrence. + + We also have to skip all comments indicated by "--" and "/*" and terminated by "\n" and "*/" respectively. + In order to skip comments, we have to flag when we are in a comment. In the case that we find: + 1.) "--" comment: We set the comment_indicator field to 1 and back to 0 once the "\n" is found + 2.) "/*" comment: We set the comment_indicator field to 2 and back to 0 once the "*/" is found + + Note: If we are in a comment already, we ignore other comment indicators. + + """ + + closing_parenthesis_offset = opening_parenthesis_offset + embedded_parentheses = 0 + comment_indicator = 0 + + for index, character in enumerate(string[opening_parenthesis_offset + 1:], opening_parenthesis_offset + 1): + + closing_parenthesis_offset = index + + if comment_indicator: + + if (comment_indicator == 1 and character == '\n') or \ + (comment_indicator == 2 and character == '/' and string[index - 1] == '*'): + comment_indicator = 0 + + else: + + if character is "(": + + embedded_parentheses += 1 + + elif character is ")": + + if embedded_parentheses == 0: + break + else: + embedded_parentheses -= 1 + + elif character is "-": + + """ + + Check to make sure we are encountering a comment. + + Note: A single "-" is allowed since it can be before a negative default value for example in the + create statement. + + """ + + # Check to make sure the full comment indicator was found for "--" + if string[index + 1] == "-": + + # Set the comment indicator + comment_indicator = 1 + + elif character == "/": + + # Check to make sure the full comment indicators were found for "--" and "/*" + if character == "/" and string[index + 1] != "*": + log_message = "Comment indicator '{}' found followed by an invalid secondary comment " \ + "indicator: {} found in {}." + log_message = log_message.format(character, string[index + 1], string) + logger.error(log_message) + raise MasterSchemaParsingError(log_message) + + # Set the comment indicator + comment_indicator = 2 + + # Check to make sure the closing parenthesis was found + if closing_parenthesis_offset == len(string) - 1 and string[closing_parenthesis_offset] != ")": + log_message = "The closing parenthesis was not found in {} with opening parenthesis offset: {}." + log_message = log_message.format(string, opening_parenthesis_offset) + logger.error(log_message) + raise MasterSchemaParsingError(log_message) + + return closing_parenthesis_offset + + +def parse_comment_from_sql_segment(sql_segment): + + """ + + This function will parse out the comment from the sql_segment. This function assumes that a comment + was already detected and needs to be parsed and therefore the sql_segment should start with: + 1.) -- + 2.) /* + + If the sql_segment does not start with either, then an exception will be raised. 
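    For instance (hypothetical input shown only for illustration), passing the segment "-- note\n) WHERE x > 0" is
    expected to return the tuple ("-- note\n", ") WHERE x > 0").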
If a comment is found then the comment will be parsed out and returned along with the remaining sql.  Only the
    first comment will be stripped and returned in the case that there are multiple comments within the supplied
    sql_segment.

    If either of the two use cases above is found, then it will be parsed in the following manner:
    1.) --: The comment will be parsed from the "--" until the newline "\n" character is found:
            ... [-- ... \n] ...
    2.) /*: The comment will be parsed from the "/*" until the matching "*/" character sequence is found:
            ... [/* ... */] ...
            Note: The "/* ... */" comment tags can have new lines within them.

    Note: The returned comment will include the "--" or the "/*" and "*/" strings.  If the comment was started with
          the "--" comment indicator, the ending '\n' character is included in the comment string.  It is up to the
          caller to call rstrip() or a similar operation if needed.

    Note: The returned remaining_sql_segment will not have strip() called on it.

    :param sql_segment: str  The sql segment to parse the leading comment from.

    :return: tuple(comment, remaining_sql_segment)

    :raise: MasterSchemaParsingError

    """

    logger = getLogger(LOGGER_NAME)

    # Check if the sql segment starts with "--"
    if sql_segment.startswith("--"):

        comment = sql_segment[:sql_segment.index('\n') + 1]
        remaining_sql_segment = sql_segment[sql_segment.index('\n') + 1:]

        return comment, remaining_sql_segment

    # Check if the sql segment starts with "/*"
    elif sql_segment.startswith("/*"):

        comment = sql_segment[:sql_segment.index("*/") + 2]
        remaining_sql_segment = sql_segment[sql_segment.index("*/") + 2:]

        return comment, remaining_sql_segment

    # The remaining sql command does not start with "--" or "/*" as expected
    else:
        log_message = "The sql segment: {} did not start with the expected \"--\" or \"/*\" strings."
        log_message = log_message.format(sql_segment)
        logger.error(log_message)
        raise MasterSchemaParsingError(log_message)
diff --git a/sqlite_dissect/file/utilities.py b/sqlite_dissect/file/utilities.py
new file mode 100644
index 0000000..5c38524
--- /dev/null
+++ b/sqlite_dissect/file/utilities.py
@@ -0,0 +1,29 @@
from sqlite_dissect.file.wal.commit_record import WriteAheadLogCommitRecord

"""

utilities.py

This script holds utility functions for dealing with the version classes rather than more general utility methods.
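
As a usage sketch (assuming a fully constructed VersionHistory object, whose versions dictionary this module's
function iterates over), the validation can be used as a simple sanity check after parsing:

    if not validate_page_version_history(version_history):
        raise AssertionError("Page version bookkeeping is inconsistent across the parsed versions.")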
+ +This script holds the following function(s): +validate_page_version_history(version_history) + +""" + + +def validate_page_version_history(version_history): + for version_number, version in version_history.versions.iteritems(): + for page_number, page in version.pages.iteritems(): + if page.page_version_number != version.page_version_index[page.number]: + return False + if page.version_number != version.version_number: + return False + if isinstance(version, WriteAheadLogCommitRecord): + if page_number in version.updated_page_numbers: + page_frame_index = version.page_frame_index + page_frame = page_frame_index[page.number] + actual_page_frame = version.frames[page.number].frame_number + if page_frame != actual_page_frame: + return False + return True diff --git a/sqlite_dissect/file/version.py b/sqlite_dissect/file/version.py new file mode 100644 index 0000000..684caf3 --- /dev/null +++ b/sqlite_dissect/file/version.py @@ -0,0 +1,388 @@ +from abc import ABCMeta +from abc import abstractmethod +from binascii import hexlify +from logging import getLogger +from re import sub +from sqlite_dissect.constants import INDEX_INTERIOR_PAGE_HEX_ID +from sqlite_dissect.constants import INDEX_LEAF_PAGE_HEX_ID +from sqlite_dissect.constants import LOGGER_NAME +from sqlite_dissect.constants import MASTER_PAGE_HEX_ID +from sqlite_dissect.constants import PAGE_TYPE +from sqlite_dissect.constants import PAGE_TYPE_LENGTH +from sqlite_dissect.constants import SQLITE_DATABASE_HEADER_LENGTH +from sqlite_dissect.constants import SQLITE_MASTER_SCHEMA_ROOT_PAGE +from sqlite_dissect.constants import TABLE_INTERIOR_PAGE_HEX_ID +from sqlite_dissect.constants import TABLE_LEAF_PAGE_HEX_ID +from sqlite_dissect.exception import VersionParsingError +from sqlite_dissect.file.database.header import DatabaseHeader +from sqlite_dissect.file.database.page import IndexInteriorPage +from sqlite_dissect.file.database.page import IndexLeafPage +from sqlite_dissect.file.database.page import TableInteriorPage +from sqlite_dissect.file.database.page import TableLeafPage +from sqlite_dissect.file.database.utilities import get_pages_from_b_tree_page +from sqlite_dissect.file.schema.master import MasterSchema + +""" + +version.py + +This script holds the superclass objects used for parsing the database and write ahead log. + +This script holds the following object(s): +Version(object) + +""" + + +class Version(object): + + __metaclass__ = ABCMeta + + def __init__(self, file_handle, version_number, store_in_memory, strict_format_checking): + + self._logger = getLogger(LOGGER_NAME) + + self.file_handle = file_handle + self.version_number = version_number + self.store_in_memory = store_in_memory + self.strict_format_checking = strict_format_checking + self.page_size = self.file_handle.header.page_size + + self._database_header = None + self.database_size_in_pages = None + + self._root_page = None + + self.first_freelist_trunk_page = None + self.freelist_page_numbers = None + self.pointer_map_pages = None + self.pointer_map_page_numbers = None + + self._master_schema = None + + self.updated_page_numbers = None + self.page_version_index = None + + """ + + The _pages variable is only for the use case that the pages are requested to be stored in memory. + + """ + + self._pages = None + + """ + + The following variables are to track across the versions (database and wal commit records) what portions of the + file are changed. 
+ + For the Database: + + The database header, root b-tree page, and master schema will be set to True since these objects are always + considered modified in the database file. As a side note, the master schema could not have any entries if there + was no schema but is still considered modified. + + The freelist pages modified flag will be set to True if freelist pages exist, otherwise False since there are no + freelist pages. + + The pointer map pages modified flag will be set to True if the largest b-tree root page is set in the header + indicating that auto-vacuuming is turned on. Otherwise, if this field is 0, auto-vacuuming is turned off and + the pointer map pages modified flag will be set to False. As a side note, if this is set to False, then it + will continue to be False throughout all following versions since the auto-vacuuming must be set before the + schema creation and cannot be turned off if enabled, or turned on if not enabled initially. (Switching between + full (0) and incremental (1) auto-vacuuming modes is allowed.) + + The updated b-tree page numbers array are all the schema root page numbers including all pages of the b-tree. + These will represent all of the b-tree and overflow pages (excluding the master schema related pages) updated. + All of the b-tree pages for the database will be included in this array. + + For the WriteAheadLogCommitRecord: + + The database header will be set to True if the database header was updated. This should always occur if any + change was made to the root page (although the root page may be in the commit record with no changes). + + The root b-tree page modified flag will be set if the content on the root b-tree portion (not including the + database header) is modified. This will also result in teh master schema modified flag being set to True. + However, the inverse is not true as described next. + + The master schema modified flag will be set if the master schema is updated. This includes any of the master + schema pages being updated in the b-tree. If the master schema updated pages did not include the sqlite master + schema root page (1). then the master schema modified flag will still be set to True, but the root b-tree page + modified flag will be False. + + The freelist pages modified and pointer map pages flags will be set to True when the freelist pages are updated + in any way. + + The updated b-tree page numbers array are all the schema root page numbers including all pages of the b-tree. + These will represent all of the b-tree and overflow pages (excluding the master schema related pages) updated. + Only the b-tree pages for the wal commit record that were updated will be included in this array. + + Note: The database header modified flag and the root b-tree page modified tags refer to different areas of the + sqlite root page. The database header may be modified without the root b-tree page being modified. + However, if the root b-tree page is modified, then the header should always be modified since the header + contains a change counter that is incremented whenever changes to the database are done. + + Note: The following variables below specify if the root b-tree page was modified and if the master + schema was modified. Although the master schema is on the database root b-tree page, the + master schema changes may not be directly on the root page itself. 
Therefore, the master schema + may be modified without modifying the root page but if the root b-tree page is modified, then the + master schema modified flag will always be set. + + """ + + self.database_header_modified = False + self.root_b_tree_page_modified = False + self.master_schema_modified = False + self.freelist_pages_modified = False + self.pointer_map_pages_modified = False + + self.updated_b_tree_page_numbers = None + + def __repr__(self): + return self.__str__().encode("hex") + + def __str__(self): + return sub("\t", "", sub("\n", " ", self.stringify())) + + def stringify(self, padding="", print_pages=True, print_schema=True): + string = padding + "File Type: {}\n" \ + + padding + "Version Number: {}\n" \ + + padding + "Store in Memory: {}\n" \ + + padding + "Strict Format Checking: {}\n" \ + + padding + "Page Size: {}\n" \ + + padding + "File Handle:\n{}\n" \ + + padding + "Database Header:\n{}\n" \ + + padding + "Database Size in Pages: {}\n" \ + + padding + "Freelist Page Numbers: {}\n" \ + + padding + "Pointer Map Page Numbers: {}\n" \ + + padding + "Updated Page Numbers: {}\n" \ + + padding + "Page Version Index: {}\n" \ + + padding + "Database Header Modified: {}\n" \ + + padding + "Root B-Tree Page Modified: {}\n" \ + + padding + "Master Schema Modified: {}\n" \ + + padding + "Freelist Pages Modified: {}\n" \ + + padding + "Pointer Map Pages Modified: {}\n" \ + + padding + "Updated B-Tree Page Numbers: {}" + string = string.format(self.file_type, + self.version_number, + self.store_in_memory, + self.strict_format_checking, + self.page_size, + self.file_handle.stringify(padding + "\t"), + self.database_header.stringify(padding + "\t"), + self.database_size_in_pages, + self.freelist_page_numbers, + self.pointer_map_page_numbers, + self.updated_page_numbers, + self.page_version_index, + self.database_header_modified, + self.root_b_tree_page_modified, + self.master_schema_modified, + self.freelist_pages_modified, + self.pointer_map_pages_modified, + self.updated_b_tree_page_numbers) + if print_pages: + for page in self.pages.itervalues(): + string += "\n" + padding + "Page:\n{}".format(page.stringify(padding + "\t")) + if print_schema: + string += "\n" \ + + padding + "Master Schema:\n{}".format(self.master_schema.stringify(padding + "\t", print_pages)) + return string + + @property + def file_type(self): + return self.file_handle.file_type + + @property + def database_text_encoding(self): + return self.file_handle.database_text_encoding + + @database_text_encoding.setter + def database_text_encoding(self, database_text_encoding): + self.file_handle.database_text_encoding = database_text_encoding + + @property + def database_header(self): + if not self._database_header: + return DatabaseHeader(self.get_page_data(SQLITE_MASTER_SCHEMA_ROOT_PAGE)[:SQLITE_DATABASE_HEADER_LENGTH]) + return self._database_header + + @property + def root_page(self): + if not self._root_page: + return self.get_b_tree_root_page(SQLITE_MASTER_SCHEMA_ROOT_PAGE) + return self._root_page + + @property + def master_schema(self): + if not self._master_schema: + return MasterSchema(self, self.root_page) + return self._master_schema + + @property + def pages(self): + + # Return the pages if they are being stored in memory and already parsed + if self._pages: + return self._pages + + pages = {} + + # Populate the freelist pages into the pages dictionary + freelist_trunk_page = self.first_freelist_trunk_page + while freelist_trunk_page: + pages[freelist_trunk_page.number] = freelist_trunk_page + for 
freelist_leaf_page in freelist_trunk_page.freelist_leaf_pages: + pages[freelist_leaf_page.number] = freelist_leaf_page + freelist_trunk_page = freelist_trunk_page.next_freelist_trunk_page + + # Populate the pointer map pages into the pages dictionary + for pointer_map_page in self.pointer_map_pages: + pages[pointer_map_page.number] = pointer_map_page + + """ + + Since the WAL commit record may not have the master schema parsed and needs to parse it, we store the master + schema to a variable so it is only parsed once, if need be. + + """ + + master_schema = self.master_schema + + # Populate the master schema page into the pages dictionary including the root page + for master_schema_page in master_schema.master_schema_pages: + pages[master_schema_page.number] = master_schema_page + + # Populate the b-trees from the master schema including the root page + for b_tree_root_page_number in master_schema.master_schema_b_tree_root_page_numbers: + b_tree_root_page = self.get_b_tree_root_page(b_tree_root_page_number) + for b_tree_page in get_pages_from_b_tree_page(b_tree_root_page): + pages[b_tree_page.number] = b_tree_page + + # Set the number of pages that were found + number_of_pages = len(pages) + + if number_of_pages != self.database_size_in_pages: + log_message = "The number of pages: {} did not match the database size in pages: {} for version: {}." + log_message = log_message.format(number_of_pages, self.database_size_in_pages, self.version_number) + self._logger.error(log_message) + raise VersionParsingError(log_message) + + for page_number in [page_index + 1 for page_index in range(self.database_size_in_pages)]: + if page_number not in pages: + log_message = "Page number: {} was not found in the pages: {} for version: {}." + log_message = log_message.format(page_number, pages.keys(), self.version_number) + self._logger.error(log_message) + raise VersionParsingError(log_message) + + return pages + + @abstractmethod + def get_page_data(self, page_number, offset=0, number_of_bytes=None): + log_message = "The abstract method get_page_data was called directly and is not implemented." + self._logger.error(log_message) + raise NotImplementedError(log_message) + + @abstractmethod + def get_page_offset(self, page_number): + log_message = "The abstract method get_page_offset was called directly and is not implemented." + self._logger.error(log_message) + raise NotImplementedError(log_message) + + def get_b_tree_root_page(self, b_tree_page_number): + + """ + + + + Note: There is no real way of efficiently checking if this page is a root page or not and doesn't really + matter for the purpose of this library. Therefore, any b-tree page requested is considered a root + page in relation to it's position to the b-tree that it is a part of for the purposes of this function. + + :param b_tree_page_number: + + :return: + + """ + + # Return the page if it is already being in memory and already parsed + if self._pages: + + b_tree_root_page = self._pages[b_tree_page_number] + + # Make sure the page is a b-tree page + if b_tree_root_page.page_type not in [PAGE_TYPE.B_TREE_TABLE_INTERIOR, PAGE_TYPE.B_TREE_TABLE_LEAF, + PAGE_TYPE.B_TREE_INDEX_INTERIOR, PAGE_TYPE.B_TREE_INDEX_LEAF]: + log_message = "The b-tree page number: {} is not a b-tree page but instead has a type of: {}." 
+ log_message = log_message.format(b_tree_page_number, b_tree_root_page.page_type) + self._logger.error(log_message) + raise ValueError(log_message) + + # Return the b-tree page + return b_tree_root_page + + page_hex_type = self.get_page_data(b_tree_page_number, 0, PAGE_TYPE_LENGTH) + + if page_hex_type == MASTER_PAGE_HEX_ID: + + # Make sure this is the sqlite master schema root page + if b_tree_page_number != SQLITE_MASTER_SCHEMA_ROOT_PAGE: + log_message = "The b-tree page number: {} contains the master page hex but is not page number: {}." + log_message = log_message.format(b_tree_page_number) + self._logger.error(log_message) + raise VersionParsingError(log_message) + + page_hex_type = self.get_page_data(b_tree_page_number, SQLITE_DATABASE_HEADER_LENGTH, PAGE_TYPE_LENGTH) + + # If this is the sqlite master schema root page then this page has to be a table interior or leaf page + if page_hex_type not in [TABLE_INTERIOR_PAGE_HEX_ID, TABLE_LEAF_PAGE_HEX_ID]: + log_message = "The b-tree page number: {} contains the master page hex but has hex type: {} which " \ + "is not the expected table interior or table leaf page hex." + log_message = log_message.format(b_tree_page_number, hexlify(page_hex_type)) + self._logger.error(log_message) + raise VersionParsingError(log_message) + + # Check if it was a b-tree table interior + if page_hex_type == TABLE_INTERIOR_PAGE_HEX_ID: + + # Create the table interior page + return TableInteriorPage(self, b_tree_page_number) + + # Check if it was a b-tree table leaf + elif page_hex_type == TABLE_LEAF_PAGE_HEX_ID: + + # Create the table leaf page + return TableLeafPage(self, b_tree_page_number) + + # Check if it was a b-tree index interior + elif page_hex_type == INDEX_INTERIOR_PAGE_HEX_ID: + + # Create the table interior page + return IndexInteriorPage(self, b_tree_page_number) + + # Check if it was a b-tree index leaf + elif page_hex_type == INDEX_LEAF_PAGE_HEX_ID: + + # Create the table leaf page + return IndexLeafPage(self, b_tree_page_number) + + # Throw an exception since the type of the b-tree page was not a b-tree hex type + else: + + log_message = "The b-tree page number: {} did not refer to a b-tree page but rather a page of hex type: {}." + log_message = log_message.format(hexlify(page_hex_type)) + self._logger.error(log_message) + raise ValueError(log_message) + + def get_page_version(self, page_number): + + try: + + return self.page_version_index[page_number] + + except KeyError: + + log_message = "The page number: {} was not found in the page version index: {} for version: {}." + log_message = log_message.format(page_number, self.page_version_index, self.version_number) + self._logger.error(log_message) + raise diff --git a/sqlite_dissect/file/version_parser.py b/sqlite_dissect/file/version_parser.py new file mode 100644 index 0000000..c2e23ae --- /dev/null +++ b/sqlite_dissect/file/version_parser.py @@ -0,0 +1,338 @@ +from abc import ABCMeta +from logging import getLogger +from re import sub +from warnings import warn +from sqlite_dissect.constants import BASE_VERSION_NUMBER +from sqlite_dissect.constants import LOGGER_NAME +from sqlite_dissect.constants import MASTER_SCHEMA_ROW_TYPE +from sqlite_dissect.constants import PAGE_TYPE +from sqlite_dissect.exception import VersionParsingError +from sqlite_dissect.file.schema.master import OrdinaryTableRow +from sqlite_dissect.file.schema.master import VirtualTableRow + +""" + +version_parser.py + +This script holds the objects for parsing through the version history for master schema entries. 
This can be used +for retrieving cells (records), carving, signature generation, etc.. + +This script holds the following object(s): +VersionParser(object) + +""" + + +class VersionParser(object): + + __metaclass__ = ABCMeta + + def __init__(self, version_history, master_schema_entry, version_number=None, ending_version_number=None): + + """ + + + + The version history will be iterated through and the respective subclass will use the master schema entry + parsed from every version where that master schema entry is found. The version numbers where the master schema + entry is found until the last version it is found in (if applicable) will be set at the parser starting version + number and parser ending version number. + + In addition, the version number may be set for a specific version to be parsed. This way if you only want a + specific version to be parsed, you can specify the version number. If you want the range between two specific + versions, the version number and ending version number can be specified to parse the versions in between + (including the specified version number and ending version number). If these fields are set the parser + starting and ending version number will be set accordingly to be within the range of these versions, if + existing, otherwise None. If the master schema entry does not exist in between the versions, a warning will + be raised and the subclass will handle the use case accordingly (either by creating and empty object(s) or a + "empty" class depending on implementation). + + The md5_hash_identifier field is used from the master schema entry to identify it across the versions. Due + to this, it does not matter what master schema entry from what version you choose. The md5_hash_identifier + is derived from the row id, name, table name, type, and sql to ensure uniqueness. (Root page numbers can be + updated.) + + Note: The use case where the same master schema entry is removed and re-added needs to be addressed in the wal + file and is not fully supported here. + + :param version_history: + :param master_schema_entry: + :param version_number: + :param ending_version_number: + + :return: + + :raise: + + """ + + logger = getLogger(LOGGER_NAME) + + if version_number is None and ending_version_number: + log_message = "Version number not specified where ending version number was specified as: {} for " \ + "master schema entry with root page number: {} row type: {} name: {} table name: {} " \ + "and sql: {}." + log_message = log_message.format(ending_version_number, master_schema_entry.root_page_number, + master_schema_entry.row_type, master_schema_entry.name, + master_schema_entry.table_name, master_schema_entry.sql) + logger.error(log_message) + raise ValueError(log_message) + + if version_number is not None and version_number == ending_version_number: + log_message = "Version number: {} specified where ending version number was also specified as: {} for " \ + "master schema entry with root page number: {} row type: {} name: {} table name: {} and " \ + "sql: {}." + log_message = log_message.format(version_number, ending_version_number, + master_schema_entry.root_page_number, master_schema_entry.row_type, + master_schema_entry.name, master_schema_entry.table_name, + master_schema_entry.sql) + logger.error(log_message) + raise ValueError(log_message) + + number_of_versions = version_history.number_of_versions + + """ + + The ending version number needs to be less than the number of versions since version numbers start from + 0 and go to the last version. 
Therefore, the number of versions will be one greater than the last version + number. + + """ + + if ending_version_number is not None and (ending_version_number >= number_of_versions or + ending_version_number <= version_number): + log_message = "Invalid ending version number: {} with {} number of versions with version number: {} for " \ + "master schema entry with root page number: {} row type: {} name: {} table name: {} " \ + "and sql: {}." + log_message = log_message.format(ending_version_number, number_of_versions, version_number, + master_schema_entry.root_page_number, master_schema_entry.row_type, + master_schema_entry.name, master_schema_entry.table_name, + master_schema_entry.sql) + logger.error(log_message) + raise ValueError(log_message) + + self.version_number = version_number + self.ending_version_number = ending_version_number + + self.parser_starting_version_number = version_number if version_number is not None else BASE_VERSION_NUMBER + self.parser_ending_version_number = ending_version_number \ + if ending_version_number is not None else number_of_versions - 1 + + """ + + According to the sqlite documentation the only pages with a root page are table and index types (excluding + virtual tables.) Therefore we can only parse cells from these types. In the case that trigger or + view master schema entry row types were specified we raise a warning here. This will result in having a + no entries to parse through. + + Note: Support for virtual table modules that may or may not have database b-tree pages need to be accounted + for. A warning will be displayed if a virtual table is encountered. + + Note: Support for "without rowid" tables are not accounted for properly. For now, a warning will be displayed. + + """ + + if master_schema_entry.row_type not in [MASTER_SCHEMA_ROW_TYPE.TABLE, MASTER_SCHEMA_ROW_TYPE.INDEX]: + log_message = "Invalid master schema entry row type: {} for master schema entry with root page " \ + "number: {} name: {} table name: {} and sql: {}. Only table and index master " \ + "schema entries have associated cells to be parsed." + log_message = log_message.format(master_schema_entry.row_type, master_schema_entry.root_page_number, + master_schema_entry.name, master_schema_entry.table_name, + master_schema_entry.sql) + logger.warn(log_message) + warn(log_message, RuntimeWarning) + + # Set the page type and update it as appropriate + self.page_type = PAGE_TYPE.B_TREE_TABLE_LEAF + + if isinstance(master_schema_entry, VirtualTableRow): + log_message = "A virtual table row type was found for the version parser which is not fully supported " \ + "for master schema entry root page number: {} type: {} name: {} table name: {} and sql: {}." + log_message = log_message.format(master_schema_entry.root_page_number, + master_schema_entry.row_type, master_schema_entry.name, + master_schema_entry.table_name, master_schema_entry.sql) + logger.warn(log_message) + warn(log_message, RuntimeWarning) + + elif isinstance(master_schema_entry, OrdinaryTableRow) and master_schema_entry.without_row_id: + log_message = "A \"without rowid\" table row type was found for the version parser which is not " \ + "supported for master schema entry root page number: {} row type: {} name: {} " \ + "table name: {} and sql: {}. Erroneous cells may be generated." 
+ log_message = log_message.format(master_schema_entry.root_page_number, + master_schema_entry.row_type, master_schema_entry.name, + master_schema_entry.table_name, master_schema_entry.sql) + logger.warn(log_message) + warn(log_message, RuntimeWarning) + + self.page_type = PAGE_TYPE.B_TREE_INDEX_LEAF + + # Set the page type if the master schema row type is a index + if master_schema_entry.row_type == MASTER_SCHEMA_ROW_TYPE.INDEX: + self.page_type = PAGE_TYPE.B_TREE_INDEX_LEAF + + """ + + Set the master schema entry fields we care about in this class. Since root page numbers can be different + depending on versions, root page numbers is a dictionary in the form of: + root_page_number_version_index[VERSION_NUMBER] = ROOT_PAGE_NUMBER(VERSION) + + """ + + self.row_type = master_schema_entry.row_type + self.name = master_schema_entry.name + self.table_name = master_schema_entry.table_name + self.sql = master_schema_entry.sql + self.root_page_number_version_index = {} + + # Get the md5_hash_identifier from the master schema entry + self.master_schema_entry_md5_hash_identifier = master_schema_entry.md5_hash_identifier + + """ + + Setup the version numbers to parse through for the version history. + + Note: If the master schema entry is either not found, or stops being found and then re-found, a warning will + be raised. The master schema entry uniqueness is determined by the master schema entry md5 hash + identifier from the MasterSchemaRow class. + + """ + + versions = version_history.versions + starting_version_number = None + ending_version_number = None + for version_number in range(self.parser_starting_version_number, self.parser_ending_version_number + 1): + + version = versions[version_number] + + if version.master_schema_modified: + master_schema = version.master_schema + else: + master_schema = version.last_master_schema + + if not master_schema: + log_message = "Master schema was unable to be found in starting version number: {} while parsing " \ + "the version history for master schema entry with name: {} table name: {} " \ + "row type: {} and sql: {} for version number: {} and ending version number: {}." + log_message = log_message.format(version_number, self.name, self.table_name, self.row_type, self.sql, + self.parser_starting_version_number, + self.parser_ending_version_number) + logger.error(log_message) + raise VersionParsingError(log_message) + + entries = master_schema.master_schema_entries + entries_dictionary = dict(map(lambda entry: [entry.md5_hash_identifier, entry], entries)) + + if self.master_schema_entry_md5_hash_identifier in entries_dictionary: + + if ending_version_number is None: + + if starting_version_number is not None: + log_message = "The starting version number was set already when it should not have been " \ + "since the ending version number was still not set for master schema entry " \ + "row type: {} with root page number: {} name: {} table name: {} and sql: {}." 
+ log_message = log_message.format(master_schema_entry.row_type, + master_schema_entry.root_page_number, master_schema_entry.name, + master_schema_entry.table_name, master_schema_entry.sql) + logger.error(log_message) + raise VersionParsingError(log_message) + + starting_version_number = version_number + ending_version_number = version_number + + if self.root_page_number_version_index: + log_message = "The root page number version index has already been populated with values " \ + "when it should not have been for master schema entry row type: {} with root " \ + "page number: {} name: {} table name: {} and sql: {}." + log_message = log_message.format(master_schema_entry.row_type, + master_schema_entry.root_page_number, master_schema_entry.name, + master_schema_entry.table_name, master_schema_entry.sql) + logger.error(log_message) + raise VersionParsingError(log_message) + + # Add the first version number and b-tree root page number into the root page number version index + root_page_number = entries_dictionary[self.master_schema_entry_md5_hash_identifier].root_page_number + self.root_page_number_version_index[version_number] = root_page_number + + elif ending_version_number == version_number - 1: + ending_version_number = version_number + + if not self.root_page_number_version_index: + log_message = "The root page number version index has not already been populated with values " \ + "when it should have been for master schema entry row type: {} with root " \ + "page number: {} name: {} table name: {} and sql: {}." + log_message = log_message.format(master_schema_entry.row_type, + master_schema_entry.root_page_number, master_schema_entry.name, + master_schema_entry.table_name, master_schema_entry.sql) + logger.error(log_message) + raise VersionParsingError(log_message) + + # Add the version number and b-tree root page number into the root page number version index + root_page_number = entries_dictionary[self.master_schema_entry_md5_hash_identifier].root_page_number + self.root_page_number_version_index[version_number] = root_page_number + + else: + log_message = "Version number: {} did not have a master schema entry for the previous " \ + "version number for master schema entry with name: {} table name: {} " \ + "row type: {} and sql: {} for version number: {} and ending version number: {}." + log_message = log_message.format(version_number, self.name, self.table_name, self.row_type, + self.sql, self.parser_starting_version_number, + self.parser_ending_version_number) + logger.warn(log_message) + warn(log_message, RuntimeWarning) + + if starting_version_number is None and ending_version_number is None: + log_message = "Was unable to find any matching schema entries between version numbers {} " \ + "and {}. The version parser will not parse anything for master schema entry with " \ + "name: {} table name: {} row type: {} and sql: {}." + log_message = log_message.format(self.parser_starting_version_number, + self.parser_ending_version_number, self.name, self.table_name, + self.row_type, self.sql) + logger.warn(log_message) + warn(log_message, RuntimeWarning) + + self.parser_starting_version_number = starting_version_number + self.parser_ending_version_number = ending_version_number + + """ + + We now have the parser starting and ending version numbers that we need to parse between and a root + page number version index referring to each version and it's root b-tree page in case it was updated. 
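
        As an illustration (hypothetical page numbers only), a table b-tree whose root started on page 2 and was
        moved to page 5 by the third commit record would produce a root page number version index similar to:

            root_page_number_version_index = {0: 2, 1: 2, 2: 2, 3: 5}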
+ + Note: The root pages to the master schema entries are generated on demand from the version which will return + the b-tree page if it is already in memory, or parse it and then return it if it is not. Versions can + either be stored in memory or read out on demand for b-tree pages. This is allowed for conserving + memory and speeding up parsing (so each b-tree page does not need to be parsed in the case where + they do not change). + + """ + + def __repr__(self): + return self.__str__().encode("hex") + + def __str__(self): + return sub("\t", "", sub("\n", " ", self.stringify())) + + def stringify(self, padding=""): + string = padding + "Row Type: {}\n" \ + + padding + "Page Type: {}\n" \ + + padding + "Name: {}\n" \ + + padding + "Table Name: {}\n" \ + + padding + "SQL: {}\n" \ + + padding + "Root Page Number Version Index: {}\n" \ + + padding + "Master Schema Entry MD5 Hash Identifier: {}\n" \ + + padding + "Version Number: {}\n" \ + + padding + "Ending Version Number: {}\n" \ + + padding + "Parser Starting Version Number: {}\n" \ + + padding + "Parser Ending Version Number: {}" + string = string.format(self.row_type, + self.page_type, + self.name, + self.table_name, + self.sql, + self.root_page_number_version_index, + self.master_schema_entry_md5_hash_identifier, + self.version_number, + self.ending_version_number, + self.parser_starting_version_number, + self.parser_ending_version_number) + return string diff --git a/sqlite_dissect/file/wal/README.md b/sqlite_dissect/file/wal/README.md new file mode 100644 index 0000000..26182a0 --- /dev/null +++ b/sqlite_dissect/file/wal/README.md @@ -0,0 +1,130 @@ + +# sqlite_dissect.file.wal + +This package will control parsing and access to the SQLite WAL files. + +- commit_record.py +- frame.py +- header.py +- utilities.py +- wal.py + +TODO items for the "wal" package: + +- [ ] Finish UML class diagrams. + +
+ +### commit_record.py +This script holds the objects used for parsing the write ahead log commit records. + +This script holds the following object(s): +- WriteAheadLogCommitRecord(Version) +

TODO:
- [ ] Documentation improvements.
- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
- [ ] Better exception handling when creating objects, etc.
- [ ] Investigate where a database file has empty space beyond the page size (wal checkpoints were set).
- [ ] Is there a need (or way) to implement this without an initial database (just wal file)?
  ##### WriteAheadLogCommitRecord Class:
  - [ ] Check lists and dictionaries for fields before adding.
  - [ ] Is there a better way to handle pointer map pages (parse on demand)?
  - [ ] Investigate when a set of frames does not have a commit frame. (Warning currently thrown.)
  - [ ] Investigate root pages in commit records with no changes. (Warning currently thrown.)
  - [ ] The incremental vacuum mode can change in the header from 1 to 2 or 2 to 1.
  - [ ] Investigate if the database text encoding/schema format number can change after set.
  - [ ] Investigate if the size in pages can differ on first update if last version < 3.7.0.
+ +### frame.py +This script holds the objects used for parsing the WAL frame. + +> Note: The WriteAheadLogFrame class is not responsible for parsing the page data itself. It is meant to give +> information on the WAL frame and offsets of the page data but in order to parse the page data, the set of all +> page changes to the commit record this frame belongs in is needed. Therefore the commit record class +> (WriteAheadLogCommitRecord) will be responsible for parsing pages. +> +> There was some discussion about the page being stored back in the WriteAheadLogFrame once parsed but it was +> decided that this made little to no difference and should just be retrieved from the commit record. +> +> As a side note, there are some basic things parsed from the page such as the page type. This is only for +> debugging and logging purposes. + +This script holds the following object(s): +-WriteAheadLogFrame(object) +

+ +TODO: +- [ ] Documentation improvements. +- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators. +- [ ] Finish try/except exception handling for struct.error and ord in classes. + ##### WriteAheadLogFrame Class: + - [ ] Are both the frame index and frame number needed? Should the "frame" prefix be removed? + - [ ] Handle exceptions that may be raised from creating the wal frame header. + - [ ] The contains_sqlite_database_header attribute should apply to table b-trees, not all b-trees. + - [ ] Document that the root page is not parsed or contained in the frame and why. + +
+ +### header.py +This script holds the header objects used for parsing the header of the WAL file and WAL frames. + +This script holds the following object(s): +- WriteAheadLogHeader(SQLiteHeader) +- WriteAheadLogFrameHeader(object) +

+ +TODO: +- [ ] Documentation improvements. +- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators. +- [ ] Finish try/except exception handling for struct.error and ord in classes. +- [ ] Implement checking of the salt values. +- [ ] Implement checking of checksums in either big/little endian. +- [ ] Investigate if the big/little endian applies to both checksums in the file header and frame header. +- [ ] Create arrays for salt and checksum values rather than separate variables? They are arrays in the sqlite c code. + ##### WriteAheadLogHeader Class: + - [ ] Investigate use cases where the checkpoint != 0. A warning is thrown currently. + +
+ +### utilities.py +This script holds utility functions for dealing with WAL specific objects such as comparing database header rather +than more general utility methods. + +This script holds the following function(s): +- compare_database_headers(previous_database_header, new_database_header) +

+ +TODO: +- [ ] Documentation improvements. + ##### compare_database_headers Function: + - [ ] The \_\_dict\_\_ also returns class objects that may cause issues. + +
+ +### wal.py +This script holds the WAL objects used for parsing the WAL file. + +This script holds the following object(s): +- WriteAheadLog(object) +

+ +TODO: +- [ ] Documentation improvements. +- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators. + ##### WriteAheadLog Class: + - [ ] Note that this does not extend the version object, instead the commit record does. + - [ ] Handle exceptions that may be raised from creating the wal frame. + - [ ] Check the salts and checksums across the frames to the header. + - [ ] Address the use case of having additional frames past the last committed frame. + - [ ] Update the commit record number when invalid frames are implemented. + - [ ] Implement wal files with invalid frames. + - [ ] Expand on salt 1 and checkpoint referencing documentation and in stringify() functions. + - [ ] Check the last valid frame index matches that in the wal index file (if found). + - [ ] Check the database size in pages in the wal index file (if found) against the last commit record. + - [ ] The file_size arg may not be needed since it is in the file handle and may be removed. + \ No newline at end of file diff --git a/sqlite_dissect/file/wal/__init__.py b/sqlite_dissect/file/wal/__init__.py new file mode 100644 index 0000000..721329e --- /dev/null +++ b/sqlite_dissect/file/wal/__init__.py @@ -0,0 +1,10 @@ + +""" + +__init__.py + +This init script will initialize any needed logic for this package. + +This package will control parsing and access to the SQLite WAL files. + +""" diff --git a/sqlite_dissect/file/wal/commit_record.py b/sqlite_dissect/file/wal/commit_record.py new file mode 100644 index 0000000..b3f1051 --- /dev/null +++ b/sqlite_dissect/file/wal/commit_record.py @@ -0,0 +1,1314 @@ +from copy import copy +from warnings import warn +from sqlite_dissect.constants import DATABASE_HEADER_VERSIONED_FIELDS +from sqlite_dissect.constants import FIRST_FREELIST_TRUNK_PARENT_PAGE_NUMBER +from sqlite_dissect.constants import FIRST_FREELIST_TRUNK_PAGE_INDEX +from sqlite_dissect.constants import SQLITE_DATABASE_HEADER_LENGTH +from sqlite_dissect.constants import SQLITE_MASTER_SCHEMA_ROOT_PAGE +from sqlite_dissect.constants import UTF_8 +from sqlite_dissect.constants import UTF_8_DATABASE_TEXT_ENCODING +from sqlite_dissect.constants import UTF_16BE +from sqlite_dissect.constants import UTF_16BE_DATABASE_TEXT_ENCODING +from sqlite_dissect.constants import UTF_16LE +from sqlite_dissect.constants import UTF_16LE_DATABASE_TEXT_ENCODING +from sqlite_dissect.constants import WAL_FRAME_HEADER_LENGTH +from sqlite_dissect.constants import WAL_HEADER_LENGTH +from sqlite_dissect.exception import WalCommitRecordParsingError +from sqlite_dissect.file.database.header import DatabaseHeader +from sqlite_dissect.file.database.page import FreelistTrunkPage +from sqlite_dissect.file.database.utilities import create_pointer_map_pages +from sqlite_dissect.file.schema.master import MasterSchema +from sqlite_dissect.file.version import Version +from sqlite_dissect.file.wal.utilities import compare_database_headers +from sqlite_dissect.constants import BASE_VERSION_NUMBER +from sqlite_dissect.utilities import get_md5_hash + +""" + +version.py + +This script holds the objects used for parsing the write ahead log commit records. + +This script holds the following object(s): +WriteAheadLogCommitRecord(Version) + +""" + + +class WriteAheadLogCommitRecord(Version): + + """ + + This class extends the Version class and represents a version based on a commit record in the WAL file. 
The + database is not considered "committed" until a frame appears in the WAL file with a size of database in pages field + set declaring it a commit record. The SQLite drivers do not read any information out after the last commit record + (if there is any information). Therefore we structure each set of frames up to a commit record as a commit record + version and parse it as such. + + Due to the way only parts of the commit record are updated, only parts of the SQLite database will be parsed and + stored in this class. For instance, the database header and master schema will only be parsed if they are changed + from the previous version. Otherwise, the last database header and last master schema will be set with the previous + version's for reference. If the database header and/or master schema is modified, then the objects will be parsed. + Also, their respective modified flags will be set. This is to reduce memory and parsing time. + + The idea here is that the database header or master schema should never be needed unless changes were done which + can be checked by their respective modified flags which are set in the version and set to true for the original + database. + + However, in order to support the version class, functions have been put in place that will pull the master schema, + root page, and database header for this version if needed, on demand (unless the "store in memory flag" is set). + + The freelist pages and pointer map pages are always parsed since the overhead to do so is minimal and freelist pages + need to be parsed in order to ensure changes in the pages. + + If the "store in memory" flag is set, the commit record will be fully parsed and stored in memory. This includes + the database header and master schema, regardless of changes, and all pages including b-tree pages. This flag is + defaulted to False rather than True as it is defaulted to in the database class due to the nature of how the commit + records are parsed vs the original database. + + Note: The version number of the first commit record defined must start at 1. The previous version to the first + WAL commit record is 0 and will be the base SQLite database file. + + Note: The following fields will be parsed on demand unless this commit record has specific updated pages with + regards to them (unless the "store in memory" flag is set): + 1.) self._database_header + 2.) self._root_page + + Note: The root page may not be set if the database header is set since the root page refers to the master + schema and not the database header. However, the root page will always be set if the master schema + is set and vice-versa. + + 3.) self._master_schema + + """ + + def __init__(self, version_number, database, write_ahead_log, frames, page_frame_index, page_version_index, + last_database_header, last_master_schema, store_in_memory=False, strict_format_checking=True): + + super(WriteAheadLogCommitRecord, self).__init__(write_ahead_log.file_handle, version_number, + store_in_memory, strict_format_checking) + + """ + + Note: The database is needed to refer to the file handle in order to read page data out of the database file + if the particular page being requested has not been updated in the WAL file frames yet. + + Note: The write ahead log is needed only for the use case of setting the database text encoding if it was + not previously set by the database file (Due to a database file with "no content"). 
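+
+        Note: As a purely hypothetical illustration of the fallback described above (the page numbers are
+              made up), a page that was never written to a WAL frame is read straight from the database
+              file, while a page updated in a frame is read from the WAL file:
+
+                  commit_record.get_page_data(5)    # page 5 never updated in the WAL -> database file handle
+                  commit_record.get_page_data(2)    # page 2 updated in a WAL frame -> read from that frame
+
+              This mirrors the page_version_index lookup performed in get_page_data further below.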
+ + """ + + self._database = database + + for page_version_number in page_version_index.itervalues(): + if page_version_number >= version_number: + log_message = "Page version number: {} is greater than the commit record specified version: {}." + log_message = log_message.format(page_version_number, version_number) + self._logger.error(log_message) + raise WalCommitRecordParsingError(log_message) + + max_version_number_in_page_version_index = max(page_version_index.values()) + if self.version_number != max_version_number_in_page_version_index + 1: + log_message = "Version number: {} is not the next version number from the max version: {} in the page " \ + "version index: {}.." + log_message = log_message.format(version_number, max_version_number_in_page_version_index, + page_version_index) + self._logger.error(log_message) + raise WalCommitRecordParsingError(log_message) + + """ + + Below we declare a boolean value for committed which explains if this commit record was "committed" to the + database. There should be at most one commit record where committed would be false. As the frames are + parsed, if a commit frame is found, the committed flag is set to true. If there are multiple commit frames, + then an exception is thrown since this is not allowed. + + Note: If there are more than one commit frames, then that use case needs to be checked outside of this class. + + Note: As of yet, the use case where there is a set of frames with no commit record has not been seen and + therefore a committed flag will determine if this commit frame was committed to the WAL file or not. + In the creating class (VersionHistory), a warning will be thrown if this use case is detected since it + has not been investigated and handled correctly. + + The committed page size is determined from the commit frame in the frames and may be left as None if this is + the commit record at the end of the file (if it exists) that was not committed and does not have a commit frame. + + The frames variable is a dictionary of page number to frame: + self.frames[FRAME.PAGE_NUMBER] = FRAME + + """ + + self.committed = False + self.committed_page_size = None + self.frames = {} + + # Iterate through the frames + for frame in frames: + + # Make sure the page number to the current frame doesn't already exist in the previous frames + if frame.header.page_number in self.frames: + log_message = "Frame page number: {} found already existing in frame page numbers: {} in version: {}." + log_message = log_message.format(frame.header.page_number, self.frames.keys(), self.version_number) + self._logger.error(log_message) + raise WalCommitRecordParsingError(log_message) + + # Check if the frame is a commit frame + if frame.commit_frame: + + # Make sure this commit frame hasn't already been committed + if self.committed: + log_message = "Frame page number: {} is a commit frame when commit record was already committed " \ + "with frame page numbers: {} in version: {}." + log_message = log_message.format(frame.header.page_number, self.frames.keys(), self.version_number) + self._logger.error(log_message) + raise WalCommitRecordParsingError(log_message) + + # Set the committed flag to true + self.committed = True + + # Make sure the committed page size has not already been set and set it + if self.committed_page_size: + log_message = "Frame page number: {} has a committed page size of: {} when it was already set " \ + "to: {} with frame page numbers: {} in version: {}." 
+ log_message = log_message.format(frame.header.page_number, frame.header.page_size_after_commit, + self.committed_page_size, self.frames.keys(), self.version_number) + self._logger.error(log_message) + raise WalCommitRecordParsingError(log_message) + + self.committed_page_size = frame.header.page_size_after_commit + + # Add this frame to the frames dictionary + self.frames[frame.header.page_number] = frame + + # Set the updated page numbers derived from this commit records frame keys + self.updated_page_numbers = copy(self.frames.keys()) + + log_message = "Commit Record Version: {} has the updated page numbers: {}." + log_message = log_message.format(self.version_number, self.updated_page_numbers) + self._logger.debug(log_message) + + """ + + Here we setup the updated b-tree page numbers. This array will be removed from as we parse through the file + to leave just the b-tree pages of the commit record that were updated at the end. + + """ + + self.updated_b_tree_page_numbers = copy(self.updated_page_numbers) + + self.page_frame_index = dict.copy(page_frame_index) + self.page_version_index = dict.copy(page_version_index) + for updated_page_number in self.updated_page_numbers: + self.page_version_index[updated_page_number] = self.version_number + self.page_frame_index[updated_page_number] = self.frames[updated_page_number].frame_number + + self.database_size_in_pages = self.committed_page_size + + """ + + Check to make sure the page version index length match the database size in pages as it should. + + Note: The database size in pages can and has been found to be wrong in some cases where the database + size in pages is specified where the version valid for number equals the file change counter. It is + still unsure of why this can occur but in the use cases this was seen, the database size in pages was + correct and the file was inflated (padded) with empty space at the end indicating additional pages. + For this reason a warning is thrown instead of an exception (in the case that the version valid for + number equals the file change counter and database e in pages is set). + + This may involve the WAL file and checkpoints as the file referred to above had a checkpoint sequence + number that was not 0. More investigation is needed. + + """ + + if len(self.page_version_index) != self.database_size_in_pages: + log_message = "The page version index of length: {} does not equal the database size in pages: {} " \ + "in version: {} for page version index: {}. Possibly erroneous use cases may occur " \ + "when parsing." + log_message = log_message.format(len(self.page_version_index), self.database_size_in_pages, + self.version_number, self.page_version_index) + self._logger.warn(log_message) + warn(log_message, RuntimeWarning) + + """ + + Initialize the root page and master schema to none. + + Note: These are only initialized if the SQLite master schema root page is in the updated pages and the root + b-tree (not including the header) is updated or the master schema is updated. If the root page is set + the master schema will always be set, and vice-versa. + + """ + + self._root_page = None + self._master_schema = None + + """ + + Here we check to see if the SQLite root page was updated or if any of the master schema pages were + updated since the previous version. This is done by keeping track of the master schema pages (which + will always include the root page SQLITE_MASTER_SCHEMA_ROOT_PAGE (1)) and checking if the new + commit record contains any of these pages in the frame array. 
+ + If the root page is in the frame array that means that either: + a.) The database header was updated and the rest of the root page remained unchanged. + b.) Both the database header and root page were changed. + c.) Neither the database header or root page was changed. + + The most observed case is a.) since the schema itself does not seem to change often but rather the + freelist pages, database size in pages, and other fields found in the database header. + + If any of the non-root master schema pages are in the frame array then the master schema was + updated. The master schema is assumed to be able to be updated without always updating the root + page. However, any change in the master schema should result in the schema cookie being updated + in the database header meaning that there should never be a case where the master schema is updated + without updating the database header. + + First we will check to see if the root page is in this commit record's updated page numbers. If it is, then + we will check the database header md5 against the last database header md5 and the root page only md5 hex + digest against the previous master schema root page root page only md5 hex digest. + + This will tell us if the database header changed, and insight into if the master schema changed. + We will not know 100% if the master schema changed until we check all master schema pages against the updated + pages in this commit record. However, if we did find out that the master schema has changed this last step + is not needed. + + """ + + if SQLITE_MASTER_SCHEMA_ROOT_PAGE in self.updated_page_numbers: + + # Remove it from the updated b-tree pages + self.updated_b_tree_page_numbers.remove(SQLITE_MASTER_SCHEMA_ROOT_PAGE) + + """ + + Note: There is a redundancy here in calculating these md5 hash values but the trade off is to + parse the objects when not needed versus calculating md5s of a small portion of that data. + Keep in mind this only occurs when the SQLite master schema root page is in the updated page numbers. + + """ + + root_page_data = self.get_page_data(SQLITE_MASTER_SCHEMA_ROOT_PAGE) + database_header_md5_hex_digest = get_md5_hash(root_page_data[:SQLITE_DATABASE_HEADER_LENGTH]) + root_page_only_md5_hex_digest = get_md5_hash(root_page_data[SQLITE_DATABASE_HEADER_LENGTH:]) + + if last_database_header.md5_hex_digest != database_header_md5_hex_digest: + self.database_header_modified = True + self._database_header = DatabaseHeader(root_page_data[:SQLITE_DATABASE_HEADER_LENGTH]) + + if self._database_header.md5_hex_digest != database_header_md5_hex_digest: + log_message = "The database header md5 hex digest: {} did not match the previously retrieved " \ + "calculated database header md5 hex digest: {} in commit record version: {} " \ + "on updated pages: {}." + log_message = log_message.format(self._database_header.md5_hex_digest, + database_header_md5_hex_digest, self.version_number, + self.updated_page_numbers) + self._logger.error(log_message) + raise WalCommitRecordParsingError(log_message) + + """ + + Note: The root b-tree page modified flag may be False where the master schema modified flag may be True + depending on if the pages in the master schema updated included the SQLite master schema root + page (1) or not. 
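+
+            Note: These two comparisons work because page 1 is split for hashing purposes: the first
+                  SQLITE_DATABASE_HEADER_LENGTH (100) bytes hold the database header and the remainder
+                  holds the sqlite_master root b-tree content. A rough sketch of the two independent
+                  checks made on page 1 in this block (names abbreviated for illustration only):
+
+                      header_changed = last_database_header.md5_hex_digest != get_md5_hash(page_1[:100])
+                      root_changed = last_root_page_only_md5 != get_md5_hash(page_1[100:])
+
+                  where page_1 and last_root_page_only_md5 stand in for the root page data and the previous
+                  master schema root page's "root page only" md5 hex digest, respectively.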
+ + """ + + if last_master_schema.root_page.header.root_page_only_md5_hex_digest != root_page_only_md5_hex_digest: + self.root_b_tree_page_modified = True + self.master_schema_modified = True + + """ + + The root page may be in the updated page numbers in the WAL commit record even if neither the database + header or the root page itself was modified (ie. the page in general). It is not sure why this occurs + and more research needs to be done into the exact reasoning. One theory is that if pointer map pages + are updated, then the root page is automatically included. This could be a flag in the SQLite source + code that sets the root page to have been modified for instance if the largest b-tree root page number + is updated, but updated to the same number. For this reason, we throw a warning below + + """ + + if not self.database_header_modified and not self.root_b_tree_page_modified: + log_message = "The sqlite database root page was found in version: {} in the updated pages: {} when " \ + "both the database header and the root b-tree page were not modified." + log_message = log_message.format(self.version_number, self.updated_page_numbers) + self._logger.warn(log_message) + warn(log_message, RuntimeWarning) + + if not self.master_schema_modified: + + for last_master_schema_page_number in last_master_schema.master_schema_page_numbers: + + """ + + Since we are removing the use case of the SQLite master schema root page and checking for master + schema modifications on other pages, as long as we find at least one page here, we satisfy our + use case and can break. + + Note: We could argue that we should parse the master schema again to make sure the master schema + did not change, but we can do the same by checking the previous master schema pages and if + any of them were updated, as they would have to be if any change was made, figure out from there + without having to deal with the extra overhead of parsing the master schema. + + """ + + if last_master_schema_page_number != SQLITE_MASTER_SCHEMA_ROOT_PAGE: + if last_master_schema_page_number in self.updated_page_numbers: + self.master_schema_modified = True + break + + if not self.database_header_modified and self.master_schema_modified: + log_message = "The database header was not modified when the master schema was modified in version: {}." + log_message = log_message.format(self.version_number) + self._logger.error(log_message) + raise WalCommitRecordParsingError(log_message) + + """ + + The database header differences will be a dictionary with the key being within the + DATABASE_HEADER_VERSIONED_FIELDS Enum constant variables and value will be a tuple where + the first element will be the value that field held previously and the second element will + be the new value of that field. + + """ + + if self.database_header_modified: + + if not self._database_header: + log_message = "The database header does not exist when the database header was modified in commit " \ + "record version: {} on updated pages: {}." + log_message = log_message.format(self.version_number, self.updated_page_numbers) + self._logger.error(log_message) + raise WalCommitRecordParsingError(log_message) + + self.database_header_differences = compare_database_headers(last_database_header, self._database_header) + + log_message = "Database header was modified in version: {} with differences: {}." 
+ log_message = log_message.format(self.version_number, self.database_header_differences) + self._logger.info(log_message) + + else: + + self.database_header_differences = {} + + """ + + Note: Below we do not need to worry about the database page in sizes being 0 since this is a write ahead + log file being parsed which requires SQLite version >= 3.7.0. However, there may still be a use + case where the page number is wrong depending on if it was previously opened with a SQLite version + < 3.7.0 and has not been updated yet, however, this use case may not occur and has still yet to be + seen. For now, an exception is raised. + + Note: Below a warning is thrown instead of an exception because the committed page size has been found to + be wrong in some cases where the database size in pages is specified where the version valid for + number equals the file change counter. It is still unsure of why this can occur but in the use cases + this was seen, the committed page size was correct and the file was inflated (padded) with empty + space at the end indicating additional pages when calculating page size from file size. The + database class has additional documentation on this occurring and allows this since it has not been + determined why exactly this occurs. + + """ + + # Make sure the database size in pages remained the same as the committed page size + if self.committed_page_size != last_database_header.database_size_in_pages: + + log_message = "Database header for version: {} specifies a database size in pages of {} but the " \ + "committed page size is {}. Possibly erroneous use cases may occur when parsing." + log_message = log_message.format(self.version_number, last_database_header.database_size_in_pages, + self.committed_page_size) + self._logger.warn(log_message) + warn(log_message, RuntimeWarning) + + if self.master_schema_modified: + + log_message = "Master schema was modified in version: {}." + log_message = log_message.format(self.version_number) + self._logger.info(log_message) + + """ + + Below are fields that are set in the case that the database header is modified. + + These variables are set by the parse database header differences private function. If the value is + not a boolean, then it will only be set if it was updated in the header. + + Note: Even though the number of freelist pages modified may not be set, it does not mean that there have not + been updates to the pages. Same with the first freelist trunk page as well as both fields. + + Note: Pointer map pages may still be updated even if the modified largest root b-tree page number was not + modified. (Assuming it was not 0 and auto-vacuuming is turned on.) + + Note: If the database text encoding was not previously set in the versions, it will be set here. + + """ + + self.file_change_counter_incremented = False + self.version_valid_for_number_incremented = False + self.database_size_in_pages_modified = False + self.modified_first_freelist_trunk_page_number = None + self.modified_number_of_freelist_pages = None + self.modified_largest_root_b_tree_page_number = None + self.schema_cookie_modified = False + self.schema_format_number_modified = False + self.database_text_encoding_modified = False + self.user_version_modified = False + + """ + + Call the _parse_database_header_differences method to setup the above variables and check header use cases. + + """ + + self._parse_database_header_differences() + + """ + + Create the root page and master schema if the master schema was detected to be modified. 
Also, remove all + master schema page numbers from the updated b-tree pages. + + """ + + if self.master_schema_modified: + + self._root_page = self.get_b_tree_root_page(SQLITE_MASTER_SCHEMA_ROOT_PAGE) + + self._master_schema = MasterSchema(self, self._root_page) + + # Remove the master schema page numbers from the updated b-tree pages + for master_schema_page_number in self._master_schema.master_schema_page_numbers: + if master_schema_page_number in self.updated_b_tree_page_numbers: + self.updated_b_tree_page_numbers.remove(master_schema_page_number) + + """ + + Since we do not know if the freelist pages could have been updated or not we always set them here. + We also set the pointer map pages if they the largest root b-tree page number is specified. + + Note: If there are no freelist pages, the first freelist trunk page will be None and there will be an empty + array for the freelist page numbers. + + Note: We could check and only set the pointer map pages if they were updated but it was decided to do that + regardless in order to fit the object structure of the version and database better and due to the low + overhead of doing this. + + """ + + first_freelist_trunk_page_number = last_database_header.first_freelist_trunk_page_number + if self._database_header: + first_freelist_trunk_page_number = self._database_header.first_freelist_trunk_page_number + + if first_freelist_trunk_page_number: + self.first_freelist_trunk_page = FreelistTrunkPage(self, first_freelist_trunk_page_number, + FIRST_FREELIST_TRUNK_PARENT_PAGE_NUMBER, + FIRST_FREELIST_TRUNK_PAGE_INDEX) + + self.freelist_page_numbers = [] + observed_freelist_pages = 0 + freelist_trunk_page = self.first_freelist_trunk_page + while freelist_trunk_page: + self.freelist_page_numbers.append(freelist_trunk_page.number) + observed_freelist_pages += 1 + for freelist_leaf_page in freelist_trunk_page.freelist_leaf_pages: + self.freelist_page_numbers.append(freelist_leaf_page.number) + observed_freelist_pages += 1 + freelist_trunk_page = freelist_trunk_page.next_freelist_trunk_page + + number_of_freelist_pages = last_database_header.number_of_freelist_pages + if self._database_header: + number_of_freelist_pages = self._database_header.number_of_freelist_pages + + if observed_freelist_pages != number_of_freelist_pages: + log_message = "The number of observed freelist pages: {} does not match the number of freelist pages " \ + "specified in the header: {} for version: {}." + log_message = log_message.format(observed_freelist_pages, number_of_freelist_pages, self.version_number) + self._logger.error(log_message) + raise WalCommitRecordParsingError(log_message) + + for freelist_page_number in self.freelist_page_numbers: + if freelist_page_number in self.updated_page_numbers: + self.freelist_pages_modified = True + + # Remove the freelist page numbers from the updated b-tree pages + if freelist_page_number in self.updated_b_tree_page_numbers: + self.updated_b_tree_page_numbers.remove(freelist_page_number) + + """ + + Create the pointer map pages. + + Note: If there are no pointer map pages, both the pointer map pages and pointer map page numbers will be an + empty array. 
+ + """ + + largest_root_b_tree_page_number = last_database_header.largest_root_b_tree_page_number + if self._database_header: + largest_root_b_tree_page_number = self._database_header.largest_root_b_tree_page_number + + if largest_root_b_tree_page_number: + self.pointer_map_pages = create_pointer_map_pages(self, self.database_size_in_pages, self.page_size) + else: + self.pointer_map_pages = [] + + self.pointer_map_page_numbers = [] + for pointer_map_page in self.pointer_map_pages: + self.pointer_map_page_numbers.append(pointer_map_page.number) + + for pointer_map_page_number in self.pointer_map_page_numbers: + if pointer_map_page_number in self.updated_page_numbers: + self.pointer_map_pages_modified = True + + # Remove the pointer map page numbers from the updated b-tree pages + if pointer_map_page_number in self.updated_b_tree_page_numbers: + self.updated_b_tree_page_numbers.remove(pointer_map_page_number) + + """ + + Note: At this point the updated_b_tree_page_numbers has all of the page numbers that refer to updated b-trees + in this commit record with all master schema, freelist, and pointer map pages filtered out. + + """ + + """ + + The last database header and last master schema are set if no database header or master schema was parsed from + this commit record for reference. + + """ + + self.last_database_header = None + if not self.database_header_modified: + self.last_database_header = last_database_header + + self.last_master_schema = None + if not self.master_schema_modified: + self.last_master_schema = last_master_schema + + """ + + If the version information is being stored in memory, parse out the database header, root page, and master + schema (if it was already not parsed out) and pages and store them as a private variable. + + """ + + if self.store_in_memory: + + if not self._database_header: + root_page_data = self.get_page_data(SQLITE_MASTER_SCHEMA_ROOT_PAGE) + self._database_header = DatabaseHeader(root_page_data[:SQLITE_DATABASE_HEADER_LENGTH]) + + if not self._root_page: + self._root_page = self.get_b_tree_root_page(SQLITE_MASTER_SCHEMA_ROOT_PAGE) + + if not self._master_schema: + self._master_schema = MasterSchema(self, self._root_page) + + self._pages = self.pages + + log_message = "Commit record: {} on page numbers: {} successfully created." 
+ log_message = log_message.format(self.version_number, self.updated_page_numbers) + self._logger.info(log_message) + + def stringify(self, padding="", print_pages=True, print_schema=True, print_frames=True): + + # Create the initial string + string = "\n" \ + + padding + "Committed: {}\n" \ + + padding + "Committed Page Size: {}\n" \ + + padding + "Frames Length: {}\n" \ + + padding + "Page Frame Index: {}\n" \ + + padding + "File Change Counter Incremented: {}\n" \ + + padding + "Version Valid for Number Incremented: {}\n" \ + + padding + "Database Size in Pages Modified: {}\n" \ + + padding + "Modified First Freelist Trunk Page Number: {}\n" \ + + padding + "Modified Number of Freelist Pages: {}\n" \ + + padding + "Modified Largest Root B-Tree Page Number: {}\n" \ + + padding + "Schema Cookie Modified: {}\n" \ + + padding + "Schema Format Number Modified: {}\n" \ + + padding + "Database Text Encoding Modified: {}\n" \ + + padding + "User Version Modified: {}" + + # Format the string + string = string.format(self.committed, + self.committed_page_size, + self.frames_length, + self.page_frame_index, + self.file_change_counter_incremented, + self.version_valid_for_number_incremented, + self.database_size_in_pages_modified, + self.modified_first_freelist_trunk_page_number, + self.modified_number_of_freelist_pages, + self.modified_largest_root_b_tree_page_number, + self.schema_cookie_modified, + self.schema_format_number_modified, + self.database_text_encoding_modified, + self.user_version_modified) + + # Add the database header differences + string += "\n" + padding + "Database Header Differences:" + + # Parse the database header differences + for field, difference in self.database_header_differences.iteritems(): + difference_string = "\n" + padding + "\t" + "Field: {} changed from previous Value: {} to new Value: {}" + string += difference_string.format(field, difference[0], difference[1]) + + # Print the frames if specified + if print_frames: + for page_number in self.frames: + string += "\n" + padding + "Frame:\n{}".format(self.frames[page_number].stringify(padding + "\t")) + + # Get the super stringify information and concatenate it with this string and return it + return super(WriteAheadLogCommitRecord, self).stringify(padding, print_pages, print_schema) + string + + @property + def frames_length(self): + return len(self.frames) + + def get_page_data(self, page_number, offset=0, number_of_bytes=None): + + page_version = self.page_version_index[page_number] + + if page_version == BASE_VERSION_NUMBER: + + return self._database.get_page_data(page_number, offset, number_of_bytes) + + else: + + # Set the number of bytes to the rest of the page if it was not set + number_of_bytes = self.page_size - offset if not number_of_bytes else number_of_bytes + + if offset >= self.page_size: + log_message = "Requested offset: {} is >= the page size: {} for page: {}." + log_message = log_message.format(offset, self.page_size, page_number) + self._logger.error(log_message) + raise ValueError(log_message) + + if offset + number_of_bytes > self.page_size: + log_message = "Requested length of data: {} at offset {} to {} is > than the page size: {} " \ + "for page: {}." 
+ log_message = log_message.format(number_of_bytes, offset, number_of_bytes + offset, + self.page_size, page_number) + self._logger.error(log_message) + raise ValueError(log_message) + + page_offset = self.get_page_offset(page_number) + + return self.file_handle.read_data(page_offset + offset, number_of_bytes) + + def get_page_offset(self, page_number): + + """ + + + + Note: This method will return the correct page offset depending on where it last showed up in relation to + this commit frame. Therefore the page offset may be very close to the beginning of the WAL file when + the last committed record in the set of frames is near the end of the WAL file. This could also return + an offset in the database file if the WAL file did not have the page updated in it's frames yet. + + This is presumed safe since the get_page_data takes in a page number and unless people are using the + read method directly from the file handles, this function is more for informative purposes. If someone + was reading directly from the file handles, it is assumed they would know the inner workings of this + library. + + :param page_number: + + :return: + + """ + + if page_number < 1 or page_number > self.database_size_in_pages: + log_message = "Invalid page number: {} for version: {} with database size in pages: {}." + log_message = log_message.format(page_number, self.version_number, self.database_size_in_pages) + self._logger.error(log_message) + raise ValueError(log_message) + + page_version = self.page_version_index[page_number] + + if page_version == BASE_VERSION_NUMBER: + + return (page_number - 1) * self.page_size + + else: + + if page_version == self.version_number: + + if page_number not in self.frames: + log_message = "Page number has version: {} but not in frame pages: {}." + log_message = log_message.format(page_number, self.frames.keys()) + self._logger.error(log_message) + raise WalCommitRecordParsingError(log_message) + + if page_number not in self.page_frame_index: + log_message = "Page number: {} with version: {} is not in the page frame index: {}." + log_message = log_message.format(page_number, page_version, self.page_frame_index) + self._logger.error(log_message) + raise KeyError(log_message) + + frame_number = self.page_frame_index[page_number] + + """ + + The WAL file is structured with a file header, then a series of frames that each have a frame header and + page in them. The offset is determined by adding the WAL header length to the number of frame header + before the page content and then added to the page size multiplied by the number of frames (minus the + current one). + + """ + + # Return where the offset of the page to this commit record in the WAL file would start at + return WAL_HEADER_LENGTH + WAL_FRAME_HEADER_LENGTH * frame_number + self.page_size * (frame_number - 1) + + def _parse_database_header_differences(self): + + """ + + This function is a private function that will check and set the variables for this commit record for differences + in database headers between this commit record and the last database header. + + Note: The database header differences will be a dictionary keyed by the DATABASE_HEADER_VERSIONED_FIELDS + which will refer to a tuple where the first value will be the previous database header value and the + second value will be the new database header value. 
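+
+        Note: As a purely hypothetical example of the structure described above, a commit record that only
+              grew the database by one page and allocated that page to the freelist might produce a
+              differences dictionary along the lines of:
+
+                  {
+                      DATABASE_HEADER_VERSIONED_FIELDS.MD5_HEX_DIGEST: ("9e10...", "f43b..."),
+                      DATABASE_HEADER_VERSIONED_FIELDS.DATABASE_SIZE_IN_PAGES: (12, 13),
+                      DATABASE_HEADER_VERSIONED_FIELDS.NUMBER_OF_FREE_LIST_PAGES: (0, 1),
+                      DATABASE_HEADER_VERSIONED_FIELDS.FIRST_FREELIST_TRUNK_PAGE_NUMBER: (0, 13)
+                  }
+
+              where each tuple is (previous value, new value) and the md5 digests are truncated placeholders.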
+ + :param self: + + :raise: + + """ + + # Make sure there are database header differences + if not self.database_header_differences: + + # There are no differences so return + return + + # Make a copy of the database header differences to work with + database_header_differences = dict.copy(self.database_header_differences) + + """ + + This shows that the database headers are different and therefore one of the database header fields + have been updated. There are only a specific set of database header fields we expect to change here. + These are found in the DATABASE_HEADER_VERSIONED_FIELDS constant as the following properties of + the database header class: + 1.) MD5_HEX_DIGEST: md5_hex_digest + 2.) FILE_CHANGE_COUNTER: file_change_counter + 3.) VERSION_VALID_FOR_NUMBER: version_valid_for_number + 4.) DATABASE_SIZE_IN_PAGES: database_size_in_pages + 5.) FIRST_FREELIST_TRUNK_PAGE_NUMBER: first_freelist_trunk_page_number + 6.) NUMBER_OF_FREE_LIST_PAGES: number_of_freelist_pages + 7.) LARGEST_ROOT_B_TREE_PAGE_NUMBER: largest_root_b_tree_page_number + 8.) SCHEMA_COOKIE: schema_cookie + 9.) SCHEMA_FORMAT_NUMBER: schema_format_number + 10.) DATABASE_TEXT_ENCODING: database_text_encoding + 11.) USER_VERSION: user_version + + In order to check these fields we first compare the two headers to get back a dictionary keyed by + the property name (above in capitals) with a tuple value where the first element is the previous + database header value and the second element is the modified database header value. The property will + only exist in the dictionary if the values between the two headers are different. If additional + fields not defined above are found to be different, an exception is thrown in order to alert us + to the "assumed not to happen" use case. + + Note: The MD5_HEX_DIGEST: md5_hex_digest is a field of the database header class but not a field in the + actual database header itself. + + """ + + """ + + 1.) MD5_HEX_DIGEST: md5_hex_digest: + This will be different between both database headers since it was checked in order to enter this + area of code. However, this is still a property of the database header class and therefore needs to + be accounted for. If the md5 hex digests are not different (are not in the returned database + header differences dictionary), then a very weird use case has shown up. + + """ + + if DATABASE_HEADER_VERSIONED_FIELDS.MD5_HEX_DIGEST not in database_header_differences: + log_message = "The database header md5 hex digests are not different in the database headers " \ + "for version: {}." + log_message = log_message.format(self.version_number) + self._logger.error(log_message) + raise WalCommitRecordParsingError(log_message) + + # Delete the entry from the dictionary + del database_header_differences[DATABASE_HEADER_VERSIONED_FIELDS.MD5_HEX_DIGEST] + + """ + + The next two fields we will check together are: + 2.) FILE_CHANGE_COUNTER: file_change_counter + 3.) VERSION_VALID_FOR_NUMBER: version_valid_for_number + + These fields are changed whenever the database file is unlocked after having been modified. However, + since this is parsed in a commit record, WAL mode will be in use. In WAL mode, changes to the database + are instead detected using the wal-index (shm) file so this change counter is not needed. Therefore, + the change counter may not be incremented on each transaction. + + Previously, an assumption was made that these fields were incremented only when a checkpoint occurred in + a WAL file. 
However, these fields were found incremented in commit records of the WAL file outside of + checkpoints occurring. It is still not sure exactly what may or may not cause these fields to increment + in the WAL commit record itself. + + If either one of these fields is incremented, then the other field must also be incremented and both + must be equal. If the case appears that one has been modified and the other one has been not, an + exception will be thrown. + + """ + + # Check that the file change counter was not modified without the version valid for number + if DATABASE_HEADER_VERSIONED_FIELDS.FILE_CHANGE_COUNTER in database_header_differences \ + and DATABASE_HEADER_VERSIONED_FIELDS.VERSION_VALID_FOR_NUMBER not in database_header_differences: + log_message = "The database header file change counter: {} was found in the database header " \ + "differences but the version valid for number was not for version: {}." + log_message = log_message.format(database_header_differences[ + DATABASE_HEADER_VERSIONED_FIELDS.FILE_CHANGE_COUNTER], + self.version_number) + self._logger.error(log_message) + raise WalCommitRecordParsingError(log_message) + + # Check that the version valid for number was not modified without the file change counter + elif DATABASE_HEADER_VERSIONED_FIELDS.VERSION_VALID_FOR_NUMBER in database_header_differences \ + and DATABASE_HEADER_VERSIONED_FIELDS.FILE_CHANGE_COUNTER not in database_header_differences: + log_message = "The database header version valid for number: {} was found in the database header " \ + "differences but the file change counter was not for version: {}." + log_message = log_message.format(database_header_differences[ + DATABASE_HEADER_VERSIONED_FIELDS.VERSION_VALID_FOR_NUMBER], + self.version_number) + self._logger.error(log_message) + raise WalCommitRecordParsingError(log_message) + + # Check if both file change counter and version valid for number was modified + elif DATABASE_HEADER_VERSIONED_FIELDS.FILE_CHANGE_COUNTER in database_header_differences \ + and DATABASE_HEADER_VERSIONED_FIELDS.VERSION_VALID_FOR_NUMBER in database_header_differences: + + """ + + Note: We check both fields are incremented only one value from their value in the previous version. + If they are not, an exception is thrown. This may be incorrect and their values may be able to + increment more than one value but more investigation is needed on this. + + """ + + # Get the file change counter difference + file_change_counter_difference = database_header_differences[ + DATABASE_HEADER_VERSIONED_FIELDS.FILE_CHANGE_COUNTER] + + # Check the file change counter difference against it's previous value as stated above + if file_change_counter_difference[0] + 1 != file_change_counter_difference[1]: + log_message = "The previous database header file change counter: {} is more than one off from the " \ + "new database header file change counter: {} for version: {}." 
+ log_message = log_message.format(file_change_counter_difference[0], file_change_counter_difference[1], + self.version_number) + self._logger.error(log_message) + raise WalCommitRecordParsingError(log_message) + + # Get the version valid for number difference + version_valid_for_number_difference = database_header_differences[ + DATABASE_HEADER_VERSIONED_FIELDS.VERSION_VALID_FOR_NUMBER] + + # Check the version valid for number difference against it's previous value as stated above + if version_valid_for_number_difference[0] + 1 != version_valid_for_number_difference[1]: + log_message = "The previous database header version valid for number: {} is more than one off from " \ + "the new database header version valid for number: {} for version: {}." + log_message = log_message.format(version_valid_for_number_difference[0], + version_valid_for_number_difference[1], self.version_number) + self._logger.error(log_message) + raise WalCommitRecordParsingError(log_message) + + # Update the class variables to signify these fields were incremented + self.file_change_counter_incremented = True + self.version_valid_for_number_incremented = True + + # Delete the entries from the dictionary + del database_header_differences[DATABASE_HEADER_VERSIONED_FIELDS.FILE_CHANGE_COUNTER] + del database_header_differences[DATABASE_HEADER_VERSIONED_FIELDS.VERSION_VALID_FOR_NUMBER] + + """ + + 4.) DATABASE_SIZE_IN_PAGES: database_size_in_pages: + + Here we check if the database size in pages was updated from it's previous size. If it was we check this + against the committed page size for the commit record. + + Note: We check that the committed page size obtained from the size of the database file in pages field + in the commit record frame is equal to the database size in pages. This should always be equal + unless the previous use case occurs which is checked for above where the "version valid for" + field does not match the change counter. But this will cause an exception preventing the code + from reaching this point. This should additionally be checked since the committed page size + should equal the database header of the previous version database header if the database size + in pages field did not change. + + """ + + if DATABASE_HEADER_VERSIONED_FIELDS.DATABASE_SIZE_IN_PAGES in database_header_differences: + + # Get the database size in pages difference + database_size_in_pages_difference = database_header_differences[ + DATABASE_HEADER_VERSIONED_FIELDS.DATABASE_SIZE_IN_PAGES] + + # The committed page size is checked here but should also be checked at the end of this process + if self.committed_page_size != database_size_in_pages_difference[1]: + log_message = "The committed page size: {} of commit record version: {} does not match the database" \ + "header size in pages: {} changed from {} on updated pages: {}." + log_message = log_message.format(self.committed_page_size, self.version_number, + database_size_in_pages_difference[1], + database_size_in_pages_difference[0], + self.updated_page_numbers) + self._logger.error(log_message) + raise WalCommitRecordParsingError(log_message) + + # Set the database size in pages modified flag + self.database_size_in_pages_modified = True + + # Delete the entry from the dictionary + del database_header_differences[DATABASE_HEADER_VERSIONED_FIELDS.DATABASE_SIZE_IN_PAGES] + + """ + + The next two fields we are going to pay attention to are in respect to freelist pages: + 5.) FIRST_FREELIST_TRUNK_PAGE_NUMBER: first_freelist_trunk_page_number + 6.) 
NUMBER_OF_FREELIST_PAGES: number_of_freelist_pages + + If either of these two fields are different it signifies that the freelist pages in the database were + changed. If there were no freelist pages previously then both of these should values should be 0 and + not included in the database header differences dictionary after comparison. + + Additional use cases: + + 1.) The first freelist trunk page number could be 0 as well as the number of freelist pages whereas + previously there was at least one freelist trunk page existing. This is checked by making sure + all previous freelist pages are checked that they are either accounted for in this freelist page set + or not in this freelist set but in the pages of this commit record as another page. If not, an + exception is thrown. + + 2.) There is a possibility where the freelist pages were updated without changing the + number of freelist pages and/or freelist trunk page which additionally needs to be checked. + This would mean freelist pages could change without updates to the database header itself. + + 3.) If the database size in pages changed then the freelist pages could be out of range if the modified + size is less than the previous size. However, this use case applies to all other page types as well + and will be checked when the database size is checked against all of the page numbers in the + database/WAL commit record so it is not needed to be worried about here. + + """ + + if DATABASE_HEADER_VERSIONED_FIELDS.FIRST_FREELIST_TRUNK_PAGE_NUMBER in database_header_differences: + value = database_header_differences[DATABASE_HEADER_VERSIONED_FIELDS.FIRST_FREELIST_TRUNK_PAGE_NUMBER] + self.modified_first_freelist_trunk_page_number = value[1] + + # Delete the entry from the dictionary + del database_header_differences[DATABASE_HEADER_VERSIONED_FIELDS.FIRST_FREELIST_TRUNK_PAGE_NUMBER] + + if DATABASE_HEADER_VERSIONED_FIELDS.NUMBER_OF_FREE_LIST_PAGES in database_header_differences: + value = database_header_differences[DATABASE_HEADER_VERSIONED_FIELDS.NUMBER_OF_FREE_LIST_PAGES] + self.modified_number_of_freelist_pages = value[1] + + # Delete the entry from the dictionary + del database_header_differences[DATABASE_HEADER_VERSIONED_FIELDS.NUMBER_OF_FREE_LIST_PAGES] + + """ + + 7.) LARGEST_ROOT_B_TREE_PAGE_NUMBER: largest_root_b_tree_page_number + The next thing to check in the header is the largest root b tree page number. We will check further + down if pointer map pages are being used by seeing if this field is set to a non-zero value. Here + we are going to see if it changed. If it did change, we are only worried over the use case of it going + from 0 to a non-zero value. According to the SQLite documentation, the auto-vacuuming mode has to be set + (enabled) before any tables are created in the schema. Once a table has been created, it cannot be turned + off. However, the mode can be changed between full (1) and incremental (2). 
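+
+        Note: In SQLite itself this header field is driven by "PRAGMA auto_vacuum" (0 = none, 1 = full,
+              2 = incremental), which only takes effect when issued before the first table is created. A
+              minimal, illustrative sketch of producing such a database (not part of this library):
+
+                  import sqlite3
+                  connection = sqlite3.connect("auto_vacuum_example.db")
+                  connection.execute("PRAGMA auto_vacuum = 1")  # must run before any tables exist
+                  connection.execute("CREATE TABLE example(id INTEGER PRIMARY KEY)")
+                  connection.commit()
+                  connection.close()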
+ + """ + + if DATABASE_HEADER_VERSIONED_FIELDS.LARGEST_ROOT_B_TREE_PAGE_NUMBER in database_header_differences: + change = database_header_differences[ + DATABASE_HEADER_VERSIONED_FIELDS.LARGEST_ROOT_B_TREE_PAGE_NUMBER] + previous_largest_root_b_tree_page_number = change[0] + new_largest_root_b_tree_page_number = change[1] + + # Check if auto-vacuuming was turned off + if previous_largest_root_b_tree_page_number and not new_largest_root_b_tree_page_number: + log_message = "The previous largest root b-tree page number: {} existed where the new one does not " \ + "meaning that auto-vacuuming was turned off which cannot occur in version: {} on " \ + "updated pages: {}." + log_message = log_message.format(previous_largest_root_b_tree_page_number, self.version_number, + self.updated_page_numbers) + self._logger.error(log_message) + raise WalCommitRecordParsingError(log_message) + + # Check if auto-vacuuming was turned on + elif not previous_largest_root_b_tree_page_number and new_largest_root_b_tree_page_number: + log_message = "The previous largest root b-tree page number did not exist where the new one is: {} " \ + "meaning that auto-vacuuming was turned on which cannot occur in version: {} on " \ + "updated pages: {}." + log_message = log_message.format(previous_largest_root_b_tree_page_number, self.version_number, + self.updated_page_numbers) + self._logger.error(log_message) + raise WalCommitRecordParsingError(log_message) + + """ + + Note: Since an exception is being thrown here, we do not delete the entry from the dictionary. + + """ + + """ + + At this point we know that auto-vacuuming was on and has remained on and only the largest root + b tree page number changed. We had five use cases to be concerned about here: + 1.) Auto-Vacuuming was on initially and then turned off: + This use case was handled above and an exception is currently thrown. + 2.) Auto-Vacuuming was off initially and then turned on: + This use case was handled above and an exception is currently thrown. + 3.) Auto-Vacuuming was never on: + In this case there would be a zero in both headers meaning there would not be a change + from the previous version and this portion of the code would not be executing. + 4.) Auto-Vacuuming was turned on and the largest root b tree page number did not change: + In this case both headers would have the same non-zero value meaning there would not be a change + from the previous version and this portion of the code would not be executing. + 5.) Auto-Vacuuming was turned on and the largest root b tree page number changed: + Here we don't have to worry about doing anything extra other than removing the change from the + database header differences so it does not cause a exception later on. Other areas of the code + will use the modified largest root b-tree page number to handle pointer map pages. + + """ + + # Set the modified largest root b-tree page number + self.modified_largest_root_b_tree_page_number = new_largest_root_b_tree_page_number + + # Delete the entry from the dictionary + del database_header_differences[DATABASE_HEADER_VERSIONED_FIELDS.LARGEST_ROOT_B_TREE_PAGE_NUMBER] + + """ + + 8.) SCHEMA_COOKIE: schema_cookie + Next we check for the schema cookie. This field is incremented if a change to the database schema + occurs. This will mean that at least one of the master schema pages had to change and be in this + version's pages. This could be the root page or any of it's b-tree pages (if any). 
Keep in mind + that the schema cookie being incremented does not mean the root page b-tree content has to change, rather + a leaf page to the root page could change. Later on in this process, the schema cookie will be checked + against the master schema pages to make make sure at least one of the pages was in this version, otherwise + an exception is thrown since this is not expected. + + Note: If the schema cookie is updated, then the master schema must have been updated so this is check as well. + + """ + + if DATABASE_HEADER_VERSIONED_FIELDS.SCHEMA_COOKIE in database_header_differences: + + # Get the schema cookie difference + schema_cookie_difference = database_header_differences[ + DATABASE_HEADER_VERSIONED_FIELDS.SCHEMA_COOKIE] + + # Check the schema cookie difference against 'previous value to make sure it is not less + if schema_cookie_difference[0] > schema_cookie_difference[1]: + log_message = "The schema cookie was modified but the previous value: {} is greater than the new " \ + "value: {} which cannot occur in version: {} on updated pages: {}." + log_message = log_message.format(schema_cookie_difference[0], schema_cookie_difference[1], + self.version_number, self.updated_page_numbers) + self._logger.error(log_message) + raise WalCommitRecordParsingError(log_message) + + # Update the file change counter modified variable to signify this field was incremented + self.schema_cookie_modified = True + + if not self.master_schema_modified: + log_message = "The schema cookie was modified from {} to: {} indicating the master schema was " \ + "modified but was found not to have been in version: {} on updated pages: {}." + log_message = log_message.format(schema_cookie_difference[0], schema_cookie_difference[1], + self.version_number, self.updated_page_numbers) + self._logger.error(log_message) + raise WalCommitRecordParsingError(log_message) + + # Delete the entry from the dictionary + del database_header_differences[DATABASE_HEADER_VERSIONED_FIELDS.SCHEMA_COOKIE] + + elif self.master_schema_modified: + log_message = "The schema cookie was not modified indicating the master schema was not modified " \ + "as well but was found to have been in version: {} on updated pages: {}." + log_message = log_message.format(self.version_number, self.updated_page_numbers) + self._logger.error(log_message) + raise WalCommitRecordParsingError(log_message) + + """ + + The next two fields to check are the: + 9.) SCHEMA_FORMAT_NUMBER: schema_format_number + 10.) DATABASE_TEXT_ENCODING: database_text_encoding + + These should only appear where the master schema was originally empty and then had entries added to it. In + this case both of these numbers should originally have been zero. When changed, the schema format number will + be within the VALID_SCHEMA_FORMATS and the the database text encoding will be within the + DATABASE_TEXT_ENCODINGS. However, it is not needed that we check against this since this is done when parsing + the database header itself. + + When these are specified we check for the following use cases to validate: + 1.) Both fields exist in the database header differences. + 1.) Both of their values were originally 0. + 2.) The database size in pages was originally 1. 
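+
+        Note: For reference, once these fields are set the SQLite file format only permits a small range of
+              values: schema format numbers 1 through 4, and database text encodings of 1 (UTF-8),
+              2 (UTF-16le) or 3 (UTF-16be). The branch further below that sets the encoding simply maps the
+              new header value onto the corresponding constant, roughly:
+
+                  1 -> UTF_8, 2 -> UTF_16LE, 3 -> UTF_16BE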
+ + """ + + # Check that the schema format number was not modified without the database text encoding + if DATABASE_HEADER_VERSIONED_FIELDS.SCHEMA_FORMAT_NUMBER in database_header_differences \ + and DATABASE_HEADER_VERSIONED_FIELDS.DATABASE_TEXT_ENCODING not in database_header_differences: + log_message = "The database header schema format number: {} was found in the database header " \ + "differences but the database text encoding was not for version: {}." + log_message = log_message.format(database_header_differences[ + DATABASE_HEADER_VERSIONED_FIELDS.SCHEMA_FORMAT_NUMBER], + self.version_number) + self._logger.error(log_message) + raise WalCommitRecordParsingError(log_message) + + # Check that the database text encoding was not modified without the schema format number + elif DATABASE_HEADER_VERSIONED_FIELDS.DATABASE_TEXT_ENCODING in database_header_differences \ + and DATABASE_HEADER_VERSIONED_FIELDS.SCHEMA_FORMAT_NUMBER not in database_header_differences: + log_message = "The database header database text encoding: {} was found in the database header " \ + "differences but the schema format number was not for version: {}." + log_message = log_message.format(database_header_differences[ + DATABASE_HEADER_VERSIONED_FIELDS.DATABASE_TEXT_ENCODING], + self.version_number) + self._logger.error(log_message) + raise WalCommitRecordParsingError(log_message) + + # Check if both the schema format number was not modified without the database text encoding was modified + elif DATABASE_HEADER_VERSIONED_FIELDS.SCHEMA_FORMAT_NUMBER in database_header_differences \ + and DATABASE_HEADER_VERSIONED_FIELDS.DATABASE_TEXT_ENCODING in database_header_differences: + + # Get the schema format number difference + schema_format_number_difference = database_header_differences[ + DATABASE_HEADER_VERSIONED_FIELDS.SCHEMA_FORMAT_NUMBER] + + # Check that the schema format number was previously 0 + if schema_format_number_difference[0] != 0: + log_message = "The previous database header schema format number: {} is not equal to 0 as expected " \ + "and has a new database header schema format number: {} for version: {}." + log_message = log_message.format(schema_format_number_difference[0], schema_format_number_difference[1], + self.version_number) + self._logger.error(log_message) + raise WalCommitRecordParsingError(log_message) + + # Get the database text encoding difference + database_text_encoding_difference = database_header_differences[ + DATABASE_HEADER_VERSIONED_FIELDS.DATABASE_TEXT_ENCODING] + + # Check that the database text encoding was previously 0 + if database_text_encoding_difference[0] != 0: + log_message = "The previous database header database text encoding: {} is not equal to 0 as expected " \ + "and has a new database header database text encoding: {} for version: {}." + log_message = log_message.format(database_text_encoding_difference[0], + database_text_encoding_difference[1], self.version_number) + self._logger.error(log_message) + raise WalCommitRecordParsingError(log_message) + + """ + + Make sure the database size in pages was previously 1. + + Note: This is pulled from the original database header differences dictionary since it has already been + removed from the local copy. 
+ + """ + + if DATABASE_HEADER_VERSIONED_FIELDS.DATABASE_SIZE_IN_PAGES not in self.database_header_differences: + log_message = "The schema format number was changed from: {} to: {} and database text encoding was " \ + "changed from: {} to: {} when the database size in pages was not updated and " \ + "stayed the same size of: {} when it should have initially been 1 and changed to a " \ + "greater number in version: {} on updated pages: {}." + log_message = log_message.format(schema_format_number_difference[0], schema_format_number_difference[1], + database_text_encoding_difference[0], + database_text_encoding_difference[1], + self.database_size_in_pages, + self.version_number, self.updated_page_numbers) + self._logger.error(log_message) + raise WalCommitRecordParsingError(log_message) + + # Get the database size in pages difference + database_size_in_pages_difference = self.database_header_differences[ + DATABASE_HEADER_VERSIONED_FIELDS.DATABASE_SIZE_IN_PAGES] + + # Check the database size in pages was previously 1 + if database_size_in_pages_difference[0] != 1: + log_message = "The schema format number was changed from: {} to: {} and database text encoding was " \ + "changed from: {} to: {} when the database size in pages was updated from: {} to:{} " \ + "when it should have initially been 1 in version: {} on updated pages: {}." + log_message = log_message.format(schema_format_number_difference[0], schema_format_number_difference[1], + database_text_encoding_difference[0], + database_text_encoding_difference[1], + database_size_in_pages_difference[0], + database_size_in_pages_difference[1], + self.version_number, self.updated_page_numbers) + self._logger.error(log_message) + raise WalCommitRecordParsingError(log_message) + + # Update the class variables to signify these fields were incremented + self.schema_format_number_modified = True + self.database_text_encoding_modified = True + + """ + + Since the database encoding as not been set yet, we set it in the WAL file handle by calling the + database_text_encoding property of the superclass.. Since nothing should be reading from the database + since nothing was written to it, we do not have to worry about setting the database text encoding in + the database. + + Note: Once the database text encoding is set, it can no longer be changed. + + """ + + database_text_encoding = database_text_encoding_difference[1] + + if database_text_encoding == UTF_8_DATABASE_TEXT_ENCODING: + self.database_text_encoding = UTF_8 + elif database_text_encoding == UTF_16LE_DATABASE_TEXT_ENCODING: + self.database_text_encoding = UTF_16LE + elif database_text_encoding == UTF_16BE_DATABASE_TEXT_ENCODING: + self.database_text_encoding = UTF_16BE + elif database_text_encoding: + log_message = "The database text encoding: {} is not recognized as a valid database text encoding." + log_message = log_message.format(database_text_encoding) + self._logger.error(log_message) + raise WalCommitRecordParsingError(log_message) + + # Delete the entries from the dictionary + del database_header_differences[DATABASE_HEADER_VERSIONED_FIELDS.SCHEMA_FORMAT_NUMBER] + del database_header_differences[DATABASE_HEADER_VERSIONED_FIELDS.DATABASE_TEXT_ENCODING] + + """ + + 11.) USER_VERSION: user_version: + + The user version is not used by SQLite and is a user-defined version for developers to be able to track their + own versions of a SQLite database file for instances where the schema may be modified constantly, etc. + + Here we only check for this, and report it by setting the flag. 
Afterwards, we remove it from the database + header differences dictionary since it cannot be used to gleam any information about the database file while + parsing. + + """ + + if DATABASE_HEADER_VERSIONED_FIELDS.USER_VERSION in database_header_differences: + + # Set the user version modified flag + self.user_version_modified = True + + # Delete the entry from the dictionary + del database_header_differences[DATABASE_HEADER_VERSIONED_FIELDS.USER_VERSION] + + """ + + Make sure there are no additional differences that are not accounted for. If there are, throw an + exception in order to flag the use case for occurring. + + """ + + # Throw an exception if any database header differences still exist + if database_header_differences: + log_message = "Database header differences still exist after checking the last database header against " \ + "this current commit record version: {} on updated pages: {}. The main set of differences " \ + "was: {} with remaining differences: {}." + log_message = log_message.format(self.version_number, self.updated_page_numbers, + self.database_header_differences, + database_header_differences) + self._logger.error(log_message) + raise WalCommitRecordParsingError(log_message) diff --git a/sqlite_dissect/file/wal/frame.py b/sqlite_dissect/file/wal/frame.py new file mode 100644 index 0000000..77acbdd --- /dev/null +++ b/sqlite_dissect/file/wal/frame.py @@ -0,0 +1,106 @@ +from binascii import hexlify +from logging import getLogger +from re import sub +from sqlite_dissect.constants import FILE_TYPE +from sqlite_dissect.constants import LOGGER_NAME +from sqlite_dissect.constants import MASTER_PAGE_HEX_ID +from sqlite_dissect.constants import SQLITE_DATABASE_HEADER_LENGTH +from sqlite_dissect.constants import WAL_FRAME_HEADER_LENGTH +from sqlite_dissect.constants import WAL_HEADER_LENGTH +from sqlite_dissect.exception import WalParsingError +from sqlite_dissect.file.wal.header import WriteAheadLogFrameHeader + +""" + +frame.py + +This script holds the objects used for parsing the WAL frame. + +Note: The WriteAheadLogFrame class is not responsible for parsing the page data itself. It is meant to give + information on the WALv frame and offsets of the page data but in order to parse the page data, the set of all + page changes to the commit record this frame belongs in is needed. Therefore the commit record class + (WriteAheadLogCommitRecord) will be responsible for parsing pages. + + There was some discussion about the page being stored back in the WriteAheadLogFrame once parsed but it was + decided that this made little to no difference and should just be retrieved from the commit record. + + As a side note, there are some basic things parsed from the page such as the page type. This is only for + debugging and logging purposes. + +This script holds the following object(s): +WriteAheadLogFrame(object) + +""" + + +class WriteAheadLogFrame(object): + + def __init__(self, file_handle, frame_index, commit_record_number): + + logger = getLogger(LOGGER_NAME) + + if file_handle.file_type != FILE_TYPE.WAL: + log_message = "The wal frame file handle file type is not {} as expected but is {} for frame index: {} " \ + "commit record number: {}." 
+ log_message = log_message.format(FILE_TYPE.WAL, file_handle.file_type, frame_index, commit_record_number) + logger.error(log_message) + raise ValueError(log_message) + + self.frame_index = frame_index + self.frame_number = self.frame_index + 1 + self.commit_record_number = commit_record_number + + self.offset = self._get_write_ahead_log_frame_offset(self.frame_index, file_handle.header.page_size) + self.frame_size = WAL_FRAME_HEADER_LENGTH + file_handle.header.page_size + + wal_frame = file_handle.read_data(self.offset, self.frame_size) + self.header = WriteAheadLogFrameHeader(wal_frame[:WAL_FRAME_HEADER_LENGTH]) + self.commit_frame = True if self.header.page_size_after_commit else False + page_content = wal_frame[WAL_FRAME_HEADER_LENGTH:] + + if len(page_content) != file_handle.header.page_size: + log_message = "Page content was found to be: {} when expected to be: {} as declared in the wal file " \ + "header for frame index: {} commit record number: {}." + log_message = log_message.format(len(page_content), file_handle.header.page_size, + frame_index, commit_record_number) + logger.error(log_message) + raise WalParsingError(log_message) + + self.contains_sqlite_database_header = False + self.page_hex_type = page_content[0:1] + + if self.page_hex_type == MASTER_PAGE_HEX_ID: + self.page_hex_type = page_content[SQLITE_DATABASE_HEADER_LENGTH:SQLITE_DATABASE_HEADER_LENGTH + 1] + self.contains_sqlite_database_header = True + + def __repr__(self): + return self.__str__().encode("hex") + + def __str__(self): + return sub("\t", "", sub("\n", " ", self.stringify())) + + def stringify(self, padding=""): + string = padding + "Frame Index: {}\n" \ + + padding + "Frame Number: {}\n" \ + + padding + "Commit Record Number: {}\n" \ + + padding + "Offset: {}\n" \ + + padding + "Frame Size: {}\n" \ + + padding + "Commit Frame: {}\n" \ + + padding + "Header:\n{}\n"\ + + padding + "Contains SQLite Database Header: {}\n" \ + + padding + "Page Hex Type (Hex): {}" + string = string.format(self.frame_index, + self.frame_number, + self.commit_record_number, + self.offset, + self.frame_size, + self.commit_frame, + self.header.stringify(padding + "\t"), + self.contains_sqlite_database_header, + hexlify(self.page_hex_type)) + return string + + @staticmethod + def _get_write_ahead_log_frame_offset(index, page_size): + wal_frame_size = WAL_FRAME_HEADER_LENGTH + page_size + return WAL_HEADER_LENGTH + index * wal_frame_size diff --git a/sqlite_dissect/file/wal/header.py b/sqlite_dissect/file/wal/header.py new file mode 100644 index 0000000..7acdd34 --- /dev/null +++ b/sqlite_dissect/file/wal/header.py @@ -0,0 +1,141 @@ +from logging import getLogger +from re import sub +from struct import unpack +from warnings import warn +from sqlite_dissect.constants import LOGGER_NAME +from sqlite_dissect.constants import WAL_FILE_FORMAT_VERSION +from sqlite_dissect.constants import WAL_FRAME_HEADER_LENGTH +from sqlite_dissect.constants import WAL_HEADER_LENGTH +from sqlite_dissect.constants import WAL_MAGIC_NUMBER_BIG_ENDIAN +from sqlite_dissect.constants import WAL_MAGIC_NUMBER_LITTLE_ENDIAN +from sqlite_dissect.exception import HeaderParsingError +from sqlite_dissect.file.header import SQLiteHeader +from sqlite_dissect.utilities import get_md5_hash + +""" + +header.py + +This script holds the header objects used for parsing the header of the WAL file and WAL frames. 
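The _get_write_ahead_log_frame_offset helper above follows directly from the WAL layout: a fixed-size file header followed by equally sized frames, each consisting of a frame header plus exactly one database page. A hedged worked example using the sizes from the published SQLite WAL format (32-byte WAL header, 24-byte frame header):

```python
# Sizes from the documented SQLite WAL format.
WAL_HEADER_LENGTH = 32
WAL_FRAME_HEADER_LENGTH = 24


def wal_frame_offset(frame_index, page_size):
    # Each frame is a 24-byte frame header followed by exactly one page.
    frame_size = WAL_FRAME_HEADER_LENGTH + page_size
    return WAL_HEADER_LENGTH + frame_index * frame_size


def wal_frame_count(wal_file_size, page_size):
    # Frames always overwrite fully, so the bytes after the WAL header divide
    # evenly into whole frames for an undamaged file.
    return (wal_file_size - WAL_HEADER_LENGTH) // (WAL_FRAME_HEADER_LENGTH + page_size)


# With a 4096-byte page size, frame 0 starts at byte 32 and frame 2 at byte 8272.
print(wal_frame_offset(0, 4096))             # 32
print(wal_frame_offset(2, 4096))             # 8272
print(wal_frame_count(32 + 3 * 4120, 4096))  # 3
```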
+ +This script holds the following object(s): +WriteAheadLogHeader(SQLiteHeader) +WriteAheadLogFrameHeader(object) + +""" + + +class WriteAheadLogHeader(SQLiteHeader): + + def __init__(self, wal_header_byte_array): + + super(WriteAheadLogHeader, self).__init__() + + logger = getLogger(LOGGER_NAME) + + if len(wal_header_byte_array) != WAL_HEADER_LENGTH: + log_message = "The wal header byte array of size: {} is not the expected size of: {}." + log_message = log_message.format(len(wal_header_byte_array), WAL_HEADER_LENGTH) + logger.error(log_message) + raise ValueError(log_message) + + self.magic_number = unpack(b">I", wal_header_byte_array[0:4])[0] + + """ + + Note: The magic number specifies either big endian or little endian encoding for checksums. + + """ + + if self.magic_number not in [WAL_MAGIC_NUMBER_BIG_ENDIAN, WAL_MAGIC_NUMBER_LITTLE_ENDIAN]: + log_message = "The magic number: {} is valid.".format(self.magic_number) + logger.error(log_message) + raise HeaderParsingError(log_message) + + self.file_format_version = unpack(b">I", wal_header_byte_array[4:8])[0] + + if self.file_format_version != WAL_FILE_FORMAT_VERSION: + log_message = "An unsupported file format version was found: {} instead of the expected value: {}." + log_message = log_message.format(self.file_format_version, WAL_FILE_FORMAT_VERSION) + logger.error(log_message) + raise HeaderParsingError(log_message) + + self.page_size = unpack(b">I", wal_header_byte_array[8:12])[0] + self.checkpoint_sequence_number = unpack(b">I", wal_header_byte_array[12:16])[0] + + if self.checkpoint_sequence_number != 0: + log_message = "Checkpoint sequence number is {} instead of 0 and may cause inconsistencies in wal parsing." + log_message = log_message.format(self.checkpoint_sequence_number) + logger.warn(log_message) + warn(log_message, RuntimeWarning) + + self.salt_1 = unpack(b">I", wal_header_byte_array[16:20])[0] + self.salt_2 = unpack(b">I", wal_header_byte_array[20:24])[0] + self.checksum_1 = unpack(b">I", wal_header_byte_array[24:28])[0] + self.checksum_2 = unpack(b">I", wal_header_byte_array[28:32])[0] + + self.md5_hex_digest = get_md5_hash(wal_header_byte_array) + + def stringify(self, padding=""): + string = padding + "Magic Number: {}\n" \ + + padding + "File Format Version: {}\n" \ + + padding + "Page Size: {}\n" \ + + padding + "Checkpoint Sequence Number: {}\n" \ + + padding + "Salt 1: {}\n" \ + + padding + "Salt 2: {}\n" \ + + padding + "Checksum 1: {}\n" \ + + padding + "Checksum 2: {}\n" \ + + padding + "MD5 Hex Digest: {}" + return string.format(self.magic_number, + self.file_format_version, + self.page_size, + self.checkpoint_sequence_number, + self.salt_1, + self.salt_2, + self.checksum_1, + self.checksum_2, + self.md5_hex_digest) + + +class WriteAheadLogFrameHeader(object): + + def __init__(self, wal_frame_header_byte_array): + + logger = getLogger(LOGGER_NAME) + + if len(wal_frame_header_byte_array) != WAL_FRAME_HEADER_LENGTH: + log_message = "The wal frame header byte array of size: {} is not the expected size of: {}." 
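The salt and checksum fields parsed by WriteAheadLogHeader above (and by the frame header below) are what tie frames to the current WAL generation. For reference, a hedged sketch of the cumulative checksum algorithm described in the SQLite file-format documentation, with word order selected by the magic number; this is illustrative and not the module's own implementation:

```python
from struct import unpack


def wal_checksum(data, s0=0, s1=0, big_endian=True):
    """Cumulative WAL checksum over pairs of 32-bit words, following the
    algorithm in the SQLite file-format documentation."""
    assert len(data) % 8 == 0
    word_format = ">I" if big_endian else "<I"
    mask = 0xffffffff
    for i in range(0, len(data), 8):
        x0 = unpack(word_format, data[i:i + 4])[0]
        x1 = unpack(word_format, data[i + 4:i + 8])[0]
        s0 = (s0 + x0 + s1) & mask
        s1 = (s1 + x1 + s0) & mask
    return s0, s1


# The running checksum chains across the file: it is seeded over the first
# 24 bytes of the WAL header and then carried through each frame header and
# page, so a frame's stored checksum depends on everything before it.
print(wal_checksum(b"\x00" * 24))   # (0, 0)
```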
+ log_message = log_message.format(len(wal_frame_header_byte_array), WAL_FRAME_HEADER_LENGTH) + logger.error(log_message) + raise ValueError(log_message) + + self.page_number = unpack(b">I", wal_frame_header_byte_array[0:4])[0] + self.page_size_after_commit = unpack(b">I", wal_frame_header_byte_array[4:8])[0] + self.salt_1 = unpack(b">I", wal_frame_header_byte_array[8:12])[0] + self.salt_2 = unpack(b">I", wal_frame_header_byte_array[12:16])[0] + self.checksum_1 = unpack(b">I", wal_frame_header_byte_array[16:20])[0] + self.checksum_2 = unpack(b">I", wal_frame_header_byte_array[20:24])[0] + + self.md5_hex_digest = get_md5_hash(wal_frame_header_byte_array) + + def __repr__(self): + return self.__str__().encode("hex") + + def __str__(self): + return sub("\t", "", sub("\n", " ", self.stringify())) + + def stringify(self, padding=""): + string = padding + "Page Number: {}\n" \ + + padding + "Page Size After Commit: {}\n" \ + + padding + "Salt 1: {}\n" \ + + padding + "Salt 2: {}\n" \ + + padding + "Checksum 1: {}\n" \ + + padding + "Checksum 2: {}\n" \ + + padding + "MD5 Hex Digest: {}" + return string.format(self.page_number, + self.page_size_after_commit, + self.salt_1, + self.salt_2, + self.checksum_1, + self.checksum_2, + self.md5_hex_digest) diff --git a/sqlite_dissect/file/wal/utilities.py b/sqlite_dissect/file/wal/utilities.py new file mode 100644 index 0000000..559faad --- /dev/null +++ b/sqlite_dissect/file/wal/utilities.py @@ -0,0 +1,48 @@ +from logging import getLogger +from sqlite_dissect.constants import LOGGER_NAME +from sqlite_dissect.file.database.header import DatabaseHeader + +""" + +utilities.py + +This script holds utility functions for dealing with WAL specific objects such as comparing database header rather +than more general utility methods. + +This script holds the following function(s): +compare_database_headers(previous_database_header, new_database_header) + +""" + + +def compare_database_headers(previous_database_header, database_header): + + logger = getLogger(LOGGER_NAME) + + if not isinstance(previous_database_header, DatabaseHeader): + log_message = "The previous database header is not a Database Header but has a type of: {}." + log_message = log_message.format(type(previous_database_header)) + logger.error(log_message) + raise ValueError(log_message) + + if not isinstance(database_header, DatabaseHeader): + log_message = "The database header is not a Database Header but has a type of: {}." + log_message = log_message.format(type(database_header)) + logger.error(log_message) + raise ValueError(log_message) + + """ + + Since the two objects are the same, we are not worried about possible differences in what properties the + objects have. 
+ + """ + + database_header_changes = {} + for key in previous_database_header.__dict__.keys(): + previous_value = getattr(previous_database_header, key) + value = getattr(database_header, key) + if previous_value != value: + database_header_changes[key] = (previous_value, value) + + return database_header_changes diff --git a/sqlite_dissect/file/wal/wal.py b/sqlite_dissect/file/wal/wal.py new file mode 100644 index 0000000..80a0158 --- /dev/null +++ b/sqlite_dissect/file/wal/wal.py @@ -0,0 +1,240 @@ +from logging import getLogger +from re import sub +from warnings import warn +from sqlite_dissect.constants import FILE_TYPE +from sqlite_dissect.constants import LOGGER_NAME +from sqlite_dissect.constants import WAL_FRAME_HEADER_LENGTH +from sqlite_dissect.constants import WAL_HEADER_LENGTH +from sqlite_dissect.exception import WalParsingError +from sqlite_dissect.file.file_handle import FileHandle +from sqlite_dissect.file.wal.frame import WriteAheadLogFrame + +""" + +frame.py + +This script holds the WAL objects used for parsing the WAL file. + +This script holds the following object(s): +WriteAheadLog(object) + +""" + + +class WriteAheadLog(object): + + def __init__(self, file_identifier, store_in_memory=False, file_size=None, strict_format_checking=True): + + """ + + Constructor. + + :param file_identifier: str or file The full file path to the file to be opened or the file object. + :param store_in_memory: boolean Tells this class to store it's particular version information in memory or not. + :param file_size: int Optional parameter to supply the file size. + :param strict_format_checking: boolean Specifies if the application should exit if structural validations fail. + + """ + + self.file_handle = FileHandle(FILE_TYPE.WAL, file_identifier, file_size=file_size) + self.store_in_memory = store_in_memory + self.strict_format_checking = strict_format_checking + + logger = getLogger(LOGGER_NAME) + + frame_size = (WAL_FRAME_HEADER_LENGTH + self.file_handle.header.page_size) + + self.number_of_frames = (self.file_handle.file_size - WAL_HEADER_LENGTH) / frame_size + + valid_frame_array = [] + invalid_frame_array = [] + commit_record_number = 1 + + """ + + Since we have the possibility of WAL files executing checkpoints and overwriting themselves, we can have + invalid frames trailing the valid frames. The calculations above will always prove true since the frames are + always the same size they will always fully overwrite. Therefore, we should never come across a situation + where a WAL file has partially overwritten WAL frames in it (assuming the file is not damaged itself). + + In order to keep track of the invalid frames, we index the starting and ending frame indices that we find those + frames that correlate to a particular salt 1 value together. Salt 1 values are incremented on checkpoint + operations. Therefore we can determine the order of how the invalid frames were stored into the file by + looking at the checkpoint number and correlating the offset of the salt 1 value from the salt 1 value in + the WAL file header. + + When we find invalid frames, we will set the commit record number to None for now until further implemented. + + Below we initialize dictionary of salt 1 value to a tuple where the first and second values apply to the first + invalid frame index found and last invalid frame index found for that salt 1 value. 
Due to the way WAL files + overwrite and commit we should always have at least one frame in this use case at if it is only one frame, or + the last frame found, should always be a commit frame (ie. where the database page size after commit is set). + + Also, if there are any entries in the invalid frame indices when a valid frame is found, an exception is raised + since this should never occur. + + """ + + # Initialize the dictionary + self.invalid_frame_indices = {} + + for frame_index in range(self.number_of_frames): + + frame = WriteAheadLogFrame(self.file_handle, frame_index, commit_record_number) + + # Check if the salt 1 values were different (invalid frame) + if frame.header.salt_1 != self.file_handle.header.salt_1: + + log_message = "Frame index: {} after commit record number: {} has salt 1 of {} when expected to " \ + "be: {} and is an invalid frame." + log_message = log_message.format(frame_index, commit_record_number - 1, frame.header.salt_1, + self.file_handle.header.salt_1) + logger.debug(log_message) + + # Check if this salt value was already put into the invalid frame indices dictionary + if frame.header.salt_1 in self.invalid_frame_indices: + + # Get the previous indices + indices = self.invalid_frame_indices[frame.header.salt_1] + + # Check to make sure this frame index is the next one in the array + if indices[1] + 1 != frame_index: + log_message = "Frame index: {} with salt 1 of {} when expected to be: {} after commit " \ + "record number: {} has a different frame index than the expected: {}." + log_message = log_message.format(frame_index, frame.header.salt_1, + self.file_handle.header.salt_1, commit_record_number - 1, + indices[1] + 1) + logger.error(log_message) + raise WalParsingError(log_message) + + # Add the updated indices for the WAL value into the invalid frame indices dictionary + self.invalid_frame_indices[frame.header.salt_1] = (indices[0], frame_index) + + # The salt value was not already put into the invalid frame indices dictionary + else: + + # Add the indices for the salt value into the invalid frame indices dictionary + self.invalid_frame_indices[frame.header.salt_1] = (frame_index, frame_index) + + # Update the commit record number to None (see above documentation and script header documentation) + frame.commit_record_number = None + + # Append the frame to the invalid frame array + invalid_frame_array.append(frame) + + # Check if the salt 2 values were different if the salt 1 values were the same (error) + elif frame.header.salt_2 != self.file_handle.header.salt_2: + + log_message = "Frame index: {} after commit record number: {} has salt 2 of {} when expected to " \ + "be: {} where the salt 1 values matched." + log_message = log_message.format(frame_index, commit_record_number - 1, frame.header.salt_1, + self.file_handle.header.salt_1) + logger.error(log_message) + raise WalParsingError(log_message) + + # The frame is a valid frame + else: + + # Make sure there are no entries in the invalid frame indices or else there was an error + if self.invalid_frame_indices: + log_message = "Frame index: {} in commit record number: {} follows invalid frames." 
+ log_message = log_message.format(frame_index, commit_record_number) + logger.error(log_message) + raise WalParsingError(log_message) + + # Append the frame to the valid frame array and increment the commit record number for a commit frame + valid_frame_array.append(frame) + if frame.commit_frame: + commit_record_number += 1 + + self.frames = dict(map(lambda x: [x.frame_index, x], valid_frame_array)) + self.invalid_frames = dict(map(lambda x: [x.frame_index, x], invalid_frame_array)) + + # Check if we had invalid frames + if self.invalid_frames: + + # Print debug log messages on the WAL frame details + log_message = "The number of frames found in the wal file are: {} with {} valid frames between frame" \ + "indices {} and {} and {} invalid frames between frame indices {} and {}" + log_message = log_message.format(self.number_of_frames, len(self.frames), min(self.frames.keys()), + max(self.frames.keys()), len(self.invalid_frames), + min(self.invalid_frames.keys()), max(self.invalid_frames.keys())) + logger.debug(log_message) + + log_message = "The invalid frame indices pertaining to salt 1 values are: {}." + log_message = log_message.format(self.invalid_frame_indices) + logger.debug(log_message) + + """ + + Below we output a warning and a log message warning that implementation for invalid frames is not + handled or parsed yet. + + """ + + log_message = "The wal file contains {} invalid frames. Invalid frames are currently skipped and not " \ + "implemented which may cause loss in possible carved data at this time until implemented." + log_message = log_message.format(len(self.invalid_frames)) + logger.warn(log_message) + warn(log_message, RuntimeWarning) + + self.last_frame_commit_record = None + last_wal_frame_commit_record_index = max(self.frames.keys()) + while last_wal_frame_commit_record_index >= 0: + + """ + + Starting from the end of the file and working backwards, we find the last commit record in the file + to determine at which point the data was committed to the database file. Soon as we find that frame, + we break from the while loop. + + """ + + if self.frames[last_wal_frame_commit_record_index].header.page_size_after_commit != 0: + self.last_frame_commit_record = self.frames[last_wal_frame_commit_record_index] + break + else: + last_wal_frame_commit_record_index -= 1 + + if last_wal_frame_commit_record_index != len(self.frames) - 1: + + """ + + If the last WAL frame commit record index does not equal the number of frames, that means that there was + at least one entry in the WAL file beyond the last committed record. This use case has not been discovered + yet and a NotImplementedError will be raised here until the use case is handled. + + """ + + log_message = "The last wal frame commit record index: {} was not the last committed frame of in {} frames." 
+ log_message = log_message.format(last_wal_frame_commit_record_index, len(self.frames)) + logger.error(log_message) + raise NotImplementedError(log_message) + + def __repr__(self): + return self.__str__().encode("hex") + + def __str__(self): + return sub("\t", "", sub("\n", " ", self.stringify())) + + def stringify(self, padding="", print_frames=True): + string = padding + "File Handle:\n{}" + string = string.format(self.file_handle.stringify(padding + "\t")) + string += "\n" \ + + padding + "Number of Frames: {}\n" \ + + padding + "Number of Valid Frames: {}\n" \ + + padding + "Number of Invalid Frames: {}\n" \ + + padding + "Invalid Frames Indices: {}\n" \ + + padding + "Last Frame Commit Record Number: {}" + string = string.format(self.number_of_frames, + len(self.frames), + len(self.invalid_frames), + self.invalid_frame_indices, + self.last_frame_commit_record.frame_index + 1) + if print_frames: + for frame in self.frames.itervalues(): + string += "\n" + padding + "Frame:\n{}".format(frame.stringify(padding + "\t")) + if print_frames and self.invalid_frames: + for invalid_frame in self.invalid_frames.itervalues(): + string += "\n" + padding + "Invalid Frame:\n{}".format(invalid_frame.stringify(padding + "\t")) + return string diff --git a/sqlite_dissect/file/wal_index/README.md b/sqlite_dissect/file/wal_index/README.md new file mode 100644 index 0000000..7668f16 --- /dev/null +++ b/sqlite_dissect/file/wal_index/README.md @@ -0,0 +1,59 @@ + +# sqlite_dissect.file.wal_index + +This package will control parsing and access to the sqlite wal index files. + +- header.py +- wal_index.py + +TODO items for the "wal_index" package: + +- [ ] Finish UML class diagrams. + +
+ +### header.py +This script holds the header objects used for parsing the header of the wal index file. + +This script holds the following object(s): +- WriteAheadLogIndexHeader(SQLiteHeader) +- WriteAheadLogIndexSubHeader(SQLiteHeader) +- WriteAheadLogIndexCheckpointInfo(object) +

+ +TODO: +- [ ] Documentation improvements. +- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators. +- [ ] Finish try/except exception handling for struct.error and ord in classes. +- [ ] Implement big endian parsing (if needed). +- [ ] Create arrays for salt and checksum values rather than separate variables? They are arrays in the sqlite c code. + ##### WriteAheadLogIndexHeader Class: + - [ ] Check the two sub headers against each other to ensure they are equal. + - [ ] Document and handle exceptions that may be raised from creating subcomponents better. + ##### WriteAheadLogIndexCheckpointInfo Class: + - [ ] Handle the use case of 0xffffffff which is defined as READMARK_NOT_USED. + - [ ] Handle the use case of the first reader mark always being 0. (Check this) + +
+ +### wal_index.py +This script holds the class to parse the wal index file. + +This script holds the following object(s): +- WriteAheadLogIndex(object) +

+ +TODO: +- [ ] Documentation improvements. +- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators. +- [ ] Finish try/except exception handling for struct.error in classes. +- [ ] Implement big endian parsing (if needed). + ##### WriteAheadLogIndex Class: + - [ ] This class was a test of parsing a (single page) wal index and needs to be fully implemented. + - [ ] Should this be incorporated with the version/version history somehow? + - [ ] Update to support a file object. + - [ ] Constants for static integers. + - [ ] Use cases for implementation of retrieving unallocated space for carving? + - [ ] Check logging statements for correctness. + - [ ] Account for the database text encoding in the file handle. + - [ ] The file_size arg may not be needed since it is in the file handle and may be removed diff --git a/sqlite_dissect/file/wal_index/__init__.py b/sqlite_dissect/file/wal_index/__init__.py new file mode 100644 index 0000000..124f7d1 --- /dev/null +++ b/sqlite_dissect/file/wal_index/__init__.py @@ -0,0 +1,10 @@ + +""" + +__init__.py + +This init script will initialize any needed logic for this package. + +This package will control parsing and access to the sqlite wal index files. + +""" diff --git a/sqlite_dissect/file/wal_index/header.py b/sqlite_dissect/file/wal_index/header.py new file mode 100644 index 0000000..a7e3eee --- /dev/null +++ b/sqlite_dissect/file/wal_index/header.py @@ -0,0 +1,252 @@ +from binascii import hexlify +from logging import getLogger +from re import sub +from struct import unpack +from sqlite_dissect.constants import ENDIANNESS +from sqlite_dissect.constants import LOGGER_NAME +from sqlite_dissect.constants import WAL_INDEX_CHECKPOINT_INFO_LENGTH +from sqlite_dissect.constants import WAL_INDEX_FILE_FORMAT_VERSION +from sqlite_dissect.constants import WAL_INDEX_HEADER_LENGTH +from sqlite_dissect.constants import WAL_INDEX_LOCK_RESERVED_LENGTH +from sqlite_dissect.constants import WAL_INDEX_NUMBER_OF_SUB_HEADERS +from sqlite_dissect.constants import WAL_INDEX_NUMBER_OF_FRAMES_BACKFILLED_IN_DATABASE_LENGTH +from sqlite_dissect.constants import WAL_INDEX_READER_MARK_LENGTH +from sqlite_dissect.constants import WAL_INDEX_READER_MARK_SIZE +from sqlite_dissect.constants import WAL_INDEX_SUB_HEADER_LENGTH +from sqlite_dissect.exception import HeaderParsingError +from sqlite_dissect.file.header import SQLiteHeader +from sqlite_dissect.utilities import get_md5_hash + +""" + +header.py + +This script holds the header objects used for parsing the header of the wal index file. + +This script holds the following object(s): +WriteAheadLogIndexHeader(SQLiteHeader) +WriteAheadLogIndexSubHeader(SQLiteHeader) +WriteAheadLogIndexCheckpointInfo(object) + +""" + + +class WriteAheadLogIndexHeader(SQLiteHeader): + + def __init__(self, wal_index_header_byte_array): + + super(WriteAheadLogIndexHeader, self).__init__() + + logger = getLogger(LOGGER_NAME) + + if len(wal_index_header_byte_array) != WAL_INDEX_HEADER_LENGTH: + log_message = "The wal index header byte array of size: {} is not the expected size of: {}." + log_message = log_message.format(len(wal_index_header_byte_array), WAL_INDEX_HEADER_LENGTH) + logger.error(log_message) + raise ValueError(log_message) + + """ + + Note: The sub header will always be followed by an exact copy of itself in the WAL index file header. + Therefore, there will always be two (WAL_INDEX_NUMBER_OF_SUB_HEADERS) headers. 
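As the note above states, the wal-index header carries two copies of its sub header (SQLite readers compare the copies to detect a torn or in-progress write), and the TODO list earlier calls out checking them against each other. A minimal sketch of that comparison over the raw byte slices, using a hypothetical helper that is not part of this module:

```python
def sub_header_copies_match(wal_index_header_byte_array, sub_header_length):
    """Compare the two leading sub header copies of a wal-index header buffer."""
    first = wal_index_header_byte_array[:sub_header_length]
    second = wal_index_header_byte_array[sub_header_length:2 * sub_header_length]
    # A torn or in-progress write can leave the copies out of sync, in which
    # case the wal-index contents should not be trusted as-is.
    return first == second


# Illustrative 8-byte sub headers: identical copies compare equal.
raw = b"\x01\x02\x03\x04\x05\x06\x07\x08" * 2
print(sub_header_copies_match(raw, 8))   # True
```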
Instead of having two + separate sub header variables, it was decided to do an array for the two since it is similarly + implemented like this in the sqlite c code. + + """ + + self.sub_headers = [] + + for sub_header_index in range(WAL_INDEX_NUMBER_OF_SUB_HEADERS): + start_offset = sub_header_index * WAL_INDEX_SUB_HEADER_LENGTH + end_offset = start_offset + WAL_INDEX_SUB_HEADER_LENGTH + self.sub_headers.append(WriteAheadLogIndexSubHeader(sub_header_index, + wal_index_header_byte_array[start_offset:end_offset])) + + """ + + Note: Since both of the sub headers are the same, they should each have the same endianness as well as page + size and therefore it does not matter from which one we retrieve it from. + + """ + + # Set variables for this class for page size and endianness + self.page_size = self.sub_headers[0].page_size + self.endianness = self.sub_headers[0].endianness + + checkpoint_start_offset = WAL_INDEX_NUMBER_OF_SUB_HEADERS * WAL_INDEX_SUB_HEADER_LENGTH + checkpoint_end_offset = checkpoint_start_offset + WAL_INDEX_CHECKPOINT_INFO_LENGTH + wal_index_checkpoint_info_byte_array = wal_index_header_byte_array[checkpoint_start_offset: + checkpoint_end_offset] + self.checkpoint_info = WriteAheadLogIndexCheckpointInfo(wal_index_checkpoint_info_byte_array, self.endianness) + + lock_reserved_start_offset = checkpoint_start_offset + WAL_INDEX_CHECKPOINT_INFO_LENGTH + lock_reserved_end_offset = lock_reserved_start_offset + WAL_INDEX_LOCK_RESERVED_LENGTH + self.lock_reserved = wal_index_header_byte_array[lock_reserved_start_offset:lock_reserved_end_offset] + + self.md5_hex_digest = get_md5_hash(wal_index_header_byte_array) + + def stringify(self, padding=""): + string = padding + "Page Size: {}\n" \ + + padding + "MD5 Hex Digest: {}" + string = string.format(self.page_size, + self.md5_hex_digest) + for sub_header_index in range(len(self.sub_headers)): + string += "\n" + padding + "Sub Header:\n{}" + string = string.format(self.sub_headers[sub_header_index].stringify(padding + "\t")) + string += "\n" + padding + "Checkpoint Info:\n{}".format(self.checkpoint_info.stringify(padding + "\t")) + string += "\n" + padding + "Lock Reserved (Hex): {}".format(hexlify(self.lock_reserved)) + return string + + +class WriteAheadLogIndexSubHeader(SQLiteHeader): + + def __init__(self, index, wal_index_sub_header_byte_array): + + super(WriteAheadLogIndexSubHeader, self).__init__() + + logger = getLogger(LOGGER_NAME) + + if index < 0 or index > WAL_INDEX_NUMBER_OF_SUB_HEADERS: + log_message = "Invalid wal index sub header index: {}.".format(index) + logger.error(log_message) + raise ValueError(log_message) + + self.index = index + + if len(wal_index_sub_header_byte_array) != WAL_INDEX_SUB_HEADER_LENGTH: + log_message = "The wal index sub header byte array of size: {} is not the expected size of: {}." + log_message = log_message.format(len(wal_index_sub_header_byte_array), WAL_INDEX_SUB_HEADER_LENGTH) + logger.error(log_message) + raise ValueError(log_message) + + self.endianness = ENDIANNESS.LITTLE_ENDIAN + + # Retrieve the file format version in little endian + self.file_format_version = unpack(b"I", wal_index_sub_header_byte_array[0:4])[0] + + if self.file_format_version != WAL_INDEX_FILE_FORMAT_VERSION: + + log_message = "The file format version is invalid" + logger.error(log_message) + raise HeaderParsingError(log_message) + + else: + + self.endianness = ENDIANNESS.BIG_ENDIAN + + log_message = "The wal index file is in big endian which is currently not supported." 
+ logger.error(log_message) + raise NotImplementedError(log_message) + + self.unused_padding_field = unpack(b" {}: page version {} at offset {} with {} cells" + string = string.format(b_tree_root_page.number, version_interface.get_page_version(b_tree_root_page.number), + b_tree_root_page.offset, len(b_tree_root_page.cells)) + elif isinstance(b_tree_root_page, IndexLeafPage): + string += "\n" + padding + "B-Tree Index Leaf Page -> {}: page version {} at offset {} with {} cells" + string = string.format(b_tree_root_page.number, version_interface.get_page_version(b_tree_root_page.number), + b_tree_root_page.offset, len(b_tree_root_page.cells)) + elif isinstance(b_tree_root_page, TableInteriorPage): + string += "\n" + padding + "B-Tree Table Interior Page -> {}: page version {} at offset {} with {} cells" + string = string.format(b_tree_root_page.number, version_interface.get_page_version(b_tree_root_page.number), + b_tree_root_page.offset, len(b_tree_root_page.cells)) + string += stringify_b_tree(version_interface, b_tree_root_page.right_most_page, padding + "\t") + for b_tree_interior_cell in b_tree_root_page.cells: + string += stringify_b_tree(version_interface, b_tree_interior_cell.left_child_page, padding + "\t") + elif isinstance(b_tree_root_page, IndexInteriorPage): + string += "\n" + padding + "B-Tree Index Interior Page -> {}: page version {} at offset {} with {} cells" + string = string.format(b_tree_root_page.number, version_interface.get_page_version(b_tree_root_page.number), + b_tree_root_page.offset, len(b_tree_root_page.cells)) + string += stringify_b_tree(version_interface, b_tree_root_page.right_most_page, padding + "\t") + for b_tree_interior_cell in b_tree_root_page.cells: + string += stringify_b_tree(version_interface, b_tree_interior_cell.left_child_page, padding + "\t") + else: + log_message = "The b-tree root page is not a b-tree root page type but instead: {} in version: {}." 
+ log_message = log_message.format(b_tree_root_page.page_type, version_interface.number) + getLogger(LOGGER_NAME).error(log_message) + raise ValueError(log_message) + + if not isinstance(b_tree_root_page, TableInteriorPage): + for cell in b_tree_root_page.cells: + if cell.has_overflow: + overflow_padding = padding + overflow_page = cell.overflow_pages[cell.overflow_page_number] + overflow_padding += "\t" + string += "\n" + overflow_padding + "Overflow Page -> {}: page version {} at offset {}" + string = string.format(overflow_page.number, version_interface.get_page_version(overflow_page.number), + overflow_page.offset) + while overflow_page.next_overflow_page_number: + overflow_page = cell.overflow_pages[overflow_page.next_overflow_page_number] + overflow_padding += "\t" + string += "\n" + overflow_padding + "Overflow Page -> {}: page version {} at offset {}" + string = string.format(overflow_page.number, + version_interface.get_page_version(overflow_page.number), + overflow_page.offset) + + return string + + +def stringify_cell_record(cell, database_text_encoding, page_type): + if page_type == PAGE_TYPE.B_TREE_TABLE_LEAF: + + column_values = [] + for record_column in cell.payload.record_columns: + text_affinity = True if record_column.serial_type >= 13 and record_column.serial_type % 2 == 1 else False + value = record_column.value + if record_column.value: + if text_affinity: + column_values.append(value.decode(database_text_encoding, "replace").encode(UTF_8)) + else: + column_values.append(str(value)) + else: + column_values.append("NULL") + content = "(" + ", ".join(column_values) + ")" + return "#{}: {}".format(cell.row_id, content) + + elif page_type == PAGE_TYPE.B_TREE_INDEX_LEAF: + + column_values = [] + for record_column in cell.payload.record_columns: + text_affinity = True if record_column.serial_type >= 13 and record_column.serial_type % 2 == 1 else False + value = record_column.value + if record_column.value: + if text_affinity: + column_values.append(value.decode(database_text_encoding, "replace").encode(UTF_8)) + else: + column_values.append(str(value)) + else: + column_values.append("NULL") + content = "(" + ", ".join(column_values) + ")" + return content + + else: + log_message = "Invalid page type specified for stringify cell record: {}. Page type should " \ + "be either {} or {}." 
+ log_message = log_message.format(page_type, PAGE_TYPE.B_TREE_TABLE_LEAF, PAGE_TYPE.B_TREE_INDEX_LEAF) + getLogger(LOGGER_NAME).error(log_message) + raise ValueError(log_message) + + +def stringify_cell_records(cells, database_text_encoding, page_type): + cell_records = set() + for cell in cells: + cell_records.add(stringify_cell_record(cell, database_text_encoding, page_type)) + return cell_records + + +def stringify_master_schema_version(version): + + string = "" + + for master_schema_entry in version.master_schema.master_schema_entries: + + entry_string = "Version: {} Added Master Schema Entry: Root Page Number: {} Type: {} Name: {} " \ + "Table Name: {} SQL: {}.\n" + entry_string = entry_string.format(version.version_number, master_schema_entry.root_page_number, + master_schema_entry.row_type, master_schema_entry.name, + master_schema_entry.table_name, master_schema_entry.sql) + string += entry_string + + return string + + +def stringify_master_schema_versions(version_history): + + string = "" + + master_schema_entries = {} + + for version_number, version in version_history.versions.iteritems(): + + if version.master_schema_modified: + + modified_master_schema_entries = dict(map(lambda x: [x.md5_hash_identifier, x], + version.master_schema.master_schema_entries)) + + for md5_hash_identifier, master_schema_entry in modified_master_schema_entries.iteritems(): + + if md5_hash_identifier not in master_schema_entries: + + added_string = "Version: {} Added Master Schema Entry: Root Page Number: {} Type: {} Name: {} " \ + "Table Name: {} SQL: {}.\n" + added_string = added_string.format(version_number, master_schema_entry.root_page_number, + master_schema_entry.row_type, master_schema_entry.name, + master_schema_entry.table_name, master_schema_entry.sql) + string += added_string + + master_schema_entries[md5_hash_identifier] = master_schema_entry + + elif master_schema_entry.root_page_number != master_schema_entries[ + md5_hash_identifier].root_page_number: + + previous_root_page_number = master_schema_entries[md5_hash_identifier].root_page_number + + updated_string = "Version: {} Updated Master Schema Entry: Root Page Number From: {} To: {} " \ + "Type: {} Name: {} Table Name: {} SQL: {}.\n" + updated_string = updated_string.format(version_number, previous_root_page_number, + master_schema_entry.root_page_number, + master_schema_entry.row_type, master_schema_entry.name, + master_schema_entry.table_name, master_schema_entry.sql) + string += updated_string + + master_schema_entries[md5_hash_identifier] = master_schema_entry + + for md5_hash_identifier, master_schema_entry in master_schema_entries.iteritems(): + + if md5_hash_identifier not in modified_master_schema_entries: + + removed_string = "Version: {} Removed Master Schema Entry: Root Page Number: {} Type: {} " \ + "Name: {} Table Name: {} SQL: {}.\n" + removed_string = removed_string.format(version_number, master_schema_entry.root_page_number, + master_schema_entry.row_type, master_schema_entry.name, + master_schema_entry.table_name, master_schema_entry.sql) + string += removed_string + + return string + + +def stringify_page_history(version_history, padding=""): + string = "" + for version_number in version_history.versions: + string += "\n" if string else "" + string += stringify_version_pages(version_history.versions[version_number], padding) + return string + + +def stringify_page_information(version, padding=""): + string = padding + "Page Breakdown:" + for page_type, page_array in get_page_breakdown(version.pages).iteritems(): + 
page_array_length = len(page_array) + string += "\n" + padding + "\t" + "{}: {} Page Numbers: {}" + string = string.format(page_type, page_array_length, page_array) + string += "\n" + padding + "Page Structure:\n{}".format(stringify_page_structure(version, padding + "\t")) + if version.pointer_map_pages: + string += "\n" + padding + "Pointer Map Entry Breakdown across {} Pages:".format(version.database_size_in_pages) + for pointer_map_entry_breakdown in get_pointer_map_entries_breakdown(version): + string += "\n" + padding + "\t" + "Pointer Map Page {}: Page {} -> {} ({}) had Pointer Page Type (Hex) {}" + string = string.format(pointer_map_entry_breakdown[0], pointer_map_entry_breakdown[1], + pointer_map_entry_breakdown[2], pointer_map_entry_breakdown[3], + pointer_map_entry_breakdown[4]) + return string + + +def stringify_page_structure(version, padding=""): + + string = padding + "{} Pages of {} bytes".format(version.database_size_in_pages, version.page_size) + + string += "\n" + padding + "Database Root Page:" + string += stringify_b_tree(version, version.root_page, padding + "\t") + + pointer_map_pages = version.pointer_map_pages + if pointer_map_pages: + for pointer_map_page in pointer_map_pages: + string += "\n" + padding + "Pointer Map Page -> {}".format(pointer_map_page.number) + + freelist_trunk_page = version.first_freelist_trunk_page + if freelist_trunk_page: + string += "\n" + padding + "Freelist Trunk Page -> {}".format(freelist_trunk_page.number) + freelist_padding = padding + "\t" + for freelist_leaf_page in freelist_trunk_page.freelist_leaf_pages: + string += "\n" + freelist_padding + "Freelist Leaf Page -> {}".format(freelist_leaf_page.number) + while freelist_trunk_page.next_freelist_trunk_page: + freelist_trunk_page = freelist_trunk_page.next_freelist_trunk_page + string += "\n" + freelist_padding + "Freelist Trunk Page -> {}".format(freelist_trunk_page.number) + freelist_padding += "\t" + for freelist_leaf_page in freelist_trunk_page.freelist_leaf_pages: + string += "\n" + freelist_padding + "Freelist Leaf Page -> {}".format(freelist_leaf_page.number) + + if version.master_schema: + string += "\n" + padding + "Master Schema Root Pages:" + for master_schema_root_page_number in version.master_schema.master_schema_b_tree_root_page_numbers: + master_schema_root_page = version.get_b_tree_root_page(master_schema_root_page_number) + string += stringify_b_tree(version, master_schema_root_page, padding + "\t") + + return string + + +def stringify_unallocated_space(version, padding="", include_empty_space=True): + string = "" + calculated_total_fragmented_bytes = 0 + for page_number, page in version.pages.iteritems(): + + unallocated_content = page.unallocated_content + if len(unallocated_content): + if (not include_empty_space and has_content(unallocated_content)) or include_empty_space: + string += "\n" if string else "" + string += padding + "Page #{}: {} Page Unallocated Space Start Offset: {} " \ + "End Offset: {} Size: {} Hex: [{}]" + string = string.format(page_number, page.page_type, page.unallocated_space_start_offset, + page.unallocated_space_end_offset, page.unallocated_space_length, + hexlify(page.unallocated_content)) + + if isinstance(page, BTreePage): + for freeblock in page.freeblocks: + freeblock_content = freeblock.content + if len(freeblock_content) and has_content(freeblock_content): + string += "\n" if string else "" + string += padding + "Page #{}: {} Page Freeblock #{}: Unallocated Space Start Offset: {} " \ + "End Offset: {} Size: {} Hex: [{}]" + string = 
string.format(page_number, page.page_type, freeblock.index, freeblock.start_offset, + freeblock.end_offset, freeblock.content_length, + hexlify(freeblock_content)) + + for fragment in page.fragments: + fragment_content = fragment.content + if fragment_content and has_content(fragment_content): + string += "\n" if string else "" + string += padding + "Page #{}: {} Page Fragment #{}: Unallocated Space Start Offset: {} " \ + "End Offset: {} Size: {} Hex: [{}]" + string = string.format(page_number, page.page_type, fragment.index, fragment.start_offset, + fragment.end_offset, fragment.byte_size, hexlify(fragment_content)) + calculated_total_fragmented_bytes += page.header.number_of_fragmented_free_bytes + + string += "\n" if string else "" + string += padding + "Calculated Total Fragmented Bytes: {}".format(calculated_total_fragmented_bytes) + return string + + +def stringify_version_pages(version, padding=""): + string = padding + "Version {} with {} of {} Pages: {}".format(version.version_number, + len(version.updated_page_numbers), + version.database_size_in_pages, + version.updated_page_numbers) + + page_versions = {} + for page_number, page_version_number in version.page_version_index.iteritems(): + if page_version_number in page_versions: + page_versions[page_version_number] = page_versions[page_version_number] + ", " + str(page_number) + else: + page_versions[page_version_number] = str(page_number) + + for version_number in reversed(range(version.version_number + 1)): + page_version_string = "\n" + padding + "\t" + "Version: {} has Pages: {}" + if version_number in page_versions: + string += page_version_string.format(version_number, page_versions[version_number]) + else: + string += page_version_string.format(version_number, str()) + return string diff --git a/sqlite_dissect/utilities.py b/sqlite_dissect/utilities.py new file mode 100644 index 0000000..5a8f2ca --- /dev/null +++ b/sqlite_dissect/utilities.py @@ -0,0 +1,248 @@ +from binascii import hexlify +from hashlib import md5 +from logging import getLogger +from re import compile +from struct import pack +from struct import unpack +from sqlite_dissect.constants import ALL_ZEROS_REGEX +from sqlite_dissect.constants import LOGGER_NAME +from sqlite_dissect.constants import OVERFLOW_HEADER_LENGTH +from sqlite_dissect.constants import BLOB_SIGNATURE_IDENTIFIER +from sqlite_dissect.constants import STORAGE_CLASS +from sqlite_dissect.constants import TEXT_SIGNATURE_IDENTIFIER +from sqlite_dissect.exception import InvalidVarIntError + +""" + +utilities.py + +This script holds general utility functions for reference by the sqlite carving library. 
+ +This script holds the following function(s): +calculate_expected_overflow(overflow_byte_size, page_size) +decode_varint(byte_array, offset) +encode_varint(value) +get_class_instance(class_name) +get_md5_hash(string) +get_record_content(serial_type, record_body, offset=0) +get_serial_type_signature(serial_type) +get_storage_class(serial_type) +has_content(byte_array) + +""" + + +def calculate_expected_overflow(overflow_byte_size, page_size): + + overflow_pages = 0 + last_overflow_page_content_size = overflow_byte_size + + if overflow_byte_size > 0: + while overflow_byte_size > 0: + overflow_pages += 1 + last_overflow_page_content_size = overflow_byte_size + overflow_byte_size = overflow_byte_size - page_size + OVERFLOW_HEADER_LENGTH + + return overflow_pages, last_overflow_page_content_size + + +def decode_varint(byte_array, offset=0): + + unsigned_integer_value = 0 + varint_relative_offset = 0 + + for x in xrange(1, 10): + + varint_byte = ord(byte_array[offset + varint_relative_offset:offset + varint_relative_offset + 1]) + varint_relative_offset += 1 + + if x == 9: + unsigned_integer_value <<= 1 + unsigned_integer_value |= varint_byte + else: + msb_set = varint_byte & 0x80 + varint_byte &= 0x7f + unsigned_integer_value |= varint_byte + if msb_set == 0: + break + else: + unsigned_integer_value <<= 7 + + signed_integer_value = unsigned_integer_value + if signed_integer_value & 0x80000000 << 32: + signed_integer_value -= 0x10000000000000000 + + return signed_integer_value, varint_relative_offset + + +def encode_varint(value): + + max_allowed = 0x7fffffffffffffff + min_allowed = (max_allowed + 1) - 0x10000000000000000 + if value > max_allowed or value < min_allowed: + log_message = "The value: {} is not able to be cast into a 64 bit signed integer for encoding." + log_message = log_message.format(value) + getLogger(LOGGER_NAME).error(log_message) + raise InvalidVarIntError(log_message) + + byte_array = bytearray() + + value += 1 << 64 if value < 0 else 0 + + if value & 0xff000000 << 32: + + byte = value & 0xff + byte_array.insert(0, pack("B", byte)) + value >>= 8 + + for _ in xrange(8): + byte_array.insert(0, pack("B", (value & 0x7f) | 0x80)) + value >>= 7 + + else: + + while value: + byte_array.insert(0, pack("B", (value & 0x7f) | 0x80)) + value >>= 7 + + if len(byte_array) >= 9: + log_message = "The value: {} produced a varint with a byte array of length: {} beyond the 9 bytes " \ + "allowed for a varint." + log_message = log_message.format(value, len(byte_array)) + getLogger(LOGGER_NAME).error(log_message) + raise InvalidVarIntError(log_message) + + byte_array[-1] &= 0x7f + + return byte_array + + +def get_class_instance(class_name): + if class_name.find(".") != -1: + path_array = class_name.split(".") + module = ".".join(path_array[:-1]) + instance = __import__(module) + for section in path_array[1:]: + instance = getattr(instance, section) + return instance + else: + log_message = "Class name: {} did not specify needed modules in order to initialize correctly." 
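The decode_varint and encode_varint routines above implement SQLite's big-endian base-128 varints: seven payload bits per byte, most significant group first, at most nine bytes, with the ninth byte contributing a full eight bits. A plain reimplementation with a few hand-checked values, for illustration only (the twos-complement sign conversion is omitted):

```python
def decode_varint_unsigned(byte_array, offset=0):
    """Simplified varint decoder for illustration; unlike decode_varint above,
    the signed twos-complement conversion is left out."""
    value = 0
    for i in range(9):
        byte = byte_array[offset + i]
        if isinstance(byte, str):      # Python 2 indexing yields one-character strings
            byte = ord(byte)
        if i == 8:
            # The ninth byte contributes all eight of its bits.
            return (value << 8) | byte, 9
        value = (value << 7) | (byte & 0x7f)
        if not byte & 0x80:
            return value, i + 1


print(decode_varint_unsigned(b"\x7f"))       # (127, 1)
print(decode_varint_unsigned(b"\x81\x00"))   # (128, 2)
print(decode_varint_unsigned(b"\x82\x24"))   # (292, 2)
```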
+ log_message = log_message.format(log_message) + getLogger(LOGGER_NAME).error(log_message) + raise ValueError(log_message) + + +def get_md5_hash(string): + md5_hash = md5() + md5_hash.update(string) + return md5_hash.hexdigest().upper() + + +def get_record_content(serial_type, record_body, offset=0): + + # NULL + if serial_type == 0: + content_size = 0 + value = None + + # 8-bit twos-complement integer + elif serial_type == 1: + content_size = 1 + value = unpack(b">b", record_body[offset:offset + content_size])[0] + + # Big-endian 16-bit twos-complement integer + elif serial_type == 2: + content_size = 2 + value = unpack(b">h", record_body[offset:offset + content_size])[0] + + # Big-endian 24-bit twos-complement integer + elif serial_type == 3: + content_size = 3 + value_byte_array = '\0' + record_body[offset:offset + content_size] + value = unpack(b">I", value_byte_array)[0] + if value & 0x800000: + value -= 0x1000000 + + # Big-endian 32-bit twos-complement integer + elif serial_type == 4: + content_size = 4 + value = unpack(b">i", record_body[offset:offset + content_size])[0] + + # Big-endian 48-bit twos-complement integer + elif serial_type == 5: + content_size = 6 + value_byte_array = '\0' + '\0' + record_body[offset:offset + content_size] + value = unpack(b">Q", value_byte_array)[0] + if value & 0x800000000000: + value -= 0x1000000000000 + + # Big-endian 64-bit twos-complement integer + elif serial_type == 6: + content_size = 8 + value = unpack(b">q", record_body[offset:offset + content_size])[0] + + # Big-endian IEEE 754-2008 64-bit floating point number + elif serial_type == 7: + content_size = 8 + value = unpack(b">d", record_body[offset:offset + content_size])[0] + + # Integer constant 0 (schema format == 4) + elif serial_type == 8: + content_size = 0 + value = 0 + + # Integer constant 1 (schema format == 4) + elif serial_type == 9: + content_size = 0 + value = 1 + + # These values are not used/reserved and should not be found in sqlite files + elif serial_type == 10 or serial_type == 11: + raise Exception() + + # A BLOB that is (N-12)/2 bytes in length + elif serial_type >= 12 and serial_type % 2 == 0: + content_size = (serial_type - 12) / 2 + value = record_body[offset:offset + content_size] + + # A string in the database encoding and is (N-13)/2 bytes in length. The nul terminator is omitted + elif serial_type >= 13 and serial_type % 2 == 1: + content_size = (serial_type - 13) / 2 + value = record_body[offset:offset + content_size] + + else: + log_message = "Invalid serial type: {} at offset: {} in record body: {}." 
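The serial-type handling in get_record_content above mirrors the record format table in the SQLite documentation: fixed sizes for integers and floats, and lengths derived from the serial type for BLOB and text values. A few illustrative calls with expected results; the library targets Python 2, so these assume a Python 2 interpreter, and the sample byte strings are arbitrary:

```python
from struct import pack
from sqlite_dissect.utilities import get_record_content

print(get_record_content(1, b"\x7f"))             # (1, 127)    8-bit twos-complement integer
print(get_record_content(7, pack(b">d", 3.5)))    # (8, 3.5)    64-bit IEEE 754 float
print(get_record_content(19, b"abc"))             # (3, 'abc')  text, (19 - 13) / 2 == 3 bytes
print(get_record_content(18, b"\x01\x02\x03"))    # blob, (18 - 12) / 2 == 3 bytes
```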
+ log_message = log_message.format(serial_type, offset, hexlify(record_body)) + getLogger(LOGGER_NAME).error(log_message) + raise ValueError(log_message) + + return content_size, value + + +def get_serial_type_signature(serial_type): + if serial_type >= 12: + if serial_type % 2 == 0: + return BLOB_SIGNATURE_IDENTIFIER + elif serial_type % 2 == 1: + return TEXT_SIGNATURE_IDENTIFIER + return serial_type + + +def get_storage_class(serial_type): + if serial_type == 0: + return STORAGE_CLASS.NULL + if serial_type in [1, 2, 3, 4, 5, 6, 8, 9]: + return STORAGE_CLASS.INTEGER + if serial_type == 7: + return STORAGE_CLASS.REAL + if serial_type >= 12 and serial_type % 2 == 0: + return STORAGE_CLASS.BLOB + if serial_type >= 13 and serial_type % 2 == 0: + return STORAGE_CLASS.TEXT + + +def has_content(byte_array): + pattern = compile(ALL_ZEROS_REGEX) + if pattern.match(hexlify(byte_array)): + return False + return True diff --git a/sqlite_dissect/version_history.py b/sqlite_dissect/version_history.py new file mode 100644 index 0000000..4d3336f --- /dev/null +++ b/sqlite_dissect/version_history.py @@ -0,0 +1,813 @@ +from logging import getLogger +from re import sub +from warnings import warn +from sqlite_dissect.carving.carver import SignatureCarver +from sqlite_dissect.constants import BASE_VERSION_NUMBER +from sqlite_dissect.constants import CELL_SOURCE +from sqlite_dissect.constants import COMMIT_RECORD_BASE_VERSION_NUMBER +from sqlite_dissect.constants import LOGGER_NAME +from sqlite_dissect.constants import MASTER_SCHEMA_ROW_TYPE +from sqlite_dissect.constants import PAGE_TYPE +from sqlite_dissect.exception import VersionParsingError +from sqlite_dissect.exception import WalCommitRecordParsingError +from sqlite_dissect.exception import WalFrameParsingError +from sqlite_dissect.file.database.page import BTreePage +from sqlite_dissect.file.database.utilities import aggregate_leaf_cells +from sqlite_dissect.file.database.utilities import get_pages_from_b_tree_page +from sqlite_dissect.file.schema.master import VirtualTableRow +from sqlite_dissect.file.wal.commit_record import WriteAheadLogCommitRecord +from sqlite_dissect.file.version_parser import VersionParser + +""" + +version_history.py + +This script holds the superclass objects used for parsing the database and write ahead log in a sequence of versions +throughout all of the commit records in the write ahead log. + +This script holds the following object(s): +VersionHistory(object) +VersionHistoryParser(VersionParser) (with VersionHistoryParserIterator(object) as an inner class) +Commit(object) + +""" + + +class VersionHistory(object): + + """ + + + + This class represents the SQL database and WAL commit records as a sequence of versions. This way the changes + from commit record to commit record can be viewed and worked with and each version has information in them that + lends to them being carved easier. Here version 0 (BASE_VERSION_NUMBER) is used to always represent the main + database and then 1 to N versions following the base version represent the commit records up to N. To note, + the final commit record, N, has the possibility of being half written and not committed depending if the + committed page size is set in one of the frames in the commit record or not. 
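Given that numbering scheme (version 0 is the main database file and versions 1 through N are the WAL commit records), a hedged usage sketch follows; the Database class import path and the file names are assumptions based on the package layout, not confirmed by this changeset:

```python
from sqlite_dissect.file.database.database import Database   # assumed import path for the database parser
from sqlite_dissect.file.wal.wal import WriteAheadLog
from sqlite_dissect.version_history import VersionHistory

# Hypothetical file names; any SQLite database with its -wal file alongside it.
database = Database("example.sqlite")
write_ahead_log = WriteAheadLog("example.sqlite-wal")

history = VersionHistory(database, write_ahead_log)

# Version 0 is the database itself; 1..N are the commit records found in the WAL.
print(history.number_of_versions)
for version_number in sorted(history.versions):
    print(version_number, type(history.versions[version_number]).__name__)
```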
+ + """ + + def __init__(self, database, write_ahead_log=None): + + logger = getLogger(LOGGER_NAME) + + # Set the database and write ahead log + self._database = database + self._write_ahead_log = write_ahead_log + + """ + + Initialize the versions in for them of: + versions[VERSION_NUMBER] = database where VERSION_NUMBER = BASE_VERSION_NUMBER (0) + versions[VERSION_NUMBER] = commit_record_VERSION_NUMBER where VERSION_NUMBER is 1 to N for N commit records. + + """ + + self.versions = {BASE_VERSION_NUMBER: self._database} + + if self._write_ahead_log: + + # Set the database text encoding to the write ahead log file if it was set in the database file + if self._database.database_text_encoding: + self._write_ahead_log.file_handle.database_text_encoding = self._database.database_text_encoding + + # Set the last database header and master schema to refer to + last_database_header = self._database.database_header + last_master_schema = self._database.master_schema + + # These two dictionaries will be updated and sent into every commit record + page_version_index = self._database.page_version_index + page_frame_index = {} + + # Setup variables for frame association with commit records + frames = [] + commit_record_number = COMMIT_RECORD_BASE_VERSION_NUMBER + + # Iterate through all of the frames in the write ahead log + for frame_index in range(len(self._write_ahead_log.frames)): + + # Set the frame + frame = self._write_ahead_log.frames[frame_index] + + # Make sure the frame index matches the frame + if frame_index != frame.frame_index: + log_message = "Current frame index: {} did not match the expected frame index: {} while parsing " \ + "frames for commit record version: {}." + log_message = log_message.format(frame_index, frame.frame_index, commit_record_number) + logger.error(log_message) + raise WalFrameParsingError(log_message) + + # Add the frame to the frames array + frames.append(frame) + + # Make sure the frame belongs to the commit record we are currently working on creating + if frame.commit_record_number != commit_record_number: + log_message = "Current frame commit record number: {} did not match the expected commit record " \ + "number : {}." + log_message = log_message.format(frame.commit_record_number, commit_record_number) + logger.error(log_message) + raise WalFrameParsingError(log_message) + + """ + + According to SQLite documentation, the frame with the page size after commit field in the header set + is the commit frame and therefore all frames before this one (up to the previous one) are considered + the commit record. No frames will appear beyond this frame with additional information in this commit + record. + + """ + + # Check if this frame is a commit frame + if frame.commit_frame: + + # Create the commit record since we now have all the frames for this commit record + commit_record = WriteAheadLogCommitRecord(commit_record_number, self._database, + self._write_ahead_log, frames, page_frame_index, + page_version_index, last_database_header, + last_master_schema, + store_in_memory=write_ahead_log.store_in_memory, + strict_format_checking=write_ahead_log. + strict_format_checking) + + if commit_record.database_header_modified: + last_database_header = commit_record.database_header + + if not last_database_header: + log_message = "Database header was detected as modified for commit record version: {} " \ + "but no database header was found." 
+ log_message = log_message.format(commit_record_number) + logger.error(log_message) + raise WalCommitRecordParsingError(log_message) + + if commit_record.master_schema_modified: + last_master_schema = commit_record.master_schema + + if not last_master_schema: + log_message = "Master schema was detected as modified for commit record version: {} " \ + "but no master schema was found." + log_message = log_message.format(commit_record_number) + logger.error(log_message) + raise WalCommitRecordParsingError(log_message) + + # Set the page version and page frame dictionaries variables for the next commit record + page_frame_index = commit_record.page_frame_index + page_version_index = commit_record.page_version_index + + self.versions[commit_record_number] = commit_record + + # Increment the commit record number and clear the frames array (reset to an empty array). + commit_record_number += 1 + frames = [] + + # Check if there are remaining frames which indicates the last commit record was not committed + if len(frames) > 0: + + # Create the commit record + commit_record = WriteAheadLogCommitRecord(commit_record_number, self._database, self._write_ahead_log, + frames, page_frame_index, page_version_index, + last_database_header, last_master_schema, + store_in_memory=write_ahead_log.store_in_memory, + strict_format_checking=write_ahead_log.strict_format_checking) + + """ + + Note: We do not need to worry about setting the last database header or last master schema here. We + also do not need to worry about setting the page frame index or page version index. + + """ + + self.versions[commit_record_number] = commit_record + + """ + + Since we have not seen use cases where the write ahead log file has had additional frames beyond the + last frame that was a commit frame, we throw a warning here since this use case could result in + adverse logic. + + """ + + log_message = "Version (commit record): {} has additional frames beyond the last commit frame found " \ + "in the write ahead log and erroneous use cases may occur when parsing." + log_message = log_message.format(commit_record_number) + logger.warn(log_message) + warn(log_message, RuntimeWarning) + + # Set the number of versions + self.number_of_versions = len(self.versions) + + def __repr__(self): + return self.__str__().encode("hex") + + def __str__(self): + return sub("\t", "", sub("\n", " ", self.stringify())) + + def stringify(self, padding="", print_versions=True): + string = "File Type: {}" + string = string.format(self.number_of_versions) + if print_versions: + for version in self.versions: + string += "\n" + padding + "Page:\n{}".format(version.stringify(padding + "\t")) + return string + + +class VersionHistoryParser(VersionParser): + + def __init__(self, version_history, master_schema_entry, + version_number=None, ending_version_number=None, signature=None, carve_freelist_pages=False): + + """ + + + + Note: The updated cells currently only apply to table leaf pages (table master schema entries that are not + "without rowid" tables). Therefore no index pages will have updated cells. This is due to the fact + that updates are determined off of the row id at the moment which is only available in the b-tree table + pages. However, it is important to note that even if the row id is the same and that cell is determined + to have been updated by this process, there is still a chance that it is not an offset and has error + in this assumption. Additional checking may need to be done into the file offsets and/or primary keys. 
+ (Although file offsets can change as well as page numbers on updates or vacuuming.) Investigation needs + to be done more into table pages as well as how to determine updates for index pages. + + Note: If there are duplicate entries found in consecutive versions (ie. entries that did not change), those + will be left out and only reported in the first version they are found ("added"). The first version + to be parsed, whether that be the base version or one of the commit records, will have all entries + considered "added" and no deleted or updated entries. + + :param version_history: + :param master_schema_entry: + :param version_number: + :param ending_version_number: + + :return: + + :raise: + + """ + + # Call to the super class + super(VersionHistoryParser, self).__init__(version_history, master_schema_entry, + version_number, ending_version_number) + + logger = getLogger(LOGGER_NAME) + + self._versions = version_history.versions + + log_message = "Creating version history parser for master schema entry with name: {} table name: {} " \ + "row type: {} and sql: {} for version number: {} and ending version number: {}." + log_message = log_message.format(self.name, self.table_name, self.row_type, self.sql, + self.parser_starting_version_number, self.parser_ending_version_number) + logger.debug(log_message) + + self._virtual_table = isinstance(master_schema_entry, VirtualTableRow) + + if signature: + + if signature.name != self.name: + log_message = "Invalid signature name: {} for version history parser on master schema entry name: {}." + log_message = log_message.format(signature.name, self.name) + logger.error(log_message) + raise ValueError(log_message) + + if signature.row_type != self.row_type: + log_message = "Invalid signature row type: {} for signature name: {} for version history parser on " \ + "master schema entry name: {} and row type: {}." + log_message = log_message.format(signature.row_type, signature.name, self.name, self.row_type) + logger.error(log_message) + raise ValueError(log_message) + + if signature.row_type != MASTER_SCHEMA_ROW_TYPE.TABLE: + log_message = "Not carving version history parser for master schema entry with name: {} table " \ + "name: {} row type: {} and sql: {} for version number: {} and ending version number: " \ + "{} since the row type is not a {} type but: {}." + log_message = log_message.format(self.name, self.table_name, self.row_type, self.sql, + self.parser_starting_version_number, self.parser_ending_version_number, + MASTER_SCHEMA_ROW_TYPE.TABLE, signature.row_type) + logger.warn(log_message) + warn(log_message, RuntimeWarning) + + # Set the signature + self.signature = signature + + self.carve_freelist_pages = carve_freelist_pages + + if self.carve_freelist_pages and not self.signature: + log_message = "Carve freelist pages set with no signature defined. A signatures is needed in order to " \ + "carve freelist pages for master schema entry with name: {} table name: {} row type: {} " \ + "and sql: {} for version number: {} and ending version number: {}." + log_message = log_message.format(self.name, self.table_name, self.row_type, self.sql, + self.parser_starting_version_number, self.parser_ending_version_number) + logger.error(log_message) + raise ValueError(log_message) + + def __iter__(self): + if self.row_type not in [MASTER_SCHEMA_ROW_TYPE.TABLE, MASTER_SCHEMA_ROW_TYPE.INDEX]: + # Return an empty iterator + return iter([]) + elif self._virtual_table: + + """ + + In the case this is a virtual table, we check to see if the root page is 0. 
Additional use cases + handling for virtual tables needs to be investigated. For now, if a virtual table exists with a + root page of 0 we do not iterate through it and return a StopIteration() since we do not have anything + to iterate. We do throw a warning here (again) for informative purposes. + + Note: If all root page numbers in the root page number version index are not 0, an exception is raised. + + """ + + # Check to make sure all root page numbers are 0 as should be with virtual tables. + if not all(root_page_number == 0 for root_page_number in self.root_page_number_version_index.values()): + log_message = "Virtual table found with root page version index: {} where all root page numbers " \ + "are not equal to 0 in version history parser for master schema entry with " \ + "name: {} table name: {} row type: {} and sql: {} for version number: {} " \ + "and ending version number: {}." + log_message = log_message.format(self.root_page_number_version_index, + self.name, self.table_name, self.row_type, self.sql, + self.parser_starting_version_number, + self.parser_ending_version_number) + getLogger(LOGGER_NAME).error(log_message) + raise ValueError(log_message) + + log_message = "Virtual table found with root page 0 for in version history parser for master schema " \ + "entry with name: {} table name: {} row type: {} and sql: {} for version number: {} " \ + "and ending version number: {}. An iterator will not be returned since there " \ + "is no content." + log_message = log_message.format(self.name, self.table_name, self.row_type, self.sql, + self.parser_starting_version_number, self.parser_ending_version_number) + getLogger(LOGGER_NAME).warn(log_message) + warn(log_message, RuntimeWarning) + + # Return an empty iterator + return iter([]) + + elif self.parser_starting_version_number is not None and self.parser_ending_version_number is not None: + return self.VersionParserIterator(self.name, self._versions, self.page_type, + self.parser_starting_version_number, self.parser_ending_version_number, + self.root_page_number_version_index, + self.signature, self.carve_freelist_pages) + else: + # Return an empty iterator + return iter([]) + + def stringify(self, padding="", print_cells=True): + string = "" + for commit in self: + string += "\n" + padding + "Commit:\n{}".format(commit.stringify(padding + "\t", print_cells)) + return super(VersionHistoryParser, self).stringify(padding) + string + + class VersionParserIterator(object): + + """ + + + + Note: See VersionHistoryParser class documentation regarding entries returned from this iterator + (specifically on updates). 
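+
+ Example (an illustrative sketch; `version_history_parser` is assumed to be a VersionHistoryParser
+ built over a table master schema entry, which produces this iterator when iterated):
+
+     for commit in version_history_parser:
+         print("version {}: {} added, {} deleted, {} updated, {} carved".format(
+             commit.version_number, len(commit.added_cells), len(commit.deleted_cells),
+             len(commit.updated_cells), len(commit.carved_cells)))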
+ + """ + + def __init__(self, name, versions, page_type, parser_starting_version_number, parser_ending_version_number, + root_page_number_version_index, signature=None, carve_freelist_pages=False): + self._name = name + self._versions = versions + self._page_type = page_type + self._parser_starting_version_number = parser_starting_version_number + self._parser_ending_version_number = parser_ending_version_number + self._root_page_number_version_index = root_page_number_version_index + + # Set the signature + self._signature = signature + + self._carve_freelist_pages = carve_freelist_pages + + # Initialize the current cells + self._current_cells = {} + + # Initialize the carved cell md5 hex digests + self._carved_cell_md5_hex_digests = [] + + # Initialize the current b-tree page numbers + self._current_b_tree_page_numbers = [] + + self._current_version_number = self._parser_starting_version_number + + def __iter__(self): + return self + + def __repr__(self): + return self.__str__().encode("hex") + + def __str__(self): + return sub("\t", "", sub("\n", " ", self.stringify())) + + def stringify(self, padding="", print_cells=True): + string = padding + "Page Type: {}\n" \ + + padding + "Parser Starting Version Number: {}\n" \ + + padding + "Parser Ending Version Number: {}\n" \ + + padding + "Root Page Number Version Index: {}\n" \ + + padding + "Current Version Number: {}\n" \ + + padding + "Current B-Tree Page Numbers: {}\n" \ + + padding + "Carve Freelist Pages: {}" + string = string.format(self._page_type, + self._parser_starting_version_number, + self._parser_ending_version_number, + self._current_version_number, + self._current_b_tree_page_numbers, + self._carve_freelist_pages) + if print_cells: + for current_cell in self._current_cells.itervalues(): + string += "\n" + padding + "Cell:\n{}".format(current_cell.stringify(padding + "\t")) + return string + + def next(self): + + if self._current_version_number <= self._parser_ending_version_number: + + version = self._versions[self._current_version_number] + root_page_number = self._root_page_number_version_index[self._current_version_number] + + # Create the commit object + commit = Commit(self._name, version.file_type, self._current_version_number, + version.database_text_encoding, self._page_type, root_page_number,self._current_b_tree_page_numbers) + + b_tree_updated = False + + # Check if this is the first version to be investigated + if self._current_version_number == self._parser_starting_version_number: + b_tree_updated = True + + # Check if the root page number changed + elif root_page_number != self._root_page_number_version_index[self._current_version_number - 1]: + b_tree_updated = True + + # Check if any of the pages changed (other than the root page specifically here) + elif [page_number for page_number in self._current_b_tree_page_numbers + if page_number in version.updated_b_tree_page_numbers]: + b_tree_updated = True + + # Parse the b-tree page structure if it was updated + if b_tree_updated: + + # Get the root page and root page numbers from the first version + root_page = version.get_b_tree_root_page(root_page_number) + b_tree_pages = get_pages_from_b_tree_page(root_page) + self._current_b_tree_page_numbers = [b_tree_page.number for b_tree_page in b_tree_pages] + + # Update the b-tree page numbers in the commit record + commit.b_tree_page_numbers = self._current_b_tree_page_numbers + + updated_b_tree_page_numbers = [page_number for page_number in self._current_b_tree_page_numbers + if page_number in 
version.updated_b_tree_page_numbers] + + # Set the updated b-tree page numbers in the commit object + commit.updated_b_tree_page_numbers = updated_b_tree_page_numbers + + """ + + Below we aggregate the cells together. This function returns the total of cells and then + a dictionary of cells indexed by their cell md5 hex digest to record. Here, we do not + want to ignore any entries since we want to be able to obtain those that were added along + with cells that were deleted and/or updated. Therefore, the total should match the length + of the cells returned. + + """ + + total, cells = aggregate_leaf_cells(root_page) + + if total != len(cells): + log_message = "The total aggregated leaf cells: {} does not match the length of the " \ + "cells parsed: {} for version: {} of page type: {} iterating between versions " \ + "{} and {} over b-tree page numbers: {} with updated b-tree pages: {}." + log_message = log_message.format(total, len(cells), self._current_version_number, + self._page_type, self._parser_starting_version_number, + self._parser_ending_version_number, + self._current_b_tree_page_numbers, + updated_b_tree_page_numbers) + getLogger(LOGGER_NAME).error(log_message) + raise VersionParsingError(log_message) + + """ + + Go through the cells and determine which cells have been added, deleted, and/or updated. + + """ + + # Copy the newly found cells to a new dictionary + added_cells = dict.copy(cells) + + # Initialize the deleted cells + deleted_cells = {} + + # Iterate through the current cells + for current_cell_md5, current_cell in self._current_cells.iteritems(): + + # Remove the cell from the added cells if it was already pre-existing + if current_cell_md5 in added_cells: + del added_cells[current_cell_md5] + + # The cell was in the previously current cells but now deleted + else: + deleted_cells[current_cell_md5] = current_cell + + # Set the current cells to this versions cells + self._current_cells = cells + + """ + + At this point we have the following two dictionaries: + added_cells: All of the cells that were found to be new in this version for this table/index. + deleted_cells: All of the cells that were found to be deleted in this version for this table/index. + + The current cells are set back to the cells for future version iterations to compare against. This + is set to the whole dictionary of cells and not the added cells since pre-existing cells can carry + over into consecutive versions. 
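+
+ As a concrete illustration (hypothetical, shortened md5 hex digests used purely for readability):
+
+     previously current cells: {"aa...": cell_1, "bb...": cell_2}
+     cells in this version:    {"bb...": cell_2, "cc...": cell_3}
+
+     added_cells   -> {"cc...": cell_3}    (present now, not present before)
+     deleted_cells -> {"aa...": cell_1}    (present before, not present now)
+
+ and cell_2 is not reported again since it was already reported in the version where it first appeared.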
+ + """ + + if self._page_type == PAGE_TYPE.B_TREE_TABLE_LEAF: + + # Organize a added cells dictionary keyed off of row id + added_cells_by_row_id = {added_cell.row_id: added_cell for added_cell in added_cells.values()} + + # Get the row ids of the cells that were updated by checking against the deleted cells + updated_cell_row_ids = [deleted_cell.row_id for deleted_cell in deleted_cells.values() + if deleted_cell.row_id in added_cells_by_row_id] + + # Get the cells that might possibly have been updated by comparing the row ids + updated_cells = {updated_cell.md5_hex_digest: updated_cell + for updated_cell in added_cells.values() + if updated_cell.row_id in updated_cell_row_ids} + + # Update the deleted cells to remove any possibly updated cells just determined + deleted_cells = {deleted_cell.md5_hex_digest: deleted_cell + for deleted_cell in deleted_cells.values() + if deleted_cell.row_id not in updated_cell_row_ids} + + # Before we can set the added cells, we need to remove the updated cells detected above + added_cells = {added_cell.md5_hex_digest: added_cell + for added_cell in added_cells.values() + if added_cell.md5_hex_digest not in updated_cells} + + # Set the added, updated, and deleted cells + commit.added_cells = added_cells + commit.updated_cells = updated_cells + commit.deleted_cells = deleted_cells + + """ + + Right now we only carve if the signature is specified and only from pages that were updated in + this particular b-tree in this version. + + Note: Once index page carving is implemented this section will need to be updated to correctly + address it. + + """ + + if self._signature: + log_message = "Carving table master schema entry name: {} for page type: {} for version: " \ + "{} with root page: {} between versions {} and {} over b-tree page " \ + "numbers: {} with updated b-tree pages: {}." 
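+
+ # Background sketch (hedged, summarizing the SQLite file format rather than code in this
+ # project): a freeblock is a run of unused bytes inside a b-tree page's cell content area,
+ # typically left behind when a cell is deleted. Only its first four bytes are overwritten
+ # with a small header, which is why deleted record data can often still be carved from the
+ # remainder:
+ #
+ #     next_freeblock_offset = struct.unpack(">H", page[offset:offset + 2])[0]
+ #     freeblock_size = struct.unpack(">H", page[offset + 2:offset + 4])[0]
+ #     recoverable_bytes = page[offset + 4:offset + freeblock_size]
+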
+ log_message = log_message.format(self._signature.name, self._page_type, + self._current_version_number, + root_page_number, self._parser_starting_version_number, + self._parser_ending_version_number, + self._current_b_tree_page_numbers, + updated_b_tree_page_numbers) + getLogger(LOGGER_NAME).debug(log_message) + + # Initialize the carved cells + carved_cells = [] + + b_tree_pages_by_number = {b_tree_page.number: b_tree_page for b_tree_page in b_tree_pages} + + for updated_b_tree_page_number in updated_b_tree_page_numbers: + + page = b_tree_pages_by_number[updated_b_tree_page_number] + + # For carving freeblocks make sure the page is a b-tree page and not overflow + if isinstance(page, BTreePage): + carvings = SignatureCarver.carve_freeblocks(version, CELL_SOURCE.B_TREE, + page.freeblocks, self._signature) + carved_cells.extend(carvings) + + # Carve unallocated space + carvings = SignatureCarver.carve_unallocated_space(version, CELL_SOURCE.B_TREE, + updated_b_tree_page_number, + page.unallocated_space_start_offset, + page.unallocated_space, + self._signature) + carved_cells.extend(carvings) + + # Remove all carved cells that may be duplicates from previous version carvings + carved_cells = {carved_cell.md5_hex_digest: carved_cell for carved_cell in carved_cells + if carved_cell.md5_hex_digest not in self._carved_cell_md5_hex_digests} + + # Update the carved cells in the commit object + commit.carved_cells.update(carved_cells) + + # Update the carved cell md5 hex digests + self._carved_cell_md5_hex_digests.extend([cell_md5_hex_digest + for cell_md5_hex_digest in carved_cells.keys()]) + + elif self._page_type == PAGE_TYPE.B_TREE_INDEX_LEAF: + + # Set the added cells + commit.added_cells = added_cells + + # As noted above, we will not define updates for index cells yet so just set the deleted cells + commit.deleted_cells = deleted_cells + + else: + log_message = "Invalid page type: {} found for version: {} iterating between versions {} " \ + "and {} over b-tree page numbers: {} with updated b-tree pages: {}." + log_message = log_message.format(self._page_type, self._current_version_number, + self._parser_starting_version_number, + self._parser_ending_version_number, + self._current_b_tree_page_numbers, + updated_b_tree_page_numbers) + getLogger(LOGGER_NAME).error(log_message) + raise VersionParsingError(log_message) + + """ + + Note: The outer class checks on if the signature is defined in relation to the carving of freelist + pages being set and handles it accordingly. Here we can assume that the signature is defined + if we are carving freelist pages. + + """ + + # See if we are also + if self._carve_freelist_pages: + + freelist_pages_updated = False + + # Check if this is the first version to be investigated + if self._current_version_number == self._parser_starting_version_number: + freelist_pages_updated = True + + # Check if the freelist pages were modified in this version + elif version.freelist_pages_modified: + freelist_pages_updated = True + + # Carve the freelist pages if any were updated + if freelist_pages_updated: + + """ + + Note: We only have to worry about caring the B_TREE_TABLE_LEAF pages right now since this is + the only page really supported in carving so far. The super class already prints the + needed warnings that carving will not occur if it is an B_TREE_INDEX_LEAF page. + + Note: As also stated above the signature by this point will be set. 
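+
+ Background (a hedged summary of the SQLite file format, for context on the loop below): freelist
+ pages hold pages that are no longer in active use and are chained from the first freelist trunk
+ page recorded in the database header. Each trunk page begins with a 4 byte big-endian page number
+ of the next trunk page (0 if none) and a 4 byte count of leaf page numbers, followed by that many
+ 4 byte leaf page numbers. The loop below simply follows first_freelist_trunk_page and its
+ freelist_leaf_pages through that chain, collecting any page updated in this version so that its
+ unallocated space can be carved.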
+ + """ + + if self._page_type == PAGE_TYPE.B_TREE_TABLE_LEAF: + + # Populate the updated freelist pages into a dictionary keyed by page number + updated_freelist_pages = {} + freelist_trunk_page = version.first_freelist_trunk_page + while freelist_trunk_page: + if freelist_trunk_page.number in version.updated_page_numbers: + updated_freelist_pages[freelist_trunk_page.number] = freelist_trunk_page + for freelist_leaf_page in freelist_trunk_page.freelist_leaf_pages: + if freelist_leaf_page.number in version.updated_page_numbers: + updated_freelist_pages[freelist_leaf_page.number] = freelist_leaf_page + freelist_trunk_page = freelist_trunk_page.next_freelist_trunk_page + + # Update the commit object + commit.freelist_pages_carved = True + commit.updated_freelist_page_numbers = updated_freelist_pages.keys() + + log_message = "Carving freelist pages for table master schema entry name: {} " \ + "for page type: {} for version: {} with root page: {} between versions {} " \ + "and {} over updated freelist pages: {}." + log_message = log_message.format(self._signature.name, self._page_type, + self._current_version_number, + root_page_number, self._parser_starting_version_number, + self._parser_ending_version_number, + updated_freelist_pages.keys()) + getLogger(LOGGER_NAME).debug(log_message) + + # Initialize the carved cells + carved_cells = [] + + for freelist_page_number, freelist_page in updated_freelist_pages.iteritems(): + + # Carve unallocated space + carvings = SignatureCarver.carve_unallocated_space(version, CELL_SOURCE.FREELIST, + freelist_page_number, + freelist_page. + unallocated_space_start_offset, + freelist_page.unallocated_space, + self._signature) + + carved_cells.extend(carvings) + + # Remove all carved cells that may be duplicates from previous version carvings + carved_cells = {carved_cell.md5_hex_digest: carved_cell for carved_cell in carved_cells + if carved_cell.md5_hex_digest not in self._carved_cell_md5_hex_digests} + + # Update the carved cells in the commit object + commit.carved_cells.update(carved_cells) + + # Update the carved cell md5 hex digests + self._carved_cell_md5_hex_digests.extend([cell_md5_hex_digest + for cell_md5_hex_digest in carved_cells.keys()]) + + # Increment the current version number + self._current_version_number += 1 + + # Return the commit object + return commit + + else: + raise StopIteration() + + +class Commit(object): + + def __init__(self, name, file_type, version_number, database_text_encoding, page_type, root_page_number, + b_tree_page_numbers, updated_b_tree_page_numbers=None, freelist_pages_carved=False, + updated_freelist_page_numbers=None): + + """ + + + + Note: This may not be updated in the case where carved cells were found, but found to be duplicates of a + previous commit and therefore removed. 
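+
+ Example (an illustrative sketch; `commit` is assumed to be a Commit object yielded while
+ iterating a VersionHistoryParser):
+
+     if commit.updated:
+         for md5_hex_digest, cell in commit.added_cells.items():
+             pass  # cell added in this version
+         for md5_hex_digest, cell in commit.carved_cells.items():
+             pass  # cell recovered through carving in this version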
+ + :param name: + :param file_type: + :param version_number: + :param database_text_encoding: + :param page_type: + :param root_page_number: + :param b_tree_page_numbers: + :param updated_b_tree_page_numbers: + :param freelist_pages_carved: + :param updated_freelist_page_numbers: + + :return: + + """ + + self.name = name + self.file_type = file_type + self.version_number = version_number + self.database_text_encoding = database_text_encoding + self.page_type = page_type + self.root_page_number = root_page_number + + self.b_tree_page_numbers = b_tree_page_numbers + + self.updated_b_tree_page_numbers = updated_b_tree_page_numbers + self.freelist_pages_carved = freelist_pages_carved + self.updated_freelist_page_numbers = updated_freelist_page_numbers + self.added_cells = {} + self.deleted_cells = {} + self.updated_cells = {} + self.carved_cells = {} + + def __repr__(self): + return self.__str__().encode("hex") + + def __str__(self): + return sub("\t", "", sub("\n", " ", self.stringify())) + + def stringify(self, padding="", print_cells=True): + string = padding + "Version Number: {}\n" \ + + padding + "Database Text Encoding: {}\n" \ + + padding + "Page Type: {}\n" \ + + padding + "Root Page Number: {}\n" \ + + padding + "B-Tree Page Numbers: {}\n" \ + + padding + "Updated: {}\n" \ + + padding + "Updated B-Tree Page Numbers: {}\n" \ + + padding + "Freelist Pages Carved: {}\n" \ + + padding + "Updated Freelist Page Numbers: {}\n" + string = string.format(self.version_number, + self.database_text_encoding, + self.page_type, + self.root_page_number, + self.b_tree_page_numbers, + self.updated, + self.updated_b_tree_page_numbers, + self.freelist_pages_carved, + self.updated_freelist_page_numbers) + if print_cells: + for added_cell in self.added_cells.itervalues(): + string += "\n" + padding + "Added Cell:\n{}".format(added_cell.stringify(padding + "\t")) + for deleted_cell in self.deleted_cells.itervalues(): + string += "\n" + padding + "Deleted Cell:\n{}".format(deleted_cell.stringify(padding + "\t")) + for updated_cell in self.updated_cells.itervalues(): + string += "\n" + padding + "Updated Cell:\n{}".format(updated_cell.stringify(padding + "\t")) + for carved_cell in self.carved_cells.itervalues(): + string += "\n" + padding + "Carved Cell:\n{}".format(carved_cell.stringify(padding + "\t")) + return string + + @property + def updated(self): + return True if (self.added_cells or self.deleted_cells or self.updated_cells or self.carved_cells) else False
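+
+
+# Illustrative sketch (hedged) of how the objects in this module are typically combined. It assumes
+# the caller has already parsed `database` and `write_ahead_log` file objects and selected a
+# `master_schema_entry` from the database's master schema; obtaining those objects is outside the
+# scope of this sketch.
+#
+#     def collect_added_cells(database, write_ahead_log, master_schema_entry):
+#         version_history = VersionHistory(database, write_ahead_log)
+#         version_history_parser = VersionHistoryParser(version_history, master_schema_entry)
+#         added_cells = {}
+#         for commit in version_history_parser:
+#             # Each commit only reports the cells first seen in that version.
+#             added_cells.update(commit.added_cells)
+#         return added_cells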