diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d0e7e45
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,23 @@
+# This .gitignore file specifies the files to exclude from the git project.
+#
+
+# Pycharm Files
+.idea
+
+# VS Code Files
+.vscode
+
+# Python Files
+*.pyc
+*.pyo
+
+# Pyinstaller Files
+/build
+/dist
+
+# Packaging files
+/sqlite_dissect.egg-info
+
+# Other
+/output
+/log
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..338d00f
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,11 @@
+# Change Log
+
+v0.0.6 (2021-07-29)
+------------------
+
+- Initial external release of application and source code
+- Parsing and recovery of SQLite database and WAL files
+- Started documentation of classes in README.md files with Mermaid
+- Added PyInstaller scripts and builds for Windows and Linux
+- Incorporated output options for SQLite, XLSX, and CSV
+- Added initial beta carving of journal files
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000..134856b
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,23 @@
+DC3 SQLite Dissect Open Source License
+
+DC3 SQLite Dissect software was developed by the Department of Defense Cyber
+Crime Center (DC3). By delegated authority pursuant to Section 801(b) of Public Law
+113-66, DC3 grants the following license for this software:
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this
+software and associated documentation files (the "Software"), to deal in the Software
+without restriction, including without limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons
+to whom the Software is furnished to do so, subject to the following condition:
+
+The above permission notice and the below warranty notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE DEVELOPERS, OR LICENSORS
+BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..8e0e8f4
--- /dev/null
+++ b/README.md
@@ -0,0 +1,333 @@
+# DC3 SQLite Dissect
+
+#### Version 0.0.6
+
+usage:
+
+    sqlite_dissect [-h] [-v] [-d OUTPUT_DIRECTORY] [-p FILE_PREFIX]
+                   [-e EXPORT_TYPE] [-n | -w WAL | -j ROLLBACK_JOURNAL] [-r EXEMPTED_TABLES]
+                   [-s | -t] [-g] [-c] [-f] [-b TABLES] [-k] [-l LOG_LEVEL] [-i LOG_FILE]
+                   [--warnings] SQLITE_FILE
+
+SQLite Dissect is a SQLite parser with recovery abilities over SQLite databases
+and their accompanying journal files. If no options are set other than the file
+name, the default behaviour will be to check for any journal files and print to
+the console the output of the SQLite files. The directory of the SQLite file
+specified will be searched through to find the associated journal files. If
+they are not in the same directory as the specified file, they will not be found
+and their location will need to be specified in the command. SQLite carving
+will not be done by default. Please see the options below to enable carving.
+
+#### positional arguments:
+
+SQLITE_FILE
+
+The SQLite database file
+
+
+#### optional arguments:
+
+-h, --help
+
+show this help message and exit
+
+
+-v, --version
+
+display the version of SQLite Dissect
+
+
+-d OUTPUT_DIRECTORY, --directory OUTPUT_DIRECTORY
+
+directory to write output to (must be specified for outputs other than console text)
+
+
+-p FILE_PREFIX, --file-prefix FILE_PREFIX
+
+the file prefix to use on output files, default is the name of the SQLite file
+(the directory for output must be specified)
+
+
+-e EXPORT_TYPE, --export EXPORT_TYPE
+
+the format to export to {text, csv, sqlite, xlsx}
+(text written to console if -d is not specified)
+
+
+-n, --no-journal
+
+turn off automatic detection of journal files
+
+
+-w WAL, --wal WAL
+
+the WAL file to use instead of searching the SQLite file directory by default
+
+
+-j ROLLBACK_JOURNAL, --rollback-journal ROLLBACK_JOURNAL
+
+the rollback journal file to use instead of searching the SQLite file directory by default
+(under development, currently only outputs to csv, output directory needs to be specified)
+
+
+-r EXEMPTED_TABLES, --exempted-tables EXEMPTED_TABLES
+
+comma-delimited string of tables to exempt, e.g. table1,table2,table3
+(currently only implemented and allowed for rollback journal parsing)
+
+
+-s, --schema
+
+output the schema to console, the initial schema found in the main database file
+
+
+-t, --schema-history
+
+output the schema history to console, prints the --schema information and write-ahead log changes
+
+
+-g, --signatures
+
+output the signatures generated to console
+
+
+-c, --carve
+
+carves and recovers table data
+
+
+-f, --carve-freelists
+
+carves freelist pages (carving must be enabled, under development)
+
+
+-b TABLES, --tables TABLES
+
+comma-delimited string of tables to carve, e.g. table1,table2,table3
+
+
+-k, --disable-strict-format-checking
+
+disable strict format checks for SQLite databases
+(this may result in improperly parsed SQLite files)
+
+
+-l LOG_LEVEL, --log-level LOG_LEVEL
+
+level to log messages at {critical, error, warning, info, debug, off}
+
+
+-i LOG_FILE, --log-file LOG_FILE
+
+log file to write to, default is to write to console, ignored
+if log level set to off (appends if file already exists)
+
+
+--warnings
+
+enable runtime warnings
+
+
+### Example Usage:
+
+1. Print the version:
+
+
+ sqlite_dissect --version
+
+2. Parse a SQLite database and print the outputs to the screen:
+
+
+ sqlite_dissect [SQLITE_FILE]
+
+
+3. Parse a SQLite database and print schema history to a SQLite output file:
+
+
+ sqlite_dissect [SQLITE_FILE] --schema-history -d [OUTPUT_DIRECTORY] -e sqlite
+
+4. Parse a SQLite database and print the output to a SQLite file along with printing signatures and carving entries:
+
+
+ sqlite_dissect [SQLITE_FILE] --signatures -d [OUTPUT_DIRECTORY] -e sqlite --carve
+
+5. Parse a SQLite database and print the output to a SQLite file and carving entries, including freelists, for specific tables:
+
+
+ sqlite_dissect [SQLITE_FILE] -d [OUTPUT_DIRECTORY] -e sqlite --carve --carve-freelists -b [TABLES]
+
+6. Parse a SQLite database file and print the output to an xlsx workbook along with generating signatures and
+   carving entries. The schema history (including schema updates throughout the WAL if a WAL file is detected) and
+   signatures will be printed to standard output. The log level will be set to debug and all log messages will be
+   written to the specified log file.
+
+
+ sqlite_dissect [SQLITE_FILE] -d [OUTPUT_DIRECTORY] -e xlsx --schema-history --carve --signatures --log-level debug -i [LOG_FILE]
+
+7. Parse a SQLite database file along with a specified rollback journal file and send the output to CSV files.
+ (CSV is the only output option currently implemented for rollback journal files.)
+
+
+ sqlite_dissect [SQLITE_FILE] -d [OUTPUT_DIRECTORY] -e csv --carve -j [ROLLBACK_JOURNAL]
+
+### Description
+
+This application focuses on carving by analyzing the allocated content within each of the SQLite
+database tables and creating signatures. Where a table has no content, the signature is based on
+analysis of the create table statement in the master schema table. The signature contains the
+series of possible serial types that can be stored within the file for that table. This signature
+is then applied to the unallocated content and freeblocks of the table b-tree in the file,
+including both interior and leaf table b-tree pages for that table. Signatures are only applied
+to the pages belonging to the b-tree they were generated from, since initial research showed that
+pages, when created or pulled from the freelist set, have their unallocated portions overwritten
+with zeros. Fragments within the pages can be reported on but, due to their size (<4 bytes), are
+not carved. Because entries are added to SQLite tables from the end of the page toward the
+beginning, the carving works in the same direction in order to better detect partially overwritten
+entries. This carving can also be applied to the set of freelist pages within the SQLite file if
+specified, but freelist pages are currently treated as blocks of unallocated data, with the
+exception of the freelist page metadata.
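+
+As a purely illustrative sketch (not the library's actual carving code), the matching described above can be
+thought of as checking a candidate record header, decoded from unallocated bytes, against the set of serial types
+allowed for each column, scanning from the end of the region toward the beginning. The decode_header callable below
+is a hypothetical stand-in for the real header decoding logic.
+
+    # Illustrative only; decode_header(data, offset) is a hypothetical callable supplied by the caller.
+    def header_matches_signature(serial_types, signature):
+        """serial_types: serial type codes decoded from a candidate record header.
+        signature: one set of allowed serial type codes per column."""
+        if len(serial_types) != len(signature):
+            return False
+        return all(code in allowed for code, allowed in zip(serial_types, signature))
+
+    def scan_unallocated(data, signature, decode_header):
+        """Scan unallocated bytes from the end toward the beginning, mirroring
+        how SQLite adds cells from the end of a page."""
+        matches = []
+        for offset in range(len(data) - 1, -1, -1):
+            serial_types = decode_header(data, offset)
+            if serial_types and header_matches_signature(serial_types, signature):
+                matches.append(offset)
+        return matches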
+
+The carving process does not currently account for index b-trees, as the more pertinent information
+is included in the table b-trees. Additionally, some table b-trees are not currently supported:
+tables that are "without row_id", virtual tables, and internal schema objects. These are rarer
+use cases that generally do not offer as much as the main tables do. By default, all tables that
+do not fall into one of these cases will be carved. You can also supply a specific list of tables
+to be carved.
+
+This application is written in the hope that many of these use cases can be addressed in the future
+and is designed to scale to them. Although one specific type of signature is preferred by default,
+SQLite Dissect generates multiple versions of a signature and could eventually support carving with
+other signatures or with signatures you provide yourself. Since SQLite Dissect generates the
+signature automatically from existing data within the SQLite files, there is no need to supply it
+a signature for a particular schema or application; supplying one could, however, be implemented
+later to allow more specific, targeted carving of SQLite files through this application.
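+
+For reference, signature generation is exposed through the library's interface module (see api_usage.py later in
+this repository). A minimal sketch, assuming a database file named example.db and a table named messages exist:
+
+    import os
+    import sqlite_dissect.constants as sqlite_constants
+    import sqlite_dissect.interface as sqlite_interface
+
+    database = sqlite_interface.create_database("example.db")
+    wal_file_name = "example.db" + sqlite_constants.WAL_FILE_POSTFIX
+    write_ahead_log = sqlite_interface.create_write_ahead_log(wal_file_name) if os.path.exists(wal_file_name) else None
+    version_history = sqlite_interface.create_version_history(database, write_ahead_log)
+    table_signature = sqlite_interface.create_table_signature("messages", database, version_history)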
+
+Journal carving is supported primarily for WAL files. If a WAL file is found, this application will
+parse through each of the commit records in sequence and assign a version to each, similar to the
+timelining concept used by some other applications. Rollback journals are currently treated as one
+block of unallocated data and only support export to CSV files.
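+
+Continuing the sketch above, the per-commit versions can be walked with the interface's version history iterator
+(the same calls appear in api_usage.py and example.py below); the printed fields are taken from those examples:
+
+    for table_name in sqlite_interface.get_table_names(database):
+        version_history_iterator = sqlite_interface.get_version_history_iterator(table_name, version_history)
+        for commit in version_history_iterator:
+            if commit.updated:
+                print("Table {} was updated in version {}.".format(table_name, commit.version_number))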
+
+SQLite Dissect supports output in several forms: text, csv, xlsx, and sqlite. Due to constraints
+on what can be written to some file types, certain modifications need to be made. For instance,
+columns generated by the tool, such as row_id, may already exist in the table being exported to a
+SQLite file; in cases like these, the generated columns are prefaced with "sd_" so they will not
+conflict with the actual row_id column. This also applies to internal schema objects: if such
+SQLite tables are requested to be written to a SQLite file, they are prefaced with "iso_" so they
+will not conflict with similar internal schema objects that may already exist in the SQLite file
+being written to. In xlsx or csv output, a leading "=" symbol would indicate a type of equation,
+so these values are prefaced with a " " character to avoid this issue. More details can be found
+in the code documentation of the export classes themselves.
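+
+A minimal sketch of the renaming and escaping rules described above (illustrative only, not the exporters'
+actual code):
+
+    def prefix_generated_column(name, existing_columns):
+        # Columns added by the tool (such as row_id) that collide with columns
+        # already present in the table get an "sd_" prefix.
+        return "sd_" + name if name in existing_columns else name
+
+    def escape_cell_value(value):
+        # A leading "=" would be read as a formula by xlsx/csv consumers, so it
+        # is prefaced with a " " character.
+        if isinstance(value, str) and value.startswith("="):
+            return " " + value
+        return value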
+
+SQLite Dissect opens the file read-only and acts as a read-only interpreter when parsing and
+carving the SQLite file. This ensures no changes are made to the files being analyzed. The only
+use of the Python sqlite3 library is to write the output to a SQLite file if that export option
+is specified.
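+
+The read-only access pattern amounts to opening the files in binary read mode; a trivial sketch (the file name is
+an assumption):
+
+    # Only ever read from the input file; as an example, read the 100-byte SQLite database header.
+    with open("example.db", "rb") as sqlite_file:
+        header = sqlite_file.read(100)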
+
+#### Additional Notes:
+1. SQLite Dissect currently only works on a SQLite database or a SQLite database along with a journal
+ (WAL or rollback) file. Journal files by themselves are not supported yet.
+
+#### Currently not implemented:
+1. Signatures and carving are not implemented for "without rowid" tables or indexes. This will not cause an error
+ but will skip signature generation and carving processes.
+2. Signatures and carving are not implemented for virtual tables. This will not cause an error but will skip
+   signature generation and carving processes. `Note: Even though virtual tables are skipped, virtual tables may
+   create other non-virtual tables which are not skipped. Currently nothing ties these tables back to the virtual
+   table that created them.`
+3. Invalidated frames in WAL files are currently skipped and not parsed. `Note: This applies to WAL records that
+   were previously written to the SQLite database.`
+4. Signatures generated are only reflective of the base/initial schema in the SQLite database.
+
+#### Known issues and errors:
+1. A table with very few columns may generate a very small signature, resulting in many false positives and a
+   longer parsing time.
+2. Due to the current handling of queuing data objects to be printed, in addition to #1 above, a memory issue may
+   occur when carving some tables.
+
+#### Future implementation:
+1. Export binary objects to separate files during export instead of writing them to text files.
+2. Print out sets of data that were unallocated or in freeblocks that did not have successful carvings.
+3. Fix issues with schemas with comments.
+4. Handle "altered column" table signatures where detected.
+5. Implement handling of invalidated WAL frames.
+6. The ability to de-dupe carved entries against those in allocated space (in cases such as those where the b-tree was migrated).
+
+# Library Scripts
+
+High-level scripts that are used to access the rest of the library and that provide the base application for
+executing SQLite Dissect when built.
+
+- api_usage.py
+- example.py
+- setup.py
+- sqlite_dissect.py
+
+
+
+### api_usage.py
+
+This script shows an example of the API usage for a specific test file.
+
+TODO:
+- [ ] Documentation improvements.
+
+
+
+### example.py
+
+This script shows examples of how this library can be used.
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Implement additional export methods.
+
+
+
+### setup.py
+
+This script will be used to set up the sqlite_dissect package for use in Python environments.
+
+>Note: To compile a distribution for the project, run "python setup.py sdist" in the directory this file is located in.
+
+>Note: openpyxl is needed for the xlsx export and will install jdcal, et-xmlfile \["openpyxl>=2.4.0b1"\]
+
+>Note: PyInstaller is used for the generation of executables but is not included in this setup.py script; it will
+> install altgraph, dis3, macholib, pefile, pypiwin32, pywin32 as dependencies. \[pyinstaller==3.6 needs to be used
+> for Python 2.7 since the newer PyInstaller versions of 4.0+ require Python 3.6\] Information on how to run
+> PyInstaller is included in the spec files under the pyinstaller directory. Four spec files are provided, two for
+> Windows and two for Linux, all for x64 platforms. The two files for each platform allow you to build either a
+> single executable file or a directory of decompressed files. Since the single file extracts to a temp directory
+> in order to run, which may be blocked on some systems, the directory of files is preferred.
+
+
+
+### sqlite_dissect.py
+
+This script will act as the command line script to run this library as a stand-alone application.
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Implement append, overwrite, etc. options for the log file if specified.
+- [ ] Incorporate signature generation input and output files once implemented.
+- [ ] Incorporate "store in memory" arguments (currently set to False, more in depth operations may want it True).
+- [ ] Support for multiple export types simultaneously.
+- [ ] Implement multiple passes/depths.
+- [ ] Update string comparisons.
+- [ ] Test use cases for exempted tables with rollback journal and when combined with specified tables.
+- [ ] Check on name vs table_name properties of the master schema entry.
+- [ ] Test cases where the schema changes throughout the WAL file.
+- [ ] Investigate handling of virtual and "without rowid" tables when creating table signatures through the interface.
+- [ ] Documentation on "without rowid" tables and indexes in references to carving in help documentation.
+- [ ] Make sure to address/print unallocated space (especially uncarved) from updated page numbers in commit records.
+- [ ] Research if there can be journal files with a zero length database file or zero-length journal files.
+- [ ] Research if there can be combinations of multiple rollback journal and WAL files with the SQLite database.
+- [ ] Validate initial research that allocation of freelist pages to a b-tree results in a wipe of the page data.
+- [ ] Add additional logging messages to the master schema entries skipped in signature generation.
+- [ ] Integrate in the SQLite Forensic Corpus into tests.
+- [ ] Look into updating terminology for versioning to timelining.
+- [ ] Update code for compatibility with Python 3.
+- [ ] Create a pip distribution.
+- [ ] Create PyUnit tests.
+- [ ] Create a GUI.
diff --git a/_version.py b/_version.py
new file mode 100644
index 0000000..a3fed4c
--- /dev/null
+++ b/_version.py
@@ -0,0 +1,10 @@
+
+"""
+
+_version.py
+
+This script identifies the version of the sqlite dissect library.
+
+"""
+
+__version__ = "0.0.6"
diff --git a/api_usage.py b/api_usage.py
new file mode 100644
index 0000000..320449a
--- /dev/null
+++ b/api_usage.py
@@ -0,0 +1,81 @@
+import logging
+import os
+import sqlite_dissect.constants as sqlite_constants
+import sqlite_dissect.interface as sqlite_interface
+
+"""
+
+api_usage.py
+
+This script shows an example of the API usage for a specific test file.
+
+"""
+
+# Setup logging
+logging_level = logging.ERROR
+logging_format = '%(levelname)s %(asctime)s [%(pathname)s] %(funcName)s at line %(lineno)d: %(message)s'
+logging_date_format = '%d %b %Y %H:%M:%S'
+logging.basicConfig(level=logging_level, format=logging_format, datefmt=logging_date_format)
+
+# Setup console logging
+console_logger = logging.StreamHandler()
+console_logger.setLevel(logging_level)
+console_logger.setFormatter(logging.Formatter(logging_format, logging_date_format))
+logging.getLogger(sqlite_constants.LOGGER_NAME).addHandler(console_logger)
+
+"""
+
+API Usage
+
+The three fields below need to be filled in and are currently hardcoded:
+file_name: The SQLite file to investigate (and associated WAL file if it exists in the same directory)
+table_name: The table in the file to create a signature of and carve against the SQLite file with.
+column_names: The columns in the table we are interested in printing out carved data from.
+
+Note: The code below will carve entries from the b-tree pages of the table and from the freelists. The use case
+      of cross b-tree carving is not yet implemented in SQLite Dissect.
+
+"""
+
+# Specify the file details
+file_name = "FILE_NAME"
+table_name = "TABLE_NAME"
+column_names = ["COLUMN_ONE", "COLUMN_TWO"]
+
+# Create the database
+database = sqlite_interface.create_database(file_name)
+
+# Create the write ahead log
+wal_file_name = file_name + sqlite_constants.WAL_FILE_POSTFIX
+write_ahead_log = sqlite_interface.create_write_ahead_log(wal_file_name) if os.path.exists(wal_file_name) else None
+
+# Create the version history
+version_history = sqlite_interface.create_version_history(database, write_ahead_log)
+
+# Create the signature we are interested in carving
+table_signature = sqlite_interface.create_table_signature(table_name, database, version_history)
+
+# Account for "without rowid"/virtual table signatures until supported
+if not table_signature:
+ print("Table signature not supported (\"without rowid\" table or virtual table)")
+ exit(0)
+
+# Get the column indices of the columns we are interested in
+column_name_indices = {}
+for column_name in column_names:
+ column_name_indices[column_name] = sqlite_interface.get_column_index(column_name, table_name, version_history)
+
+# Get a version history iterator for the table
+carve_freelists = True
+table_history_iterator = sqlite_interface.get_version_history_iterator(table_name, version_history,
+ table_signature, carve_freelists)
+# Iterate through the commits in the history for this table
+for commit in table_history_iterator:
+    # Only process commits where this table was updated and carved cells were found
+ if commit.updated and commit.carved_cells:
+ carved_cells = commit.carved_cells
+ for carved_cell in carved_cells.itervalues():
+ for column_name in column_name_indices.keys():
+ record_column = carved_cell.payload.record_columns[column_name_indices.get(column_name)]
+ print("Commit version: %s table record column: %s has serial type: %s with value of: \"%s\"." %\
+ (commit.version_number, column_name, record_column.serial_type, record_column.value))
diff --git a/example.py b/example.py
new file mode 100644
index 0000000..ea71833
--- /dev/null
+++ b/example.py
@@ -0,0 +1,546 @@
+from getopt import getopt
+from logging import WARNING
+from logging import basicConfig
+from os import makedirs
+from os.path import basename
+from os.path import exists
+from os.path import normpath
+from os.path import sep
+from re import sub
+from sys import argv
+from sqlite_dissect.carving.carver import SignatureCarver
+from sqlite_dissect.carving.signature import Signature
+from sqlite_dissect.constants import BASE_VERSION_NUMBER
+from sqlite_dissect.constants import CELL_LOCATION
+from sqlite_dissect.constants import CELL_SOURCE
+from sqlite_dissect.constants import EXPORT_TYPES
+from sqlite_dissect.constants import MASTER_SCHEMA_ROW_TYPE
+from sqlite_dissect.constants import ROLLBACK_JOURNAL_POSTFIX
+from sqlite_dissect.constants import WAL_FILE_POSTFIX
+from sqlite_dissect.constants import WAL_INDEX_POSTFIX
+from sqlite_dissect.export.csv_export import CommitCsvExporter
+from sqlite_dissect.file.database.database import Database
+from sqlite_dissect.file.database.page import BTreePage
+from sqlite_dissect.file.database.utilities import get_pages_from_b_tree_page
+from sqlite_dissect.file.journal.jounal import RollbackJournal
+from sqlite_dissect.file.schema.master import OrdinaryTableRow
+from sqlite_dissect.file.schema.master import VirtualTableRow
+from sqlite_dissect.file.utilities import validate_page_version_history
+from sqlite_dissect.file.wal.wal import WriteAheadLog
+from sqlite_dissect.file.wal_index.wal_index import WriteAheadLogIndex
+from sqlite_dissect.interface import carve_table
+from sqlite_dissect.interface import create_database
+from sqlite_dissect.interface import create_table_signature
+from sqlite_dissect.interface import create_version_history
+from sqlite_dissect.interface import create_write_ahead_log
+from sqlite_dissect.interface import export_table_or_index_version_history_to_csv
+from sqlite_dissect.interface import export_table_or_index_version_history_to_sqlite
+from sqlite_dissect.interface import export_version_history_to_csv
+from sqlite_dissect.interface import export_version_history_to_sqlite
+from sqlite_dissect.interface import get_index_names
+from sqlite_dissect.interface import get_table_names
+from sqlite_dissect.interface import get_version_history_iterator
+from sqlite_dissect.interface import select_all_from_index
+from sqlite_dissect.interface import select_all_from_table
+from sqlite_dissect.output import stringify_cell_records
+from sqlite_dissect.output import stringify_master_schema_versions
+from sqlite_dissect.output import stringify_page_information
+from sqlite_dissect.output import stringify_unallocated_space
+from sqlite_dissect.version_history import VersionHistory
+from sqlite_dissect.version_history import VersionHistoryParser
+
+"""
+
+example.py
+
+This script shows examples of how this library can be used.
+
+"""
+
+# Setup logging
+logging_level = WARNING
+logging_format = '%(levelname)s %(asctime)s [%(pathname)s] %(funcName)s at line %(lineno)d: %(message)s'
+logging_data_format = '%d %b %Y %H:%M:%S'
+basicConfig(level=logging_level, format=logging_format, datefmt=logging_data_format)
+
+file_name = None
+export_directory = None
+export_type = None
+opts, args = getopt(argv[1:], "f:e:t:")
+for opt, arg in opts:
+ if opt == "-f":
+ file_name = arg
+ elif opt == "-e":
+ export_directory = arg
+ elif opt == "-t":
+ export_type = arg
+
+"""
+
+Note: Currently only the csv export_type is supported in the first part of this example. The csv and sqlite
+      export_types are used in the API example below. Other specified types are currently ignored.
+
+"""
+
+if (export_directory and not export_type) or (not export_directory and export_type):
+ print("The export directory (-e) and export type (-t) both need to be defined if either one is specified.")
+ print("Export types are: {}.".format([export_type for export_type in EXPORT_TYPES]))
+ exit(1)
+
+if export_type and export_type.upper() not in EXPORT_TYPES:
+ print("Invalid export type: {}.".format(export_type))
+ print("Export types are: {}.".format(",".join([export_type.lower() for export_type in EXPORT_TYPES])))
+ exit(1)
+
+if not file_name:
+ print("Please execute the application specifying the file name.")
+ exit(1)
+elif not exists(file_name):
+ print("File: {} does not exist.".format(file_name))
+ exit(1)
+else:
+ print("Starting to parse and carve: {}.\n".format(file_name))
+
+file_prefix = basename(normpath(file_name))
+padding = "\t"
+
+"""
+
+Load the Database File.
+
+"""
+
+database_file = Database(file_name)
+print("Database File:\n{}\n".format(database_file.stringify(padding, False, False)))
+print("Page Information:\n{}\n".format(stringify_page_information(database_file, padding)))
+
+"""
+
+Check if the Write-Ahead Log File exists and load it if it does.
+
+"""
+
+wal_file = None
+wal_file_name = file_name + WAL_FILE_POSTFIX
+if exists(wal_file_name):
+ wal_file = WriteAheadLog(wal_file_name)
+ print("WAL File:\n{}\n".format(wal_file.stringify(padding, False)))
+else:
+ print("No WAL File Found.\n")
+
+"""
+
+Check if the Write-Ahead Log Index File exists and load it if it does.
+
+"""
+
+wal_index_file = None
+wal_index_file_name = file_name + WAL_INDEX_POSTFIX
+if exists(wal_index_file_name):
+ wal_index_file = WriteAheadLogIndex(wal_index_file_name)
+ print("WAL Index File:\n{}\n".format(wal_index_file.stringify(padding)))
+else:
+ print("No WAL Index File Found.\n")
+
+"""
+
+Check if the Rollback Journal File exists and load it if it does.
+
+"""
+
+rollback_journal_file = None
+rollback_journal_file_name = file_name + ROLLBACK_JOURNAL_POSTFIX
+if exists(rollback_journal_file_name):
+ rollback_journal_file = RollbackJournal(rollback_journal_file_name)
+ print("Rollback Journal File:\n{}\n".format(rollback_journal_file.stringify(padding)))
+else:
+ print("No Rollback Journal File Found.\n")
+
+"""
+
+Print Unallocated Non-Zero Space from the Database File.
+
+"""
+
+unallocated_non_zero_space = stringify_unallocated_space(database_file, padding, False)
+print("Unallocated Non-Zero Space from the Database File:\n{}\n".format(unallocated_non_zero_space))
+
+"""
+
+Create the version history from the database and WAL file (even if the WAL file was not found).
+
+"""
+
+version_history = VersionHistory(database_file, wal_file)
+
+print("Number of versions: {}\n".format(version_history.number_of_versions))
+
+print("Validating Page Version History...")
+page_version_history_validated = validate_page_version_history(version_history)
+print("Validating Page Version History (Check): {}\n".format(page_version_history_validated))
+if not page_version_history_validated:
+ print("Error in validating page version history.")
+ exit(1)
+
+print("Version History of Master Schemas:\n")
+for version_number, version in version_history.versions.iteritems():
+ if version.master_schema_modified:
+ master_schema_entries = version.master_schema.master_schema_entries
+ if master_schema_entries:
+ print("Version {} Master Schema Entries:".format(version_number))
+ for master_schema_entry in master_schema_entries:
+ string = padding + "Master Schema Entry: Root Page Number: {} Type: {} Name: {} " \
+ "Table Name: {} SQL: {}."
+ print(string.format(master_schema_entry.root_page_number, master_schema_entry.row_type,
+ master_schema_entry.name, master_schema_entry.table_name,
+ master_schema_entry.sql))
+
+print("Version History:\n")
+for version_number, version in version_history.versions.iteritems():
+ print("Version: {} has updated page numbers: {}.".format(version_number, version.updated_page_numbers))
+ print("Page Information:\n{}\n".format(stringify_page_information(version, padding)))
+
+last_version_number = version_history.number_of_versions - 1
+last_version = version_history.versions[last_version_number]
+print("Version: {} has updated page numbers: {}.".format(last_version_number,
+                                                         last_version.updated_page_numbers))
+print("Page Information:\n{}\n".format(stringify_page_information(last_version, padding)))
+
+print("Version History of Master Schemas:\n{}\n".format(stringify_master_schema_versions(version_history)))
+
+print("Master Schema B-Trees (Index and Table) Version Histories:")
+for master_schema_entry in database_file.master_schema.master_schema_entries:
+ if master_schema_entry.row_type in [MASTER_SCHEMA_ROW_TYPE.INDEX, MASTER_SCHEMA_ROW_TYPE.TABLE] and \
+ not isinstance(master_schema_entry, VirtualTableRow) and \
+ not (isinstance(master_schema_entry, OrdinaryTableRow) and master_schema_entry.without_row_id):
+ version_history_parser = VersionHistoryParser(version_history, master_schema_entry)
+ page_type = version_history_parser.page_type
+ string = "Master schema entry: {} type: {} on page type: {}:"
+        string = string.format(master_schema_entry.name, version_history_parser.row_type, page_type,
+ version_history_parser.root_page_number_version_index)
+
+ print(string)
+ for commit in version_history_parser:
+ if commit.updated:
+ string = "Updated in version: {} with root page number: {} on b-tree page numbers: {} " \
+ "and updated root b-tree page numbers: {}:"
+ string = string.format(commit.version_number, commit.root_page_number, commit.b_tree_page_numbers,
+ commit.updated_b_tree_page_numbers)
+ print(string)
+ for added_cell_string in stringify_cell_records(commit.added_cells.values(),
+ database_file.database_text_encoding, page_type):
+ print("Added: {}".format(added_cell_string))
+ for updated_cell_string in stringify_cell_records(commit.updated_cells.values(),
+ database_file.database_text_encoding, page_type):
+ print("Updated: {}".format(updated_cell_string))
+ for deleted_cell_string in stringify_cell_records(commit.deleted_cells.values(),
+ database_file.database_text_encoding, page_type):
+ print("Deleted: {}".format(deleted_cell_string))
+ for carved_cell_string in stringify_cell_records(commit.carved_cells.values(),
+ database_file.database_text_encoding, page_type):
+ print("Carved: {}".format(carved_cell_string))
+ print("\n")
+
+signatures = {}
+for master_schema_entry in database_file.master_schema.master_schema_entries:
+
+ """
+
+ Due to current implementation limitations we are restricting signature generation to table row types.
+
+ """
+
+ if master_schema_entry.row_type == MASTER_SCHEMA_ROW_TYPE.TABLE:
+ signature = Signature(version_history, master_schema_entry)
+ signatures[master_schema_entry.name] = signature
+ print("Signature:\n{}\n".format(signature.stringify(padding + "\t", False, False, False)))
+ else:
+ string = "No signature will be generated for master schema entry type: {} with name: {} on " \
+ "table name: {} and sql: {}"
+ string = string.format(master_schema_entry.row_type, master_schema_entry.name, master_schema_entry.table_name,
+ master_schema_entry.sql)
+ print(string + "\n")
+
+print("Carving base version (main SQLite database file):")
+version = version_history.versions[BASE_VERSION_NUMBER]
+
+carved_records = {}
+for master_schema_entry in database_file.master_schema.master_schema_entries:
+
+ """
+
+ Due to current implementation limitations we are restricting carving to table row types.
+
+ Note: This is not allowing "without rowid" or virtual tables until further testing is done. (Virtual tables
+ tend to have a root page number of 0 with no data stored in the main table. Further investigation
+ is needed.)
+
+ """
+
+ if master_schema_entry.row_type == MASTER_SCHEMA_ROW_TYPE.TABLE \
+ and not isinstance(master_schema_entry, VirtualTableRow) and not master_schema_entry.without_row_id:
+
+ b_tree_pages = get_pages_from_b_tree_page(version.get_b_tree_root_page(master_schema_entry.root_page_number))
+ b_tree_page_numbers = [b_tree_page.number for b_tree_page in b_tree_pages]
+
+ string = "Carving Table Entry: Name: {} root page: {} on page numbers: {}"
+ print(string.format(master_schema_entry.name, master_schema_entry.root_page_number, b_tree_page_numbers))
+
+ carved_records[master_schema_entry.name] = []
+ for b_tree_page_number in b_tree_page_numbers:
+ page = database_file.pages[b_tree_page_number]
+ source = CELL_SOURCE.B_TREE
+
+ # For carving freeblocks make sure the page is a b-tree page and not overflow
+ if isinstance(page, BTreePage):
+ carved_cells = SignatureCarver.carve_freeblocks(version, source, page.freeblocks,
+ signatures[master_schema_entry.name])
+ carved_records[master_schema_entry.name].extend(carved_cells)
+ carved_cells = SignatureCarver.carve_unallocated_space(version, source, b_tree_page_number,
+ page.unallocated_space_start_offset,
+ page.unallocated_space,
+ signatures[master_schema_entry.name])
+
+ carved_records[master_schema_entry.name].extend(carved_cells)
+
+ else:
+ string = "Not carving master schema entry row type: {} name: {} table name: {} and sql: {} since it is not " \
+ "a normal table."
+ string = string.format(master_schema_entry.row_type, master_schema_entry.name, master_schema_entry.table_name,
+ master_schema_entry.sql)
+ print(string)
+print("\n")
+
+print("Carved Entries:\n")
+for master_schema_entry_name, carved_cells in carved_records.iteritems():
+
+ print("Table Master Schema Entry Name {}:".format(master_schema_entry_name))
+
+ carved_freeblock_records_total = len([carved_cell for carved_cell in carved_cells
+ if carved_cell.location == CELL_LOCATION.FREEBLOCK])
+
+ print("Recovered {} entries from freeblocks:".format(carved_freeblock_records_total))
+
+ for carved_cell in carved_cells:
+ if carved_cell.location == CELL_LOCATION.FREEBLOCK:
+ payload = carved_cell.payload
+ cell_record_column_values = [str(record_column.value) if record_column.value else "NULL"
+ for record_column in payload.record_columns]
+ string = "{}: {} Index: ({}, {}, {}, {}): ({})"
+ string = string.format(carved_cell.page_number, carved_cell.index, carved_cell.file_offset,
+ payload.serial_type_definition_start_offset,
+ payload.serial_type_definition_end_offset,
+ payload.cutoff_offset, " , ".join(cell_record_column_values))
+ print(string)
+
+ carved_unallocated_space_records_total = len([carved_cell for carved_cell in carved_cells
+ if carved_cell.location == CELL_LOCATION.UNALLOCATED_SPACE])
+ print("Recovered {} entries from unallocated space:".format(carved_unallocated_space_records_total))
+
+ for carved_cell in carved_cells:
+ if carved_cell.location == CELL_LOCATION.UNALLOCATED_SPACE:
+ payload = carved_cell.payload
+ cell_record_column_values = [str(record_column.value) if record_column.value else "NULL"
+ for record_column in payload.record_columns]
+ string = "{}: {} Index: ({}, {}, {}, {}): ({})"
+ string = string.format(carved_cell.page_number, carved_cell.index, carved_cell.file_offset,
+ payload.serial_type_definition_start_offset,
+ payload.serial_type_definition_end_offset,
+ payload.cutoff_offset, " , ".join(cell_record_column_values))
+ print(string)
+
+ print("\n")
+print("\n")
+
+print("Master Schema B-Trees (Index and Table) Version Histories Including Carvings:")
+for master_schema_entry in database_file.master_schema.master_schema_entries:
+ if master_schema_entry.row_type in [MASTER_SCHEMA_ROW_TYPE.INDEX, MASTER_SCHEMA_ROW_TYPE.TABLE]:
+
+ # We only have signatures of the tables (not indexes)
+ signature = signatures[master_schema_entry.name] \
+ if master_schema_entry.row_type == MASTER_SCHEMA_ROW_TYPE.TABLE else None
+
+ version_history_parser = VersionHistoryParser(version_history, master_schema_entry, None, None, signature)
+ page_type = version_history_parser.page_type
+ string = "Master schema entry: {} type: {} on page type: {}:"
+ string = string.format(master_schema_entry.name, version_history_parser.row_type, page_type,
+ version_history_parser.root_page_number_version_index)
+ print(string)
+ for commit in version_history_parser:
+ if commit.updated:
+ string = "Updated in version: {} with root page number: {} on b-tree page numbers: {} " \
+ "and updated root b-tree page numbers: {}:"
+ string = string.format(commit.version_number, commit.root_page_number, commit.b_tree_page_numbers,
+ commit.updated_b_tree_page_numbers)
+ print(string)
+ for added_cell_string in stringify_cell_records(commit.added_cells.values(),
+ database_file.database_text_encoding, page_type):
+ print("Added: {}".format(added_cell_string))
+ for updated_cell_string in stringify_cell_records(commit.updated_cells.values(),
+ database_file.database_text_encoding, page_type):
+ print("Updated: {}".format(updated_cell_string))
+ for deleted_cell_string in stringify_cell_records(commit.deleted_cells.values(),
+ database_file.database_text_encoding, page_type):
+ print("Deleted: {}".format(deleted_cell_string))
+ for carved_cell_string in stringify_cell_records(commit.carved_cells.values(),
+ database_file.database_text_encoding, page_type):
+ print("Carved: {}".format(carved_cell_string))
+ print("\n")
+
+if export_type and export_type.upper() == EXPORT_TYPES.CSV:
+ csv_prefix_file_name = basename(normpath(file_prefix))
+ commit_csv_exporter = CommitCsvExporter(export_directory, csv_prefix_file_name)
+ print("Exporting SQLite Master Schema B-Trees (Index and Table) Version Histories "
+ "(Including Carvings) to CSV Directory: {}.".format(export_directory))
+ for master_schema_entry in database_file.master_schema.master_schema_entries:
+ if master_schema_entry.row_type in [MASTER_SCHEMA_ROW_TYPE.INDEX, MASTER_SCHEMA_ROW_TYPE.TABLE]:
+
+ # We only have signatures of the tables (not indexes)
+ signature = signatures[master_schema_entry.name] \
+ if master_schema_entry.row_type == MASTER_SCHEMA_ROW_TYPE.TABLE else None
+
+ carve_freelist_pages = True if signature else False
+
+ version_history_parser = VersionHistoryParser(version_history, master_schema_entry,
+ None, None, signature, carve_freelist_pages)
+ page_type = version_history_parser.page_type
+ for commit in version_history_parser:
+ commit_csv_exporter.write_commit(master_schema_entry, commit)
+print("\n")
+
+"""
+
+Below are examples on using the interface.
+
+The functions used from the interface script are documented below (taken from documentation in the interface script):
+create_database(file_name, file_object=None, store_in_memory=False, strict_format_checking=True)
+create_write_ahead_log(file_name, file_object=None)
+create_version_history(database, write_ahead_log=None)
+get_table_names(database)
+get_index_names(database)
+select_all_from_table(table_name, version)
+select_all_from_index(index_name, version)
+create_table_signature(table_name, version, version_history=None)
+carve_table(table_name, signature, version)
+get_version_history_iterator(table_or_index_name, version_history, signature=None)
+export_table_or_index_version_history_to_csv(export_directory, version_history,
+ table_or_index_name, signature=None, carve_freelist_pages=False)
+export_version_history_to_csv(export_directory, version_history, signatures=None, carve_freelist_pages=False)
+
+"""
+
+print("Example interface usage:\n")
+
+# Create the database
+database = create_database(file_name)
+
+# Create the write ahead log
+write_ahead_log = create_write_ahead_log(file_name + WAL_FILE_POSTFIX) if exists(file_name + WAL_FILE_POSTFIX) else None
+
+# Create the version history
+version_history = create_version_history(database, write_ahead_log)
+
+# Get all of the table names
+table_names = get_table_names(database)
+print("Table Names: {}\n".format(table_names))
+
+# Get all of the cells in each table and print the number of cells (rows) for each table
+for table_name in table_names:
+ select_all_data = select_all_from_table(table_name, database)
+ print("Table: {} has {} rows in the database file.".format(table_name, len(select_all_data)))
+print("\n")
+
+# Get all of the index names
+index_names = get_index_names(database)
+print("Index Names: {}".format(index_names))
+print("\n")
+
+# Get all of the cells in each index and print the number of cells (rows) for each index
+for index_name in index_names:
+ select_all_data = select_all_from_index(index_name, database)
+ print("Index: {} has {} rows in the database file.".format(index_name, len(select_all_data)))
+print("\n")
+
+# Get all of the signatures (for tables only - not including "without rowid" and virtual tables)
+signatures = {}
+for table_name in table_names:
+ # Specify the version history here to parse through all versions for signature generation
+ table_signature = create_table_signature(table_name, database, version_history)
+ # Account for "without rowid" table signatures until supported
+ if table_signature:
+ signatures[table_name] = table_signature
+
+# Carve each table with the generated signature and print the number of carved cells (rows) per table
+for table_name in table_names:
+ if table_name in signatures:
+ carved_cells = carve_table(table_name, signatures[table_name], database)
+ print("Found {} carved cells for table: {} in the database file.".format(len(carved_cells), table_name))
+print("\n")
+
+# Combine names for index and tables (they are unique) and get the version history iterator for each
+names = []
+names.extend(table_names)
+names.extend(index_names)
+for name in names:
+ signature = signatures[name] if name in signatures else None
+ version_history_iterator = get_version_history_iterator(name, version_history, signature)
+ for commit in version_history_iterator:
+ string = "For: {} commit: {} for version: {}.".format(name, commit.updated, commit.version_number)
+ if commit.updated:
+ string += " Carved Cells: {}.".format(True if commit.carved_cells else False)
+ print(string)
+print("\n")
+
+# Check to make sure exporting variables were set up correctly for csv
+if export_type and export_type.upper() == EXPORT_TYPES.CSV:
+
+    # Create two directories for the two ways csv files can be exported through the interface
+ export_version_directory = export_directory + sep + "csv_version"
+ if not exists(export_version_directory):
+ makedirs(export_version_directory)
+ export_version_history_directory = export_directory + sep + "csv_version_history"
+ if not exists(export_version_history_directory):
+ makedirs(export_version_history_directory)
+
+ # Iterate through all index and table names and export their version history to a csv file (one at a time)
+ for name in names:
+ print("Exporting {} to {} as {}.".format(name, export_version_directory, export_type))
+ export_table_or_index_version_history_to_csv(export_version_directory, version_history, name, None, False)
+ print("\n")
+
+ # Export all index and table histories to csv files while supplying signatures to carve tables and carving freelists
+ print("Exporting history to {} with carvings as {}.".format(export_version_history_directory, export_type))
+ export_version_history_to_csv(export_version_history_directory, version_history, signatures.values(), True)
+ print("\n")
+
+# Check to make sure exporting variables were set up correctly for SQLite
+if export_type and export_type.upper() == EXPORT_TYPES.SQLITE:
+
+    # Create two directories for the two ways SQLite files can be exported through the interface
+ export_version_directory = export_directory + sep + "sqlite_version"
+ if not exists(export_version_directory):
+ makedirs(export_version_directory)
+ export_version_history_directory = export_directory + sep + "sqlite_version_history"
+ if not exists(export_version_history_directory):
+ makedirs(export_version_history_directory)
+
+ # Currently the file name is taken from the base version name
+ sqlite_base_file_name = basename(normpath(file_prefix))
+ sqlite_file_postfix = "-sqlite-dissect.db3"
+
+    # Iterate through all index and table names and export their version history to a SQLite file (one at a time)
+ for name in names:
+ fixed_master_schema_name = sub(" ", "_", name)
+ master_schema_entry_file_name = sqlite_base_file_name + "-" + fixed_master_schema_name + sqlite_file_postfix
+ print("Exporting {} to {} in {} as {}.".format(name, master_schema_entry_file_name, export_version_directory,
+ export_type))
+ export_table_or_index_version_history_to_sqlite(export_version_directory, master_schema_entry_file_name,
+ version_history, name)
+ print("\n")
+
+    # Export all index and table histories to SQLite files while supplying signatures to carve tables and carving freelists
+ sqlite_file_name = sqlite_base_file_name + sqlite_file_postfix
+ print("Exporting history to {} in {} with carvings as {}.".format(sqlite_file_name,
+ export_version_history_directory, export_type))
+ export_version_history_to_sqlite(export_version_history_directory, sqlite_file_name, version_history,
+ signatures.values(), True)
+ print("\n")
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..e7090cb
--- /dev/null
+++ b/main.py
@@ -0,0 +1,769 @@
+import warnings
+from argparse import ArgumentParser
+from logging import CRITICAL
+from logging import DEBUG
+from logging import ERROR
+from logging import INFO
+from logging import WARNING
+from logging import basicConfig
+from logging import getLogger
+from os.path import basename
+from os.path import exists
+from os.path import getsize
+from os.path import normpath
+from os.path import sep
+from time import time
+from warnings import warn
+from _version import __version__
+from sqlite_dissect.carving.rollback_journal_carver import RollBackJournalCarver
+from sqlite_dissect.carving.signature import Signature
+from sqlite_dissect.constants import BASE_VERSION_NUMBER
+from sqlite_dissect.constants import EXPORT_TYPES
+from sqlite_dissect.constants import LOGGER_NAME
+from sqlite_dissect.constants import MASTER_SCHEMA_ROW_TYPE
+from sqlite_dissect.constants import ROLLBACK_JOURNAL_POSTFIX
+from sqlite_dissect.constants import WAL_FILE_POSTFIX
+from sqlite_dissect.exception import SqliteError
+from sqlite_dissect.export.csv_export import CommitCsvExporter
+from sqlite_dissect.export.sqlite_export import CommitSqliteExporter
+from sqlite_dissect.export.text_export import CommitConsoleExporter
+from sqlite_dissect.export.text_export import CommitTextExporter
+from sqlite_dissect.export.xlsx_export import CommitXlsxExporter
+from sqlite_dissect.file.database.database import Database
+from sqlite_dissect.file.journal.jounal import RollbackJournal
+from sqlite_dissect.file.schema.master import OrdinaryTableRow
+from sqlite_dissect.file.wal.wal import WriteAheadLog
+from sqlite_dissect.output import stringify_master_schema_version
+from sqlite_dissect.output import stringify_master_schema_versions
+from sqlite_dissect.version_history import VersionHistory
+from sqlite_dissect.version_history import VersionHistoryParser
+
+"""
+
+main.py
+
+This script will act as the command line script to run this library as a stand-alone application.
+
+"""
+
+
+def main(args):
+
+ # Handle the logging and warning settings
+ if not args.log_level:
+ raise SqliteError("Error in setting up logging: no log level determined.")
+
+ # Get the logging level
+ logging_level_arg = args.log_level
+ logging_level = logging_level_arg
+ if logging_level_arg != "off":
+ if logging_level_arg == "critical":
+ logging_level = CRITICAL
+ elif logging_level_arg == "error":
+ logging_level = ERROR
+ elif logging_level_arg == "warning":
+ logging_level = WARNING
+ elif logging_level_arg == "info":
+ logging_level = INFO
+ elif logging_level_arg == "debug":
+ logging_level = DEBUG
+ else:
+ raise SqliteError("Invalid option for logging: {}.".format(logging_level_arg))
+
+ # Setup logging
+ logging_format = '%(levelname)s %(asctime)s [%(pathname)s] %(funcName)s at line %(lineno)d: %(message)s'
+ logging_data_format = '%d %b %Y %H:%M:%S'
+ basicConfig(level=logging_level, format=logging_format, datefmt=logging_data_format, filename=args.log_file)
+
+ logger = getLogger(LOGGER_NAME)
+ logger.debug("Setup logging using the log level: {}.".format(logging_level))
+ logger.info("Using options: {}".format(args))
+
+ if args.warnings:
+
+ # Turn warnings on if it was specified
+ warnings.filterwarnings("always")
+
+ logger.info("Warnings have been turned on.")
+
+ else:
+
+ # Ignore warnings by default
+ warnings.filterwarnings("ignore")
+
+ # Execute argument checks (inclusive)
+ if args.carve_freelists and not args.carve:
+ raise SqliteError("Freelist carving cannot be enabled (--carve-freelists) without enabling "
+ "general carving (--carve).")
+ if args.export.upper() != EXPORT_TYPES.TEXT and not args.directory:
+ raise SqliteError("The directory needs to be specified (--directory) if an export type other than text "
+ "is specified (--export).")
+ if args.file_prefix and not args.directory:
+ raise SqliteError("The directory needs to be specified (--directory) if a file prefix is "
+ "specified (--file-prefix).")
+
+ # Setup the export type
+ export_type = EXPORT_TYPES.TEXT
+ if args.export:
+ export_type = args.export.upper()
+
+ # Setup the strict format checking
+ strict_format_checking = True
+ if args.disable_strict_format_checking:
+ strict_format_checking = False
+
+    # Set up the file prefix, which is taken from the base version name unless the file_prefix argument is set
+ file_prefix = basename(normpath(args.sqlite_file))
+ if args.file_prefix:
+ file_prefix = args.file_prefix
+
+ if not file_prefix:
+ # The file prefix is taken from the base version name if not specified
+ file_prefix = basename(normpath(args.sqlite_file))
+
+ # Setup the directory if specified
+ output_directory = None
+ if args.directory:
+ if not exists(args.directory):
+ raise SqliteError("Unable to find output directory: {}.".format(args.directory))
+ output_directory = args.directory
+
+ logger.debug("Determined export type to be {} with file prefix: {} and output directory: {}"
+ .format(export_type, file_prefix, output_directory))
+
+ # Obtain the SQLite file
+ if not exists(args.sqlite_file):
+ raise SqliteError("Unable to find SQLite file: {}.".format(args.sqlite_file))
+
+ """
+
+    If the file is a zero-length file, we set a flag indicating it and check that there are no associated wal or
+    journal files before exiting and stating that the file was empty. If a (non-zero length) wal or journal file
+    is found, an exception is thrown. However, if the no-journal option is specified, the journal files are not
+    checked and the program exits.
+
+    Note: It is currently believed that a zero-length SQLite database file cannot have a wal or journal file. That
+    is why an exception is thrown here, but this needs to be investigated to make sure.
+
+ """
+
+ # See if the SQLite file is zero-length
+ zero_length_sqlite_file = False
+ if getsize(args.sqlite_file) == 0:
+ zero_length_sqlite_file = True
+
+ # Obtain the wal or rollback_journal file if found (or if specified)
+ wal_file_name = None
+ rollback_journal_file_name = None
+ if not args.no_journal:
+ if args.wal:
+ if not exists(args.wal):
+ raise SqliteError("Unable to find wal file: {}.".format(args.wal))
+ wal_file_name = args.wal
+ elif args.rollback_journal:
+ if not exists(args.rollback_journal):
+ raise SqliteError("Unable to find rollback journal file: {}.".format(args.rollback_journal))
+ rollback_journal_file_name = args.rollback_journal
+ else:
+ if exists(args.sqlite_file + WAL_FILE_POSTFIX):
+ wal_file_name = args.sqlite_file + WAL_FILE_POSTFIX
+ if exists(args.sqlite_file + ROLLBACK_JOURNAL_POSTFIX):
+ rollback_journal_file_name = args.sqlite_file + ROLLBACK_JOURNAL_POSTFIX
+
+ # Exempted tables are only supported currently for rollback journal files
+ rollback_journal_exempted_tables = []
+ if args.exempted_tables:
+ if not rollback_journal_file_name:
+ raise SqliteError("Exempted tables are only supported for use with rollback journal parsing.")
+ rollback_journal_exempted_tables = args.exempted_tables.split(",")
+
+ # See if the wal file is zero-length
+ zero_length_wal_file = False
+ if wal_file_name and getsize(wal_file_name) == 0:
+ zero_length_wal_file = True
+
+ # See if the rollback journal file is zero-length
+ zero_length_rollback_journal_file = False
+ if rollback_journal_file_name and getsize(rollback_journal_file_name) == 0:
+ zero_length_rollback_journal_file = True
+
+ # Check if the SQLite file is zero length
+ if zero_length_sqlite_file:
+
+ if wal_file_name and not zero_length_wal_file:
+
+ """
+
+            Here we throw an exception if we find a wal file with content while the original SQLite file has no
+            content. It is not certain this use case can occur, and investigation is needed to make certain. There
+            have been scenarios where a database file has a header but no schema or content while its WAL file has
+            all the schema entries and content, but that case is handled differently.
+
+ """
+
+            raise SqliteError("Found a zero length SQLite file with a wal file: {}. Unable to parse.".format(wal_file_name))
+
+ elif zero_length_wal_file:
+ print("File: {} with wal file: {} has no content. Nothing to parse."
+ .format(args.sqlite_file, wal_file_name))
+ exit(0)
+
+ elif rollback_journal_file_name and not zero_length_rollback_journal_file:
+
+ """
+
+            Here we will only have a rollback journal file. Currently, since we need the database file to parse
+            signatures from, we cannot carve on the rollback journal file alone.
+
+ """
+
+            raise SqliteError("Found a zero length SQLite file with a rollback journal file: {}. Unable to parse."
+                              .format(rollback_journal_file_name))
+
+ elif zero_length_rollback_journal_file:
+ print("File: {} with rollback journal file: {} has no content. Nothing to parse."
+ .format(args.sqlite_file, rollback_journal_file_name))
+ exit(0)
+
+ else:
+ print("File: {} has no content. Nothing to parse.".format(args.sqlite_file))
+ exit(0)
+
+ # Make sure that both of the journal files are not found
+ if rollback_journal_file_name and wal_file_name:
+
+ """
+
+        Since the arguments only allow you to specify either the wal or the rollback journal file name, this case
+        can only occur when both files are found on the file system because no journal option was specified. Since
+        a SQLite database cannot use both wal and rollback journal files in the same running, we determine this to
+        be an error and throw an exception.
+
+        There may be a case where the journal mode was changed at some point, leaving a single SQLite file with
+        some combination of rollback journal and WAL files. More research would have to take place for this
+        scenario, also taking into account the likelihood of it actually occurring, since in most cases the journal
+        mode is set statically by the application that owns the SQLite database.
+
+ """
+
+        raise SqliteError("Found both a rollback journal: {} and wal file: {}. Only one journal file should exist. "
+                          "Unable to parse.".format(rollback_journal_file_name, wal_file_name))
+
+ # Print a message parsing is starting and log the start time for reporting at the end on amount of time to run
+ print("\nParsing: {}...".format(args.sqlite_file))
+ start_time = time()
+
+ # Create the database and wal/rollback journal file (if existent)
+ database = Database(args.sqlite_file, strict_format_checking=strict_format_checking)
+
+ write_ahead_log = None
+ if wal_file_name and not zero_length_wal_file:
+ write_ahead_log = WriteAheadLog(wal_file_name, strict_format_checking=strict_format_checking)
+
+ rollback_journal_file = None
+ if rollback_journal_file_name and not zero_length_rollback_journal_file:
+ rollback_journal_file = RollbackJournal(rollback_journal_file_name)
+
+ # Create the version history (this is currently only supported for the WAL)
+ version_history = VersionHistory(database, write_ahead_log)
+
+ # Check if the master schema was asked for
+ if args.schema:
+
+ # print the master schema of the database
+ print("\nDatabase Master Schema:\n{}".format(stringify_master_schema_version(database)))
+ print("Continuing to parse...")
+
+ # Check if the schema history was asked for
+ if args.schema_history:
+
+ # print the master schema version history
+ print("\nVersion History of Master Schemas:\n{}".format(stringify_master_schema_versions(version_history)))
+ print("Continuing to parse...")
+
+ # Get the signature options
+ print_signatures = args.signatures
+
+ # Get the carving options
+ carve = args.carve
+ carve_freelists = args.carve_freelists
+
+ # Check to see if carve freelists was set without setting carve
+ if not carve and carve_freelists:
+ log_message = "The carve option was not set but the carve_freelists option was. Disabling carve_freelists. " \
+ "Please specify the carve option to enable."
+ logger.warn(log_message)
+ warn(log_message, RuntimeWarning)
+
+ # Specific tables to be carved
+ specified_tables_to_carve = []
+ if args.tables:
+ specified_tables_to_carve = args.tables.split(",")
+
+ if rollback_journal_exempted_tables and specified_tables_to_carve:
+ for table in rollback_journal_exempted_tables:
+ if table in specified_tables_to_carve:
+ print("Table: {} found in both exempted and specified tables. Please update the arguments correctly."
+ .format(table))
+ exit(0)
+
+ # See if we need to generate signatures
+ generate_signatures = bool(carve or print_signatures)
+ signatures = None
+
+ # Get all of the signatures (for tables only - not including "without rowid" and virtual tables)
+ if generate_signatures:
+
+ signatures = {}
+ logger.debug("Generating table signatures.")
+
+ for master_schema_entry in database.master_schema.master_schema_entries:
+
+ # Only account for the specified tables
+ if specified_tables_to_carve and master_schema_entry.name not in specified_tables_to_carve:
+ continue
+
+ """
+
+ Due to current implementation limitations we are restricting carving to table row types.
+
+ Note: This is not allowing "without rowid" or virtual tables until further testing is done.
+ (Virtual tables tend to have a root page number of 0 with no data stored in the main table. Further
+ investigation is needed.)
+
+ Note: Table internal schema objects will not be accounted for. These are tables that start with "sqlite_"
+ and are used internally by SQLite itself. These have never been known to produce any forensically
+ pertinent data.
+
+ """
+
+ if isinstance(master_schema_entry, OrdinaryTableRow):
+
+ if master_schema_entry.without_row_id:
+ log_message = "A `without row_id` table was found: {} and will not have a signature generated " \
+ "for carving since it is not supported yet.".format(master_schema_entry.table_name)
+ logger.info(log_message)
+ continue
+
+ if master_schema_entry.internal_schema_object:
+ log_message = "A `internal schema` table was found: {} and will not have a signature generated " \
+ "for carving since it is not supported yet.".format(master_schema_entry.table_name)
+ logger.info(log_message)
+ continue
+
+ signatures[master_schema_entry.name] = Signature(version_history, master_schema_entry)
+
+ if print_signatures:
+ print("\nSignature:\n{}".format(signatures[master_schema_entry.name]
+ .stringify("\t", False, False, False)))
+
+ """
+
+ Note: Master schema entries (the schema) are currently all pulled from the base version (the SQLite database
+ file). Even though schema additions in the WAL file are handled for existing tables, tables added in the
+ WAL have not been accounted for yet.
+
+ """
+
+ # Export to text
+ if export_type == EXPORT_TYPES.TEXT:
+ print_text(output_directory, file_prefix, export_type, carve, carve_freelists,
+ specified_tables_to_carve, version_history, signatures, logger)
+
+ # Export to csv
+ elif export_type == EXPORT_TYPES.CSV:
+ print_csv(output_directory, file_prefix, export_type, carve, carve_freelists,
+ specified_tables_to_carve, version_history, signatures, logger)
+
+ # Export to sqlite
+ elif export_type == EXPORT_TYPES.SQLITE:
+ print_sqlite(output_directory, file_prefix, export_type, carve, carve_freelists,
+ specified_tables_to_carve, version_history, signatures, logger)
+
+ # Export to xlsx
+ elif export_type == EXPORT_TYPES.XLSX:
+ print_xlsx(output_directory, file_prefix, export_type, carve, carve_freelists,
+ specified_tables_to_carve, version_history, signatures, logger)
+
+ # The export type was not recognized (this should not occur due to argparse validation)
+ else:
+ raise SqliteError("Invalid option for export type: {}.".format(export_type))
+
+ # Notify the user if a rollback journal was found but carving was not specified
+ if rollback_journal_file and not carve:
+ print("Rollback journal file found: {}. Rollback journal file parsing is under development and "
+ "currently only supports carving. Please rerun with the --carve option for this output."
+ .format(rollback_journal_file_name))
+
+ # Carve the rollback journal if found and carving is specified
+ if rollback_journal_file and carve:
+
+ if not output_directory:
+
+ print("Rollback journal file found: {}. Rollback journal file carving is under development and "
+ "currently only outputs to CSV. Due to this, the output directory needs to be specified. Please"
+ "rerun with a output directory specified in order for this to complete.")
+
+ else:
+
+ print("Carving rollback journal file: {}. Rollback journal file carving is under development and "
+ "currently only outputs to CSV. Any export type specified will be overridden for this.")
+
+ carve_rollback_journal(output_directory, rollback_journal_file, rollback_journal_file_name,
+ specified_tables_to_carve, rollback_journal_exempted_tables,
+ version_history, signatures, logger)
+
+ print("Finished in {} seconds.".format(round(time() - start_time, 2)))
+
+
+def print_text(output_directory, file_prefix, export_type, carve, carve_freelists, specified_tables_to_carve,
+ version_history, signatures, logger):
+
+ if output_directory:
+
+ file_postfix = ".txt"
+ text_file_name = file_prefix + file_postfix
+
+ # Export all index and table histories to a text file while supplying signature to carve with
+ print("\nExporting history as {} to {}{}{}...".format(export_type, output_directory, sep, text_file_name))
+ logger.debug("Exporting history as {} to {}{}{}."
+ .format(export_type, output_directory, sep, text_file_name))
+
+ with CommitTextExporter(output_directory, text_file_name) as commit_text_exporter:
+
+ for master_schema_entry in version_history.versions[BASE_VERSION_NUMBER]\
+ .master_schema.master_schema_entries:
+
+ # Only account for the specified tables
+ if specified_tables_to_carve and master_schema_entry.name not in specified_tables_to_carve:
+ continue
+
+ if master_schema_entry.row_type in [MASTER_SCHEMA_ROW_TYPE.INDEX, MASTER_SCHEMA_ROW_TYPE.TABLE]:
+
+ signature = None
+ if carve:
+ signature = signatures[master_schema_entry.name] if master_schema_entry.name in signatures\
+ else None
+
+ if not signature and master_schema_entry.row_type is MASTER_SCHEMA_ROW_TYPE.TABLE \
+ and not master_schema_entry.without_row_id \
+ and not master_schema_entry.internal_schema_object:
+ print("Unable to find signature for: {}. This table will not be carved."
+ .format(master_schema_entry.name))
+ logger.error("Unable to find signature for: {}. This table will not be carved."
+ .format(master_schema_entry.name))
+
+ if signature:
+ version_history_parser = VersionHistoryParser(version_history, master_schema_entry, None, None,
+ signature, carve_freelists)
+ else:
+ version_history_parser = VersionHistoryParser(version_history, master_schema_entry)
+
+ page_type = version_history_parser.page_type
+ commit_text_exporter.write_header(master_schema_entry, page_type)
+
+ for commit in version_history_parser:
+ commit_text_exporter.write_commit(commit)
+
+ else:
+
+ # Export all index and table histories to the console while supplying signatures to carve with
+ logger.debug("Exporting history to console as {}.".format(export_type))
+
+ for master_schema_entry in version_history.versions[BASE_VERSION_NUMBER].master_schema.master_schema_entries:
+
+ # Only account for the specified tables
+ if specified_tables_to_carve and master_schema_entry.name not in specified_tables_to_carve:
+ continue
+
+ if master_schema_entry.row_type in [MASTER_SCHEMA_ROW_TYPE.INDEX, MASTER_SCHEMA_ROW_TYPE.TABLE]:
+
+ signature = None
+ if carve:
+ signature = signatures[master_schema_entry.name] if master_schema_entry.name in signatures else None
+
+ if not signature and master_schema_entry.row_type is MASTER_SCHEMA_ROW_TYPE.TABLE \
+ and not master_schema_entry.without_row_id \
+ and not master_schema_entry.internal_schema_object:
+ print("Unable to find signature for: {}. This table will not be carved."
+ .format(master_schema_entry.name))
+ logger.error("Unable to find signature for: {}. This table will not be carved."
+ .format(master_schema_entry.name))
+
+ if signature:
+ version_history_parser = VersionHistoryParser(version_history, master_schema_entry, None, None,
+ signature, carve_freelists)
+ else:
+ version_history_parser = VersionHistoryParser(version_history, master_schema_entry)
+
+ page_type = version_history_parser.page_type
+ CommitConsoleExporter.write_header(master_schema_entry, page_type)
+
+ for commit in version_history_parser:
+ CommitConsoleExporter.write_commit(commit)
+
+
+def print_csv(output_directory, file_prefix, export_type, carve, carve_freelists, specified_tables_to_carve,
+ version_history, signatures, logger):
+
+ # Export all index and table histories to csv files while supplying signature to carve with
+ print("\nExporting history as {} to {}...".format(export_type, output_directory))
+ logger.debug("Exporting history to {} as {}.".format(output_directory, export_type))
+
+ commit_csv_exporter = CommitCsvExporter(output_directory, file_prefix)
+
+ for master_schema_entry in version_history.versions[BASE_VERSION_NUMBER].master_schema.master_schema_entries:
+
+ # Only account for the specified tables
+ if specified_tables_to_carve and master_schema_entry.name not in specified_tables_to_carve:
+ continue
+
+ if master_schema_entry.row_type in [MASTER_SCHEMA_ROW_TYPE.INDEX, MASTER_SCHEMA_ROW_TYPE.TABLE]:
+
+ signature = None
+ if carve:
+ signature = signatures[master_schema_entry.name] if master_schema_entry.name in signatures else None
+
+ if not signature and master_schema_entry.row_type is MASTER_SCHEMA_ROW_TYPE.TABLE \
+ and not master_schema_entry.without_row_id \
+ and not master_schema_entry.internal_schema_object:
+ print("Unable to find signature for: {}. This table will not be carved."
+ .format(master_schema_entry.name))
+ logger.error("Unable to find signature for: {}. This table will not be carved."
+ .format(master_schema_entry.name))
+
+ if signature:
+ version_history_parser = VersionHistoryParser(version_history, master_schema_entry, None, None,
+ signature, carve_freelists)
+ else:
+ version_history_parser = VersionHistoryParser(version_history, master_schema_entry)
+
+ for commit in version_history_parser:
+ commit_csv_exporter.write_commit(master_schema_entry, commit)
+
+
+def print_sqlite(output_directory, file_prefix, export_type, carve, carve_freelists,
+ specified_tables_to_carve, version_history, signatures, logger):
+
+ file_postfix = "-sqlite-dissect.db3"
+ sqlite_file_name = file_prefix + file_postfix
+
+ print("\nExporting history as {} to {}{}{}...".format(export_type, output_directory, sep, sqlite_file_name))
+ logger.debug("Exporting history as {} to {}{}{}.".format(export_type, output_directory, sep, sqlite_file_name))
+
+ with CommitSqliteExporter(output_directory, sqlite_file_name) as commit_sqlite_exporter:
+
+ for master_schema_entry in version_history.versions[BASE_VERSION_NUMBER].master_schema.master_schema_entries:
+
+ # Only account for the specified tables
+ if specified_tables_to_carve and master_schema_entry.name not in specified_tables_to_carve:
+ continue
+
+ if master_schema_entry.row_type in [MASTER_SCHEMA_ROW_TYPE.INDEX, MASTER_SCHEMA_ROW_TYPE.TABLE]:
+
+ signature = None
+ if carve:
+ signature = signatures[master_schema_entry.name] if master_schema_entry.name in signatures else None
+
+ if not signature and master_schema_entry.row_type is MASTER_SCHEMA_ROW_TYPE.TABLE \
+ and not master_schema_entry.without_row_id \
+ and not master_schema_entry.internal_schema_object:
+ print("Unable to find signature for: {}. This table will not be carved."
+ .format(master_schema_entry.name))
+ logger.error("Unable to find signature for: {}. This table will not be carved."
+ .format(master_schema_entry.name))
+
+ if signature:
+ version_history_parser = VersionHistoryParser(version_history, master_schema_entry, None, None,
+ signature, carve_freelists)
+ else:
+ version_history_parser = VersionHistoryParser(version_history, master_schema_entry)
+
+ for commit in version_history_parser:
+ commit_sqlite_exporter.write_commit(master_schema_entry, commit)
+
+
+def print_xlsx(output_directory, file_prefix, export_type, carve, carve_freelists, specified_tables_to_carve,
+ version_history, signatures, logger):
+
+ file_postfix = ".xlsx"
+ xlsx_file_name = file_prefix + file_postfix
+
+ # Export all index and table histories to an xlsx workbook while supplying signatures to carve with
+ print("\nExporting history as {} to {}{}{}...".format(export_type, output_directory, sep, xlsx_file_name))
+ logger.debug("Exporting history as {} to {}{}{}.".format(export_type, output_directory, sep, xlsx_file_name))
+
+ with CommitXlsxExporter(output_directory, xlsx_file_name) as commit_xlsx_exporter:
+
+ for master_schema_entry in version_history.versions[BASE_VERSION_NUMBER].master_schema.master_schema_entries:
+
+ # Only account for the specified tables
+ if specified_tables_to_carve and master_schema_entry.name not in specified_tables_to_carve:
+ continue
+
+ if master_schema_entry.row_type in [MASTER_SCHEMA_ROW_TYPE.INDEX, MASTER_SCHEMA_ROW_TYPE.TABLE]:
+
+ signature = None
+ if carve:
+ signature = signatures[master_schema_entry.name] if master_schema_entry.name in signatures else None
+
+ if not signature and master_schema_entry.row_type is MASTER_SCHEMA_ROW_TYPE.TABLE \
+ and not master_schema_entry.without_row_id \
+ and not master_schema_entry.internal_schema_object:
+ print("Unable to find signature for: {}. This table will not be carved."
+ .format(master_schema_entry.name))
+ logger.error("Unable to find signature for: {}. This table will not be carved."
+ .format(master_schema_entry.name))
+
+ if signature:
+ version_history_parser = VersionHistoryParser(version_history, master_schema_entry, None, None,
+ signature, carve_freelists)
+ else:
+ version_history_parser = VersionHistoryParser(version_history, master_schema_entry)
+
+ for commit in version_history_parser:
+ commit_xlsx_exporter.write_commit(master_schema_entry, commit)
+
+
+def carve_rollback_journal(output_directory, rollback_journal_file, rollback_journal_file_name,
+ specified_tables_to_carve, rollback_journal_exempted_tables,
+ version_history, signatures, logger):
+
+ """
+
+ Carve the Rollback Journal file (Under Development)
+
+ Note: Since normal parsing of the rollback journal file is not implemented yet, this is only done when
+ carving is specified. Also, since we currently carve each page in the rollback journal blindly,
+ we are not checking for pointer map pages, freelist pages, and so on. Therefore, the carve_freelist_pages
+ option does not apply here. The rollback journal file is carved as if it were all unallocated space.
+
+ """
+
+ csv_prefix_rollback_journal_file_name = basename(normpath(rollback_journal_file_name))
+ print("Exporting rollback journal carvings as CSV to {}...".format(output_directory))
+ logger.debug("Exporting rollback journal carvings as csv to output directory: {}.".format(output_directory))
+
+ commit_csv_exporter = CommitCsvExporter(output_directory, csv_prefix_rollback_journal_file_name)
+
+ for master_schema_entry in version_history.versions[BASE_VERSION_NUMBER].master_schema.master_schema_entries:
+
+ # Only account for the specified tables
+ if specified_tables_to_carve and master_schema_entry.name not in specified_tables_to_carve:
+ continue
+
+ if master_schema_entry.name in rollback_journal_exempted_tables:
+ logger.debug("Skipping exempted table: {} from rollback journal parsing.".format(master_schema_entry.name))
+ continue
+
+ """
+
+ Only account for OrdinaryTableRow objects (not VirtualTableRow objects) that are not "without rowid" tables.
+ All signatures generated will not be outside this criteria either.
+
+ """
+
+ if isinstance(master_schema_entry, OrdinaryTableRow) and not master_schema_entry.without_row_id:
+
+ signature = None
+ if signatures and master_schema_entry.name in signatures:
+ signature = signatures[master_schema_entry.name]
+
+ # Make sure we found the signature, but do not error out if we did not; just alert the user
+ if not signature and master_schema_entry.row_type is MASTER_SCHEMA_ROW_TYPE.TABLE \
+ and not master_schema_entry.without_row_id \
+ and not master_schema_entry.internal_schema_object:
+ print("Unable to find signature for: {}. This table will not be carved from the rollback journal."
+ .format(master_schema_entry.name))
+ logger.error("Unable to find signature for: {}. This table will not be carved from the "
+ "rollback journal.".format(master_schema_entry.name))
+
+ else:
+
+ # Carve the rollback journal with the signature
+ carved_commits = RollBackJournalCarver.carve(rollback_journal_file,
+ version_history.versions[BASE_VERSION_NUMBER],
+ master_schema_entry, signature)
+
+ for commit in carved_commits:
+ commit_csv_exporter.write_commit(master_schema_entry, commit)
+
+
+if __name__ == "__main__":
+
+ description = "SQLite Dissect is a SQLite parser with recovery abilities over SQLite databases " \
+ "and their accompanying journal files. If no options are set other than the file " \
+ "name, the default behaviour will be to check for any journal files and print to " \
+ "the console the output of the SQLite files. The directory of the SQLite file " \
+ "specified will be searched through to find the associated journal files. If " \
+ "they are not in the same directory as the specified file, they will not be found " \
+ "and their location will need to be specified in the command. SQLite carving " \
+ "will not be done by default. Please see the options below to enable carving."
+
+ parser = ArgumentParser(description=description)
+
+ parser.add_argument("sqlite_file", metavar="SQLITE_FILE", help="The SQLite database file")
+
+ parser.add_argument("-v", "--version", action="version", version="version {version}".format(version=__version__),
+ help="display the version of SQLite Dissect")
+ parser.add_argument("-d", "--directory", metavar="OUTPUT_DIRECTORY", help="directory to write output to "
+ "(must be specified for outputs other "
+ "than console text)")
+ parser.add_argument("-p", "--file-prefix", default="", metavar="FILE_PREFIX",
+ help="the file prefix to use on output files, default is the name of the SQLite "
+ "file (the directory for output must be specified)")
+ parser.add_argument("-e", "--export", choices=["text", "csv", "sqlite", "xlsx"], default="text",
+ metavar="EXPORT_TYPE",
+ help="the format to export to {text, csv, sqlite, xlsx} (text written to console if -d "
+ "is not specified)")
+
+ journal_group = parser.add_mutually_exclusive_group()
+ journal_group.add_argument("-n", "--no-journal", action="store_true", default=False,
+ help="turn off automatic detection of journal files")
+ journal_group.add_argument("-w", "--wal",
+ help="the wal file to use instead of searching the SQLite file directory by default")
+ journal_group.add_argument("-j", "--rollback-journal",
+ help="the rollback journal file to use in carving instead of searching the SQLite file "
+ "directory by default (under development, currently only outputs to csv, output "
+ "directory needs to be specified)")
+
+ parser.add_argument("-r", "--exempted-tables", metavar="EXEMPTED_TABLES",
+ help="comma-delimited string of tables [table1,table2,table3] to exempt (only implemented "
+ "and allowed for rollback journal parsing currently) ex.) table1,table2,table3")
+
+ parser.add_argument("-s", "--schema", action="store_true",
+ help="output the schema to console, the initial schema found in the main database file")
+ parser.add_argument("-t", "--schema-history", action="store_true",
+ help="output the schema history to console, prints the --schema information and "
+ "write-head log changes")
+
+ parser.add_argument("-g", "--signatures", action="store_true",
+ help="output the signatures generated to console")
+
+ parser.add_argument("-c", "--carve", action="store_true", default=False,
+ help="carves and recovers table data")
+ parser.add_argument("-f", "--carve-freelists", action="store_true", default=False,
+ help="carves freelist pages (carving must be enabled, under development)")
+
+ parser.add_argument("-b", "--tables", metavar="TABLES",
+ help="specified comma-delimited string of tables [table1,table2,table3] to carve "
+ "ex.) table1,table2,table3")
+
+ parser.add_argument("-k", "--disable-strict-format-checking", action="store_true", default=False,
+ help="disable strict format checks for SQLite databases "
+ "(this may result in improperly parsed SQLite files)")
+
+ logging_group = parser.add_mutually_exclusive_group()
+ logging_group.add_argument("-l", "--log-level", default="off",
+ choices=["critical", "error", "warning", "info", "debug", "off"],
+ metavar="LOG_LEVEL",
+ help="level to log messages at {critical, error, warning, info, debug, off}")
+ parser.add_argument("-i", "--log-file", default=None, metavar="LOG_FILE",
+ help="log file to write too, default is to "
+ "write to console, ignored if log "
+ "level set to off (appends if file "
+ "already exists)")
+
+ parser.add_argument("--warnings", action="store_true", default=False, help="enable runtime warnings")
+
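+ # Example invocation (hypothetical paths; all options used here are defined above):
+ #   sqlite_dissect /cases/example.db3 -d ./output -e csv --carve
+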
+ # Call the main function
+ main(parser.parse_args())
diff --git a/pyinstaller/sqlite_dissect_linux-x64_onedir.spec b/pyinstaller/sqlite_dissect_linux-x64_onedir.spec
new file mode 100644
index 0000000..c400d13
--- /dev/null
+++ b/pyinstaller/sqlite_dissect_linux-x64_onedir.spec
@@ -0,0 +1,41 @@
+# Initially generated with the "pyinstaller main.py" command. Altered after for minor changes.
+# Consecutively run after modifications from the project root directory as:
+# pyinstaller pyinstaller\sqlite_dissect_linux-x64_onedir.spec
+# Please see https://github.com/pyinstaller/pyinstaller/issues/5540 if errors with the ldconfig are encountered.
+# -*- mode: python -*-
+
+import PyInstaller.config
+
+PyInstaller.config.CONF['distpath'] = "./dist/linux-x64"
+
+block_cipher = None
+
+
+a = Analysis(['../main.py'],
+ pathex=[],
+ binaries=[],
+ datas=[],
+ hiddenimports=[],
+ hookspath=[],
+ runtime_hooks=[],
+ excludes=[],
+ win_no_prefer_redirects=False,
+ win_private_assemblies=False,
+ cipher=block_cipher)
+pyz = PYZ(a.pure, a.zipped_data,
+ cipher=block_cipher)
+exe = EXE(pyz,
+ a.scripts,
+ exclude_binaries=True,
+ name='sqlite_dissect',
+ debug=False,
+ strip=False,
+ upx=True,
+ console=True )
+coll = COLLECT(exe,
+ a.binaries,
+ a.zipfiles,
+ a.datas,
+ strip=False,
+ upx=True,
+ name='sqlite_dissect')
diff --git a/pyinstaller/sqlite_dissect_linux-x64_onefile.spec b/pyinstaller/sqlite_dissect_linux-x64_onefile.spec
new file mode 100644
index 0000000..82dd684
--- /dev/null
+++ b/pyinstaller/sqlite_dissect_linux-x64_onefile.spec
@@ -0,0 +1,37 @@
+# Initially generated with the "pyinstaller main.py --onefile" command. Altered after for minor changes.
+# Consecutively run after modifications from the project root directory as:
+# pyinstaller pyinstaller\sqlite_dissect_linux-x64_onefile.spec
+# Please see https://github.com/pyinstaller/pyinstaller/issues/5540 if errors with the ldconfig are encountered.
+# -*- mode: python -*-
+
+import PyInstaller.config
+
+PyInstaller.config.CONF['distpath'] = "./dist/linux-x64/bin"
+
+block_cipher = None
+
+
+a = Analysis(['../main.py'],
+ pathex=[],
+ binaries=[],
+ datas=[],
+ hiddenimports=[],
+ hookspath=[],
+ runtime_hooks=[],
+ excludes=[],
+ win_no_prefer_redirects=False,
+ win_private_assemblies=False,
+ cipher=block_cipher)
+pyz = PYZ(a.pure, a.zipped_data,
+ cipher=block_cipher)
+exe = EXE(pyz,
+ a.scripts,
+ a.binaries,
+ a.zipfiles,
+ a.datas,
+ name='sqlite_dissect',
+ debug=False,
+ strip=False,
+ upx=True,
+ runtime_tmpdir=None,
+ console=True )
diff --git a/pyinstaller/sqlite_dissect_win-x86_64_onedir.spec b/pyinstaller/sqlite_dissect_win-x86_64_onedir.spec
new file mode 100644
index 0000000..9daa043
--- /dev/null
+++ b/pyinstaller/sqlite_dissect_win-x86_64_onedir.spec
@@ -0,0 +1,40 @@
+# Initially generated with the "pyinstaller main.py" command. Altered after for minor changes.
+# Consecutively run after modifications from the project root directory as:
+# pyinstaller pyinstaller\sqlite_dissect_win-x86_64_onedir.spec
+# -*- mode: python -*-
+
+import PyInstaller.config
+
+PyInstaller.config.CONF['distpath'] = "./dist/win-x86_64"
+
+block_cipher = None
+
+
+a = Analysis(['../main.py'],
+ pathex=[],
+ binaries=[],
+ datas=[],
+ hiddenimports=[],
+ hookspath=[],
+ runtime_hooks=[],
+ excludes=[],
+ win_no_prefer_redirects=False,
+ win_private_assemblies=False,
+ cipher=block_cipher)
+pyz = PYZ(a.pure, a.zipped_data,
+ cipher=block_cipher)
+exe = EXE(pyz,
+ a.scripts,
+ exclude_binaries=True,
+ name='sqlite_dissect',
+ debug=False,
+ strip=False,
+ upx=True,
+ console=True )
+coll = COLLECT(exe,
+ a.binaries,
+ a.zipfiles,
+ a.datas,
+ strip=False,
+ upx=True,
+ name='sqlite_dissect')
diff --git a/pyinstaller/sqlite_dissect_win-x86_64_onefile.spec b/pyinstaller/sqlite_dissect_win-x86_64_onefile.spec
new file mode 100644
index 0000000..1ca52aa
--- /dev/null
+++ b/pyinstaller/sqlite_dissect_win-x86_64_onefile.spec
@@ -0,0 +1,36 @@
+# Initially generated with the "pyinstaller main.py --onefile" command. Altered after for minor changes.
+# Consecutively run after modifications from the project root directory as:
+# pyinstaller pyinstaller\sqlite_dissect_win-x86_64_onefile.spec
+# -*- mode: python -*-
+
+import PyInstaller.config
+
+PyInstaller.config.CONF['distpath'] = "./dist/win-x86_64/bin"
+
+block_cipher = None
+
+
+a = Analysis(['../main.py'],
+ pathex=[],
+ binaries=[],
+ datas=[],
+ hiddenimports=[],
+ hookspath=[],
+ runtime_hooks=[],
+ excludes=[],
+ win_no_prefer_redirects=False,
+ win_private_assemblies=False,
+ cipher=block_cipher)
+pyz = PYZ(a.pure, a.zipped_data,
+ cipher=block_cipher)
+exe = EXE(pyz,
+ a.scripts,
+ a.binaries,
+ a.zipfiles,
+ a.datas,
+ name='sqlite_dissect',
+ debug=False,
+ strip=False,
+ upx=True,
+ runtime_tmpdir=None,
+ console=True )
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..c265a6f
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,41 @@
+from setuptools import setup
+from _version import __version__
+
+"""
+
+setup.py
+
+This script is used to set up the sqlite_dissect package for use in Python environments.
+
+Note: To compile a distribution for the project run "python setup.py sdist" in the directory this file is located in.
+
+Note: openpyxl is needed for the xlsx export and will install jdcal and et-xmlfile ["openpyxl>=2.4.0b1"]
+
+Note: PyInstaller is used to generate executables but is not included in this setup.py script. It will
+ install altgraph, dis3, macholib, pefile, pypiwin32, and pywin32 as dependencies. [pyinstaller==3.6 must be
+ used for Python 2.7 since PyInstaller 4.0+ requires Python 3.6.] Information on how to run PyInstaller is
+ included in the spec files under the pyinstaller directory. Four files are there, two for Windows and two for
+ Linux, all for x64 platforms. The two files per platform allow building either a single file or a directory
+ of decompressed files. Since the single-file build extracts to a temporary directory in order to run, which
+ may be blocked on some systems, the directory build is preferred.
+
+"""
+
+setup(name="sqlite_dissect",
+ version=__version__,
+ url="https://github.com/Defense-Cyber-Crime-Center/sqlite-dissect",
+ description="This package allows parsing and carving of sqlite files",
+ author="Defense Cyber Crime Center (DC3)",
+ author_email="TSD@dc3.mil",
+ packages=["sqlite_dissect",
+ "sqlite_dissect.file",
+ "sqlite_dissect.file.database",
+ "sqlite_dissect.file.journal",
+ "sqlite_dissect.file.schema",
+ "sqlite_dissect.file.wal",
+ "sqlite_dissect.file.wal_index",
+ "sqlite_dissect.carving",
+ "sqlite_dissect.export"],
+ install_requires=["openpyxl>=2.4.0b1"],
+ zip_safe=False
+ )
diff --git a/sqlite_dissect/README.md b/sqlite_dissect/README.md
new file mode 100644
index 0000000..a02d3b5
--- /dev/null
+++ b/sqlite_dissect/README.md
@@ -0,0 +1,205 @@
+
+# sqlite_dissect
+
+This package holds the scripts used throughout the SQLite Dissect library, providing the functionality to parse
+through the data and access the underlying functions through an interface.
+
+The init script initializes the logger for this library with a NullHandler to prevent unexpected output in
+applications that may not implement logging. It also ignores warnings reported through the Python warnings
+framework by default. (Warnings are also sent to the logger when they occur, in addition to the warnings
+framework.)
+
+>Note: This library will use warnings for things that may not be fully implemented or handled yet. (In other cases,
+> NotImplementedErrors may be raised.) To turn off warnings use the "-W ignore" option. See the Python
+> documentation for further options.
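+
+As a minimal sketch (not taken from the library itself), an application that wants to see SQLite Dissect log
+output could attach its own handler to the library logger named by the `LOGGER_NAME` constant:
+
+```python
+import logging
+
+from sqlite_dissect.constants import LOGGER_NAME
+
+# The library only installs a NullHandler, so add a handler and a level to surface its messages.
+handler = logging.StreamHandler()
+handler.setFormatter(logging.Formatter("%(levelname)s %(name)s: %(message)s"))
+
+library_logger = logging.getLogger(LOGGER_NAME)
+library_logger.addHandler(handler)
+library_logger.setLevel(logging.DEBUG)
+```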
+
+- constants.py
+- exception.py
+- interface.py
+- output.py
+- utilities.py
+- version_history.py
+
+TODO items for the "sqlite_dissect" package:
+
+- [ ] Finish UML class diagrams.
+- [ ] \_\_init\_\_.py: Create a raise exception function to call to reduce lines of code that will log inside of it.
+- [ ] \_\_init\_\_.py: Create global static variables to be used for store_in_memory, strict_format_checking, etc.
+- [ ] \_\_init\_\_.py: Implement strict_format_checking into journal, other types besides database, wal
+- [ ] \_\_init\_\_.py: Investigate differences in use of logging.warn vs. warnings.warn.
+- [ ] \_\_init\_\_.py: Create custom warnings for the library.
+
+
+
+### constants.py
+
+This script holds constants defined for reference by the sqlite carving library. Additionally, a class has been
+added to this script for constant enumerations.
+
+This script holds the following object(s):
+- Enum(MutableMapping)
+
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+
+
+
+### exception.py
+
+This script holds the custom exceptions used in this library.
+
+This script holds the following object(s):
+- SqliteError(Exception)
+- ParsingError(SqliteError)
+- HeaderParsingError(ParsingError)
+- MasterSchemaParsingError(ParsingError)
+- MasterSchemaRowParsingError(MasterSchemaParsingError)
+- PageParsingError(ParsingError)
+- BTreePageParsingError(PageParsingError)
+- CellParsingError(BTreePageParsingError)
+- RecordParsingError(CellParsingError)
+- VersionParsingError(ParsingError)
+- DatabaseParsingError(VersionParsingError)
+- WalParsingError(VersionParsingError)
+- WalFrameParsingError(WalParsingError)
+- WalCommitRecordParsingError(WalParsingError)
+- SignatureError(SqliteError)
+- CarvingError(SqliteError)
+- CellCarvingError(CarvingError)
+- InvalidVarIntError(CarvingError)
+- OutputError(SqliteError)
+- ExportError(SqliteError)
+
+
+TODO:
+- [ ] Documentation improvements.
+
+
+
+### interface.py
+
+This script acts as a simplified interface for common operations for the sqlite carving library.
+
+This script holds the following object(s):
+- create_database(file_identifier, store_in_memory=False, strict_format_checking=True)
+- create_write_ahead_log(file_name, file_object=None)
+- create_version_history(database, write_ahead_log=None)
+- get_table_names(database)
+- get_index_names(database)
+- select_all_from_table(table_name, version)
+- select_all_from_index(index_name, version)
+- create_table_signature(table_name, version, version_history=None)
+- carve_table(table_name, signature, version)
+- get_version_history_iterator(table_or_index_name, version_history, signature=None)
+- export_table_or_index_version_history_to_csv(export_directory, version_history, table_or_index_name, signature=None, carve_freelist_pages=False)
+- export_version_history_to_csv(export_directory, version_history, signatures=None, carve_freelist_pages=False)
+- export_table_or_index_version_history_to_sqlite(export_directory, sqlite_file_name, version_history, table_or_index_name, signature=None, carve_freelist_pages=False)
+- export_version_history_to_sqlite(export_directory, sqlite_file_name, version_history, signatures=None, carve_freelist_pages=False)
+
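+A minimal usage sketch under the signatures listed above (the file name is hypothetical, and treating the
+database object as the base version is an assumption of this sketch):
+
+```python
+from sqlite_dissect.interface import (create_database, create_table_signature,
+                                      create_version_history, get_table_names,
+                                      select_all_from_table)
+
+database = create_database("example.db3")              # hypothetical file name
+version_history = create_version_history(database)
+
+for table_name in get_table_names(database):
+    # The database object is used as the "version" argument here (an assumption of this sketch).
+    for row in select_all_from_table(table_name, database):
+        print(table_name, row)
+
+    # Signatures are generated per table; supplying the version history is recommended when available.
+    signature = create_table_signature(table_name, database, version_history=version_history)
+```
+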
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+- [ ] Account for schema changes across the versions.
+- [ ] Implement index signatures.
+- [ ] Update documentation on the BASE_VERSION_NUMBER where it is used.
+- [ ] create_table_signature: Note on how the version history is recommended if possible.
+
+
+
+### output.py
+
+This script holds general output functions used for debugging, logging, and general output for the
+sqlite carving library.
+
+This script holds the following object(s):
+- get_page_breakdown(pages)
+- get_pointer_map_entries_breakdown(version)
+- stringify_b_tree(version_interface, b_tree_root_page, padding="")
+- stringify_cell_record(cell, database_text_encoding, page_type)
+- stringify_cell_records(cells, database_text_encoding, page_type)
+- stringify_master_schema_version(version)
+- stringify_master_schema_versions(version_history)
+- stringify_page_history(version_history, padding="")
+- stringify_page_information(version, padding="")
+- stringify_page_structure(version, padding="")
+- stringify_unallocated_space(version, padding="", include_whitespace=True, whitespace_threshold=0)
+- stringify_version_pages(version, padding="")
+
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+- [ ] Implement better exception handling when parsing objects.
+- [ ] Make sure different encodings are handled in every function in this script where applicable.
+- [ ] get_pointer_map_entries_breakdown: Handle the pointer map page breakdown tuple better.
+- [ ] stringify_unallocated_space: Implement a whitespace threshold for trimming, etc.
+
+
+
+### utilities.py
+
+This script holds general utility functions for reference by the sqlite carving library.
+
+This script holds the following object(s):
+- calculate_expected_overflow(overflow_byte_size, page_size)
+- decode_varint(byte_array, offset)
+- encode_varint(value)
+- get_class_instance(class_name)
+- get_md5_hash(string)
+- get_record_content(serial_type, record_body, offset=0)
+- get_serial_type_signature(serial_type)
+- get_storage_class(serial_type)
+- has_content(byte_array)
+
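+As an illustration of the varint format these helpers work with (a Python 3 sketch, not the library's own
+implementation, whose signature and return convention may differ): SQLite varints are big-endian, 1 to 9 bytes
+long, with the low 7 bits of each of the first 8 bytes and all 8 bits of a 9th byte contributing to the value.
+
+```python
+def decode_varint_sketch(byte_array, offset=0):
+    """Decode a SQLite big-endian varint; return (value, bytes_consumed)."""
+    value = 0
+    for i in range(9):
+        byte = byte_array[offset + i]
+        if i == 8:
+            # The ninth byte contributes all 8 of its bits.
+            return (value << 8) | byte, 9
+        value = (value << 7) | (byte & 0x7F)
+        if not byte & 0x80:
+            # A clear high-order bit ends the varint.
+            return value, i + 1
+
+
+# decode_varint_sketch(b"\x81\x00") == (128, 2)
+```
+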
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+- [ ] Implement try/except exception handling for struct.error and ord.
+- [ ] The varint related functions only work in big endian. Are there use cases for little endian?
+
+
+
+### version_history.py
+
+This script holds the superclass objects used for parsing the database and write ahead log in a sequence of versions
+throughout all of the commit records in the write ahead log.
+
+This script holds the following object(s):
+- VersionHistory(object)
+- VersionHistoryParser(VersionParser) (with VersionHistoryParserIterator(object) as an inner class)
+- Commit(object)
+
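+A minimal sketch of how these objects fit together, using the interface helpers to build the inputs (file names
+are hypothetical, and the module paths for the constant and classes are assumptions of this sketch):
+
+```python
+from sqlite_dissect.constants import BASE_VERSION_NUMBER                          # assumed location
+from sqlite_dissect.interface import create_database, create_write_ahead_log
+from sqlite_dissect.version_history import VersionHistory, VersionHistoryParser   # assumed module path
+
+database = create_database("example.db3")                    # hypothetical file names
+write_ahead_log = create_write_ahead_log("example.db3-wal")
+
+version_history = VersionHistory(database, write_ahead_log)
+
+# Walk every master schema entry in the base version and print each commit observed for it.
+base_version = version_history.versions[BASE_VERSION_NUMBER]
+for master_schema_entry in base_version.master_schema.master_schema_entries:
+    for commit in VersionHistoryParser(version_history, master_schema_entry):
+        print(commit)
+```
+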
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Incorporate journal files once they are implemented.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+- [ ] Handle exceptions that may be raised from creating and working with objects better.
+ ##### VersionHistory Class:
+ - [ ] Better exception handling when creating objects such as commit records, etc.
+ - [ ] Investigate what occurs if the last commit record is not committed (warning currently thrown).
+ ##### VersionHistoryParser Class:
+ - [ ] Support the same master schema entry being deleted and then re-added (Keep in mind row id).
+ - [ ] How to handle master schema entries not found in specified versions?
+ - [ ] Support for virtual table modules of master schema entry table type.
+ - [ ] Support for "without rowid" tables (index b-tree pages).
+ - [ ] Support for index b-trees that are internal schema objects with no SQL.
+ - [ ] Investigate issues with same rows in index b-tree leaf pages that might get deleted.
+ - [ ] Track pages being moved to the freelist to account for carving with other signatures?
+ - [ ] Handle master schema entries that have no entries (view, trigger, etc.) in the iterator.
+ - [ ] Handle master schema entries that are not supported yet (virtual, etc.) in the iterator.
+ - [ ] Use accounted for cell digests for deleted cells in the aggregate leaf cells function?
+ - [ ] How to detect index leaf page cell updates (file offset may not work and no row id).
+ - [ ] Is checking the row id sufficient for detecting cell updates on table leaf pages?
+ - [ ] Does this class belong here and should carving be incorporated or separate to this class?
+ - [ ] Have a better way to specify if carving was enabled or not (possibly in Commit?).
+ - [ ] VersionParserIterator: Investigate what to return for version with no modification.
+ - [ ] VersionParserIterator: Extend carving capabilities beyond tables once implemented.
+ - [ ] VersionParserIterator: Check carvings are correctly being detected as duplicates per md5.
+ - [ ] VersionParserIterator: Use dictionary comprehension for added and deleted cells for loops.
+ ##### Commit Class:
+ - [ ] Handle the updated property differently depending on differences in b-tree and freelist changes.
diff --git a/sqlite_dissect/__init__.py b/sqlite_dissect/__init__.py
new file mode 100644
index 0000000..2114bac
--- /dev/null
+++ b/sqlite_dissect/__init__.py
@@ -0,0 +1,48 @@
+import logging
+import warnings
+from sqlite_dissect.constants import LOGGER_NAME
+
+"""
+
+__init__.py
+
+This package holds the scripts used throughout the SQLite Dissect library, providing the functionality to parse
+through the data and access the underlying functions through an interface.
+
+This init script initializes the logger for this library with a NullHandler to prevent unexpected output in
+applications that may not implement logging. It also ignores warnings reported through the Python warnings
+framework by default. (Warnings are also sent to the logger when they occur, in addition to the warnings
+framework.)
+
+Note: This library will use warnings for things that may not be fully implemented or handled yet. (In other cases,
+ NotImplementedErrors may be raised.) To turn off warnings use the "-W ignore" option. See the Python
+ documentation for further options.
+
+"""
+
+
+# Import the interface functions into the package namespace
+from sqlite_dissect.interface import *
+
+
+def null_logger():
+ try:
+
+ # Import the NullHandler from the logging package
+ from logging import NullHandler
+
+ except ImportError:
+
+ # Define our own NullHandler if an error occurs while importing
+ class NullHandler(logging.Handler):
+
+ def emit(self, record):
+ pass
+
+ # Get the logger from the LOGGER_NAME constant and add the NullHandler to it
+ logging.getLogger(LOGGER_NAME).addHandler(NullHandler())
+
+ logging.getLogger(LOGGER_NAME).propagate = False
+
+ # Ignore warnings by default
+ warnings.filterwarnings("ignore")
diff --git a/sqlite_dissect/carving/README.md b/sqlite_dissect/carving/README.md
new file mode 100644
index 0000000..ee7ac5f
--- /dev/null
+++ b/sqlite_dissect/carving/README.md
@@ -0,0 +1,224 @@
+
+# sqlite_dissect.carving
+
+This package will control signature generation and carving of SQLite files.
+
+- carved_cell.py
+- carver.py
+- rollback_journal_carver.py
+- signature.py
+- utilities.py
+
+TODO items for the "carving" package:
+
+- [ ] Finish UML class diagrams.
+
+
+
+### carved_cell.py
+
+This script holds the objects used for carving cells from the unallocated and freeblock space in SQLite
+b-tree pages used in conjunction with other classes in the carving package. These objects subclass their
+respective higher level SQLite database object type and add to them while parsing the data in a different way.
+
+This script holds the following object(s):
+- CarvedBTreeCell(BTreeCell)
+- CarvedRecord(Payload)
+- CarvedRecordColumn(RecordColumn)
+
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+- [ ] Investigate a way to account for overflow.
+- [ ] Investigate if fragments exist and have any effect on carving.
+- [ ] Subclass CarvedBTreeCell for b-tree cell types.
+- [ ] Subclass CarvedRecord for freeblock and unallocated space algorithms for carving.
+- [ ] Handle multi-byte varints (blob and text serial types) better.
+- [ ] How to account for use cases where carved data is all 0x00 bytes.
+- [ ] Handle use cases where the primary key is an integer and negative resulting in negative (9-byte) varints.
+- [ ] Fix the start and end offset and account for the freeblock, freeblock_size, and next_freeblock_offset.
+- [ ] For the first serial types need to cross reference first column if integer primary key in table b-tree leaf table == null 00
+- [ ] For the first serial types need to cross reference with row signatures (if not schema) (prob + focused + schema + first removed on unalloc etc)
+- [ ] Address the row_id as being set initially to "Unknown" which was temporarily added for consistency with other cells (b-tree) and need to check other use cases.
+- [ ] Check that the payload size is less than the length or else partial entry.
+- [ ] Add better logging.
+- [ ] Calculate or analyze MD5s of headers.
+- [ ] Figure out how MD5 hashes will work on carved record, carved record columns, and carved b-tree cells.
+- [ ] Look into the calculated body content size assuming one (correct) entry in the signature.
+- [ ] Address header/body/etc byte sizes.
+- [ ] Check size of record columns to expected columns.
+ ##### CarvedBTreeCell Class
+ - [ ] Remove the first column serial types now that the signature is sent in?
+ - [ ] Handle the version and page version number correctly in reference to journal file parsing.
+ ##### CarvedRecord Class
+ - [ ] See if basing the first_serial_type off of other carved cells if found before redoing unallocated/freeblocks if possible.
+ - [ ] When checking the signature, see if there is a better way to utilize it if there are no entries, like switching to the schema signature (b-tree leaf?).
+ - [ ] Address the truncated record column index/column name.
+ - [ ] Handle cutoff_offset relation to truncated and indexing.
+ - [ ] Handle overflow.
+ - [ ] Fragment parsing.
+ - [ ] Subclass types of cells, freeblock.
+ - [ ] What if the assumed preceding serial type is not in the first serial types sign (use prob?).
+ - [ ] Address issues that can occur when first_serial_type_varint_length != -1.
+ - [ ] Need documentation on how the serial type is always obtainable for freeblocks at least only if the next two bytes != size (ie. sub freeblock) if the start offset >= 2 and it is a freeblock.
+ - [ ] Check the equals (>= and <) for start offset >= 2 and is a freeblock while iterating through the carved record columns.
+ - [ ] Update debugging messages (for example, after except like with InvalidVarIntError)
+ - [ ] If string or blob may be able to iterate backwards until proper offsets are found and look into other use cases.
+ - [ ] Document use cases for first_column_serial_types (4?).
+ - [ ] Report size of missing data/columns/etc if truncated for carved_record_column objects.
+ - [ ] Look into sending unallocated byte size in the constructor for carved_record_column objects.
+ - [ ] Specify if the unallocated information is included or overwritten in the header for carved_record_column objects.
+ - [ ] Document after adjusting the serial type definition size off of the first serial type specified for carved_record_column objects.
+ - [ ] Need documentation on the "32" number [ (9 - 4) + 9 + 9 + 9 ] = up to 32 bytes preceding (derived header byte size).
+ - [ ] Using the simplified_probabilistic_signature can give bad data.
+ - [ ] Fix when the serial type is 12 or 13. If the signatures is -1 or -2 should be 0->57 (min/max).
+ - [ ] Try doing a reverse search for row id and payload length (assuming 1 varint length for row id).
+ - [ ] Derive differences between derived payload and actual payload if actual is not found (and other fields).
+ - [ ] Need to reverse search for row id and payload length (assuming 1 varint length for row id).
+ ##### CarvedRecordColumn Class
+ - [ ] Incorporate absolute offsets.
+ - [ ] Calculate and set the md5 hex digest.
+ - [ ] Handle the value and md5 hex digest (and probably others) so values are sent into \_\_init\_\_?
+ - [ ] Handle table interior, index interior, index leaf, and additional use cases.
+ - [ ] Make sure string values are in the correct text encoding for the database.
+ - [ ] Use \_\_slots\_\_ or some other way to reduce memory since many of these objects will be created.
+ - [ ] Update documentation around the no bytes preceding note.
+
+
+
+### carver.py
+
+This script holds carver objects for identifying and parsing out cells from unallocated and
+freeblock space in SQLite b-tree pages.
+
+This script holds the following object(s):
+- SignatureCarver(Carver)
+
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+- [ ] On some files (ex. talk.sqlite), lots of "empty space" signatures were printed. Fix these use cases.
+- [ ] Account for changing schemas (schema cookie, etc.).
+- [ ] Investigate if there is a way to handle fragments (fragment "sizes" can be > 3).
+- [ ] Better handling of errors thrown while generating carved cells.
+- [ ] Handle use cases where the primary key is an integer and negative resulting in negative (9-byte) varints.
+- [ ] Investigate if there is any need to account for different database encodings.
+ ##### SignatureCarver Class
+ - [ ] Incorporate altered tables within the signature in carving, not just the full signature.
+ - [ ] Address overflow.
+ - [ ] Specify which signatures to carve with (if important or schema vs simplified)?
+ - [ ] Currently matches are done in reverse for better findings. Should this also be done in order?
+ - [ ] Update the cutoff offset based on the earliest offset found in the carved b-tree cell.
+ - [ ] Remove the cutoff offset by sending in a truncated data array in to the CarvedBTreeCell?
+ - [ ] Change the first column serial types from an array to boolean since signature is now sent in.
+ - [ ] carve_freeblocks: Handle use cases where the first serial type in the record header exists.
+ - [ ] carve_freeblocks: Check why originally there was an exception if the first serial types > 1.
+ - [ ] carve_freeblocks: Handle multi-byte varints in the first serial types (warning currently raised).
+ - [ ] carve_freeblocks: Apply additional use cases to the use of the cutoff offset.
+ - [ ] carve_freeblocks: Check why search was used if len(signature) == 2 and -1/-2 in signature\[1\].
+ - [ ] carve_unallocated_space: Address carving of the cell pointer array for deleted cells.
+ - [ ] carve_unallocated_space: Handle carving of freeblocks (see documentation in section of code).
+ - [ ] carve_unallocated_space: Handle varint first serial type (see documentation in section of code).
+ - [ ] carve_unallocated_space: Support for other cell types than b-tree table leaf cells.
+ - [ ] carve_unallocated_space: Address parsing of fields such as payload size, row id, etc.
+ - [ ] carve_unallocated_space: Update partial carving indices (see documentation in section of code).
+ - [ ] carve_unallocated_space: Have an option for partial/freeblock carving of unallocated space?
+ - [ ] carve_unallocated_space: Revise the partial carving algorithm.
+
+
+
+### rollback_journal_carver.py
+
+This script carves through a journal file with the specified master schema entry and signature and returns the entries.
+
+This script holds the following object(s):
+- RollBackJournalCarver(Carver)
+
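+A minimal sketch of the carve call as it is used in main.py (the import path is an assumption, and the rollback
+journal, version history, master schema entry, and signature objects are assumed to have been built beforehand
+as main.py does):
+
+```python
+from sqlite_dissect.carving.rollback_journal_carver import RollBackJournalCarver  # assumed module path
+from sqlite_dissect.constants import BASE_VERSION_NUMBER                          # assumed location
+
+# rollback_journal_file, version_history, master_schema_entry, and signature are assumed to exist already.
+carved_commits = RollBackJournalCarver.carve(rollback_journal_file,
+                                             version_history.versions[BASE_VERSION_NUMBER],
+                                             master_schema_entry, signature)
+
+for commit in carved_commits:
+    print(commit)
+```
+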
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Investigate possible alternatives to computing or reading the database page size from the journal file.
+
+
+
+### signature.py
+
+This script holds the objects for the signature generation of SQLite table and index b-trees for carving.
+
+This script holds the following object(s):
+- Signature(VersionParser)
+- SchemaColumnSignature(object)
+- TableColumnSignature(object)
+- TableRowSignature(object)
+- ColumnSignature(object)
+- ColumnFixedLengthSignature(ColumnSignature)
+- ColumnVariableLengthSignature(ColumnSignature)
+- ColumnReducedVariableLengthSignature(ColumnVariableLengthSignature)
+- ColumnNonReducedVariableLengthSignature(ColumnVariableLengthSignature)
+
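+A minimal sketch of generating and printing a table signature, mirroring how main.py uses this class (the import
+path is an assumption, and the version history and master schema entry are assumed to exist already):
+
+```python
+from sqlite_dissect.carving.signature import Signature  # assumed module path
+
+# version_history and master_schema_entry are assumed to have been built from a database and optional WAL file.
+signature = Signature(version_history, master_schema_entry)
+print(signature.stringify("\t", False, False, False))
+```
+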
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+- [ ] Handle exceptions that may be raised from creating and working with objects such as signatures better.
+- [ ] Incorporate any column and/or table constraint use cases that may affect the signature.
+- [ ] Create superclass for schema, table row, and table column signatures?
+- [ ] Create constants for serial type arrays in signatures?
+- [ ] Update signature classes to take in a column signature argument instead of sending in individual fields of it.
+- [ ] Right now signatures are only derived from leaf pages. Interior pages should have signatures as well.
+- [ ] Have a way to send in a maximum amount of (unique) records to generate the signature from (reduces time)?
+- [ ] Have an extension to the Epilog XSD that can be used for signature exportation.
+- [ ] Have a way to merge like signatures from external files.
+- [ ] Investigate if it is better to put the altered columns flag in a master schema associated class or leave here?
+ ##### Signature Class
+ - [ ] Create a field that has a max number of rows to look at to determine a signature to reduce time?
+ - [ ] Test and investigation on how to handle virtual tables with signatures.
+ - [ ] Note on how table interior pages cannot have (serial type header) signatures since no records exist.
+ - [ ] Change the signature to take in a master schema entry identifier instead of the entry itself?
+ - [ ] Signatures need to be made for the master schema pages.
+ - [ ] Check support for index b-tree pages and ensure it is working correctly (warning currently raised).
+ - [ ] The accounted_for_cell_digests may not work for index pages since there is no row id.
+ - [ ] There may not be a page type in reference to a virtual table since it is not required to have pages.
+ - [ ] Support for virtual table modules of master schema entry table type.
+ - [ ] Support for index b-trees that are internal schema objects with no SQL (warning currently raised).
+ - [ ] Check to make sure index b-tree internal schema objects can not have column definitions (SQL).
+ - [ ] How do 0 serial types (NULL) work with signatures (like epilog signatures)?
+ - [ ] Combine simple (or focused) and schema epilog signatures for a more complete epilog signature?
+ - [ ] Check 8 and 9 serial type on non-integer storage classes for simplified and focused epilog signatures.
+ - [ ] Is there a use case for only parsing the schema signature and nothing else?
+ - [ ] How to handle master schema entries not found in specified versions?
+ - [ ] Have a b-tree page type (either table or index).
+ - [ ] Investigate better ways for probability calculations between altered columns and column breakdown.
+ - [ ] How does defaulting fields work in reference to virtual tables? How is the signature generated?
+ ##### SchemaColumnSignature Class
+ - [ ] Handle NULL serial types in the recommended signatures.
+ - [ ] Incorporate NOT NULL column constraints (and other uses - primary key?) as not having a 0.
+
+
+
+### utilities.py
+
+This script holds carving utility functions for reference by the SQLite carving module.
+
+This script holds the following object(s):
+- decode_varint_in_reverse(byte_array, offset)
+- calculate_body_content_size(serial_type_header)
+- calculate_serial_type_definition_content_length_min_max(simplified_serial_types, allowed_varint_length=5)
+- calculate_serial_type_varint_length_min_max(simplified_serial_types)
+- generate_regex_for_simplified_serial_type(simplified_serial_type)
+- generate_signature_regex(signature, skip_first_serial_type=False)
+- get_content_size(serial_type)
+
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+- [ ] Handle use cases where the primary key is an integer and negative resulting in negative (9-byte) varints.
+- [ ] decode_varint_in_reverse: Handle the 9 byte varints correctly.
+- [ ] decode_varint_in_reverse: Should the InvalidVarIntError be logged as an error?
+- [ ] decode_varint_in_reverse: Document on how conclusiveness/truncation can not be certain.
+- [ ] generate_regex_for_simplified_serial_type: Fix to account for 9 byte varint serial types.
+- [ ] generate_signature_regex: Account for small signatures.
+- [ ] generate_signature_regex: Account for regular expressions that skip the first byte of a multi-byte serial type.
diff --git a/sqlite_dissect/carving/__init__.py b/sqlite_dissect/carving/__init__.py
new file mode 100644
index 0000000..a4a5cb7
--- /dev/null
+++ b/sqlite_dissect/carving/__init__.py
@@ -0,0 +1,10 @@
+
+"""
+
+__init__.py
+
+This init script will initialize any needed logic for this package.
+
+This package will control signature generation and carving of SQLite files.
+
+"""
diff --git a/sqlite_dissect/carving/carved_cell.py b/sqlite_dissect/carving/carved_cell.py
new file mode 100644
index 0000000..3cbb263
--- /dev/null
+++ b/sqlite_dissect/carving/carved_cell.py
@@ -0,0 +1,898 @@
+from struct import unpack
+from warnings import warn
+from sqlite_dissect.carving.utilities import calculate_body_content_size
+from sqlite_dissect.carving.utilities import calculate_serial_type_definition_content_length_min_max
+from sqlite_dissect.carving.utilities import decode_varint_in_reverse
+from sqlite_dissect.carving.utilities import get_content_size
+from sqlite_dissect.constants import BLOB_SIGNATURE_IDENTIFIER
+from sqlite_dissect.constants import CELL_LOCATION
+from sqlite_dissect.constants import FILE_TYPE
+from sqlite_dissect.constants import TEXT_SIGNATURE_IDENTIFIER
+from sqlite_dissect.exception import CellCarvingError
+from sqlite_dissect.exception import InvalidVarIntError
+from sqlite_dissect.file.database.page import BTreeCell
+from sqlite_dissect.file.database.payload import Payload
+from sqlite_dissect.file.database.payload import RecordColumn
+from sqlite_dissect.utilities import decode_varint
+from sqlite_dissect.utilities import encode_varint
+from sqlite_dissect.utilities import get_md5_hash
+from sqlite_dissect.utilities import get_record_content
+from sqlite_dissect.utilities import get_serial_type_signature
+
+"""
+
+carved_cell.py
+
+This script holds the objects used for carving cells from the unallocated and freeblock space in SQLite
+b-tree pages used in conjunction with other classes in the carving package. These objects subclass their
+respective higher level SQLite database object type and add to them while parsing the data in a different way.
+
+This script holds the following object(s):
+CarvedBTreeCell(BTreeCell)
+CarvedRecord(Payload)
+CarvedRecordColumn(RecordColumn)
+
+"""
+
+
+class CarvedBTreeCell(BTreeCell):
+
+ """
+
+ This class is responsible for carving a b-tree cell, as best it can, out of a block of data from either
+ unallocated data or freeblocks. Since the header of a freeblock can be overwritten, meaning at most the first
+ serial type identifier of the record may be overwritten, a list of first column serial types can be specified.
+ The record is in the following form:
+ [ HEADER [ HEADER_BYTE_SIZE SERIAL_TYPE_1 ... SERIAL_TYPE_N ] ][ BODY [ BODY_CONTENT_1 ... BODY_CONTENT_N ] ]
+ This record format applies to the payload of table leaf cells, which are the main focus here.
+
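+ For illustration only (hypothetical values), a table leaf record storing the row (1, 'hi') would have the
+ header bytes 03 01 11 (a header size of 3, serial type 1 for an 8-bit integer, and serial type 17 for a
+ two-byte text value) followed by the body bytes 01 68 69.
+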
+ Since unallocated space can contain freeblocks, this class will be used for both use cases of carving from
+ unallocated space and freeblocks.
+
+    If the carved b-tree cell has first column serial types set, a probabilistic flag will be set on the carved
+    b-tree cell, record, and record column indicating that not all fields were completely deterministic.
+
+ Table interior, index interior, index leaf pages, and additional use cases still need to be accounted for.
+
+
+ """
+
+ def __init__(self, version, file_offset, source, page_number, location, index, data,
+ serial_type_definition_start_offset, serial_type_definition_end_offset, cutoff_offset,
+ number_of_columns, signature, first_column_serial_types=None, freeblock_size=None):
+
+ """
+
+
+
+ Note: The md5 hex digest is set to the md5 hash of the data between the start offset and end offset determined
+ after the carving of the payload. It is important to note that these offsets may not be correct and
+ therefore the md5 hex digest is a best guess at what it may be.
+
+ :param version:
+ :param file_offset:
+ :param source:
+ :param page_number:
+ :param location:
+ :param index:
+ :param data:
+ :param serial_type_definition_start_offset:
+ :param serial_type_definition_end_offset:
+ :param cutoff_offset:
+ :param number_of_columns:
+ :param signature:
+ :param first_column_serial_types:
+ :param freeblock_size:
+
+ :return:
+
+ """
+
+ """
+
+        Below we initialize the super constructor, sending in the version number of the version passed in as the
+        page version number. The location specifies where the cell was carved from: either a freeblock within a
+        b-tree page or the unallocated space of a b-tree (or any other) page. The index will be 0..N for freeblock
+        carvings or just 0 for unallocated space. The serial type definition start offset is sent in as the offset;
+        however, this will be updated as needed when carving processes are run against the preceding data, if
+        applicable, to determine the payload length, row id, payload header size, and the first serial type in the
+        payload header. Which of these fields exist, and the size of the varints between them, depends on the type
+        of cell being parsed:
+ 1.) Table Leaf B-Tree Cell: PAYLOAD_LENGTH_VARINT ROW_ID_VARINT PAYLOAD [OVERFLOW_PAGE_NUMBER]
+ 2.) Table Interior B-Tree Cell: LEFT_CHILD_POINTER INTEGER_KEY_VARINT (the integer key is a row id) (no payload)
+ 3.) Index Leaf B-Tree Cell: PAYLOAD_LENGTH_VARINT PAYLOAD [OVERFLOW_PAGE_NUMBER]
+ 4.) Index Interior B-Tree Cell: LEFT_CHILD_POINTER PAYLOAD_LENGTH_VARINT PAYLOAD [OVERFLOW_PAGE_NUMBER]
+
+        Better support is still needed for cell types other than the table leaf cell, which is the focus here.
+
+ """
+
+ super(CarvedBTreeCell, self).__init__(version, version.version_number, file_offset, page_number,
+ index, serial_type_definition_start_offset, source, location)
+
+ """
+
+        Since versioning is not implemented for rollback journal files, we set the version number to -1 here.
+        This is done because a rollback journal stores data previous to what is in the SQLite database file,
+        as opposed to a WAL file, where the most recent data reflects the most current state.
+
+ """
+
+ if source is FILE_TYPE.ROLLBACK_JOURNAL:
+ self.version_number = -1
+ self.page_version_number = -1
+
+ self.payload = CarvedRecord(location, data, serial_type_definition_start_offset,
+ serial_type_definition_end_offset, cutoff_offset, number_of_columns, signature,
+ first_column_serial_types, freeblock_size, version.page_size)
+
+ """
+
+ After calling the above super constructor and setting the payload, we are left with a few more fields that
+ need to be accounted for in the BTreeCell class. These fields are as follows:
+ 1.) self.start_offset: This is originally set to the serial_type_definition_start_offset through the super
+ constructor but needs to be updated based on what is determined after carving the
+ payload.
+ 2.) self.end_offset: Updated after carving of the payload.
+ 3.) self.byte_size: Calculated from the start and end offset after carving of the payload.
+ 4.) self.md5_hex_digest: This is set to the md5 hash of the data between the start offset and end offset
+ determined after the carving of the payload. It is important to note that these
+ offsets may not be correct and therefore the md5 hex digest is a best guess at what
+ it may be.
+
+ """
+
+ self.start_offset = self.payload.cell_start_offset
+ self.end_offset = self.payload.cell_end_offset
+
+ self.byte_size = self.end_offset - self.start_offset
+ self.md5_hex_digest = get_md5_hash(data[self.start_offset:self.end_offset])
+
+ """
+
+        In addition to the fields in the BTreeCell class, we add truncated fields to signify whether the record was
+        truncated at the beginning or the end.
+
+ """
+
+ self.truncated_beginning = self.payload.truncated_beginning
+ self.truncated_ending = self.payload.truncated_ending
+
+ self.row_id = "Unknown"
+
+ def stringify(self, padding=""):
+ string = "\n"\
+ + padding + "Truncated Beginning: {}\n" \
+ + padding + "Truncated Ending: {}"
+ string = string.format(self.truncated_beginning,
+ self.truncated_ending)
+ return super(CarvedBTreeCell, self).stringify(padding) + string
+
+
+class CarvedRecord(Payload):
+
+ def __init__(self, location, data, serial_type_definition_start_offset, serial_type_definition_end_offset,
+ cutoff_offset, number_of_columns, signature, first_column_serial_types=None,
+ freeblock_size=None, page_size=None):
+
+ super(CarvedRecord, self).__init__()
+
+ """
+
+ Note: The overflow fields below will stay their default values of False and None initialized in the super
+ class:
+
+ self.has_overflow = False
+ self.bytes_on_first_page = None
+ self.overflow_byte_size = None
+
+        There is a TODO in reference to figuring out the best way to handle overflow. Keep in mind that the end
+        portion of a cell is often overwritten, especially in a freeblock, since SQLite allocates cells from the
+        end of the unallocated or freeblock content, which would in turn overwrite the four byte overflow page
+        number. However, it is possible to calculate whether the entry had overflow if the payload size is
+        correctly determined.
+
+ """
+
+ self.start_offset = None
+ self.byte_size = None
+ self.end_offset = None
+
+ self.header_byte_size = None
+ self.header_byte_size_varint_length = None
+ self.header_start_offset = None
+ self.header_end_offset = None
+ self.body_start_offset = None
+ self.body_end_offset = None
+
+ self.md5_hex_digest = None
+
+ self.location = location
+ self.serial_type_definition_start_offset = serial_type_definition_start_offset
+ self.serial_type_definition_end_offset = serial_type_definition_end_offset
+ self.number_of_columns = number_of_columns
+ self.first_column_serial_types = first_column_serial_types
+ self.freeblock_size = freeblock_size
+ self.serial_type_definition_size = \
+ self.serial_type_definition_end_offset - self.serial_type_definition_start_offset
+
+ self.cutoff_offset = cutoff_offset
+ self.truncated_beginning = False
+ self.truncated_ending = False
+
+ record_column_md5_hash_strings = [""] * self.number_of_columns
+
+ column_index = 0
+ body_byte_size = 0
+
+ serial_type_definition_content_size = calculate_body_content_size(
+ data[self.serial_type_definition_start_offset:self.serial_type_definition_end_offset])
+
+ if self.serial_type_definition_start_offset == 0:
+
+ if self.location == CELL_LOCATION.UNALLOCATED_SPACE:
+ warn("unsupported", RuntimeWarning)
+
+ """
+
+                We do not know what the header size could have been here. We could check it against the
+                header + byte array ( == 10 for a table leaf cell) but we do not seem to gain a lot from this.
+
+                We could also use probability across rows and columns to figure out what the first column type is
+                here (using row signatures) or apply probability on the record and record column.
+
+ """
+
+ elif self.location == CELL_LOCATION.FREEBLOCK:
+
+                # Assume the 4 overwritten bytes (payload length, row id, header size, first serial type) were 1 byte each
+ header_byte_size_varint_length = 1
+ header_byte_size = header_byte_size_varint_length + self.serial_type_definition_size + 1
+ payload_byte_size = self.freeblock_size - 2
+ body_content_size = payload_byte_size - header_byte_size
+
+ first_serial_type_varint_length = 1
+ first_serial_type_content_size = body_content_size - serial_type_definition_content_size
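+
+                # Worked example with hypothetical numbers: for a freeblock of size 18 whose remaining serial
+                # types are [1, 6] (content sizes 1 + 8 = 9, definition size 2), the header would be
+                # 1 + 2 + 1 = 4 bytes, the payload 18 - 2 = 16 bytes, the body 16 - 4 = 12 bytes, and the first
+                # serial type content size 12 - 9 = 3, matching serial type 3 (or a TEXT/BLOB of that length).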
+
+ if first_serial_type_content_size > int('1111111', 2):
+ warn("first serial type too big", RuntimeWarning)
+
+ matching_serial_types = []
+ for serial_type in self.first_column_serial_types:
+ if get_content_size(serial_type) == first_serial_type_content_size or serial_type in \
+ [BLOB_SIGNATURE_IDENTIFIER, TEXT_SIGNATURE_IDENTIFIER]:
+ matching_serial_types.append(serial_type)
+
+ if len(matching_serial_types) > 1:
+ warn("multiple matching, need to use probability")
+
+ elif len(matching_serial_types) == 1:
+
+ first_serial_type = matching_serial_types[0]
+
+ self.serial_type_signature += str(get_serial_type_signature(first_serial_type))
+
+ record_column_md5_hash_strings[column_index] = ""
+
+ self.serial_type_definition_size += first_serial_type_varint_length
+
+ first_carved_record_column = CarvedRecordColumn(column_index, first_serial_type,
+ first_serial_type_varint_length,
+ first_serial_type_content_size)
+ first_carved_record_column.truncated_first_serial_type = True
+ self.truncated_beginning = True
+ self.record_columns.append(first_carved_record_column)
+ column_index += 1
+ body_byte_size += first_serial_type_content_size
+
+ else:
+ warn("could not find matching serial types", RuntimeWarning)
+
+ else:
+ raise CellCarvingError()
+
+ elif self.serial_type_definition_start_offset == 1:
+
+ if self.location == CELL_LOCATION.UNALLOCATED_SPACE:
+ warn("unsupported", RuntimeWarning)
+
+ """
+
+                A way to address this may be to check whether the signature has no -1 or -2 (blob or text) entries,
+                then read the single byte to get the serial type and check it against the signatures. If it does
+                not match, then use probability, but we will not know the length of the type unless the cutoff is
+                correctly implemented. Freeblocks do not count since the size may not match (they need two bytes),
+                but it may be possible to check on one byte.
+
+ """
+
+ elif self.location == CELL_LOCATION.FREEBLOCK:
+
+ """
+
+                Here the row id may have been a 2 byte varint (128 <= x <= 16383), or the payload length and/or
+                header size may have been 2 or more varint bytes. These use cases need to be investigated further.
+
+ """
+
+ first_serial_type, first_serial_type_varint_length = \
+ decode_varint(data, self.serial_type_definition_start_offset - 1)
+
+ if first_serial_type_varint_length != 1:
+ raise CellCarvingError()
+
+ if get_serial_type_signature(first_serial_type) in self.first_column_serial_types:
+
+ self.serial_type_definition_size += first_serial_type_varint_length
+
+ first_serial_type_content_size = get_content_size(first_serial_type)
+
+                    # Estimate the header size varint length, checking the larger threshold first so that the
+                    # two and three byte varint cases are both reachable.
+                    header_byte_size_varint_length = 1
+
+                    if self.serial_type_definition_size >= int('1111111' * 2, 2):
+                        header_byte_size_varint_length += 2
+                    elif self.serial_type_definition_size >= int('1111111' * 1, 2):
+                        header_byte_size_varint_length += 1
+
+ header_byte_size = self.serial_type_definition_size + header_byte_size_varint_length
+
+ body_content_size = serial_type_definition_content_size + first_serial_type_content_size
+
+ payload_byte_size = header_byte_size + body_content_size
+
+ self.serial_type_signature += str(get_serial_type_signature(first_serial_type))
+
+ record_column_md5_hash_strings[column_index] = data[self.serial_type_definition_start_offset - 1:
+ self.serial_type_definition_start_offset]
+
+ first_carved_record_column = CarvedRecordColumn(column_index, first_serial_type,
+ first_serial_type_varint_length,
+ first_serial_type_content_size)
+
+ self.record_columns.append(first_carved_record_column)
+ column_index += 1
+ body_byte_size += first_serial_type_content_size
+
+ else:
+ warn("unable to find serial type with 1 preceding", RuntimeWarning)
+
+ else:
+ raise CellCarvingError()
+
+ elif self.serial_type_definition_start_offset >= 2:
+
+ if self.location == CELL_LOCATION.UNALLOCATED_SPACE:
+ warn("unsupported unallocated space with serial type definition start offset >= 2", RuntimeWarning)
+
+ elif self.location == CELL_LOCATION.FREEBLOCK:
+
+ """
+
+                There are three use cases that can occur here:
+                1.) Everything was overwritten up to this point and there is nothing more to carve
+                2.) A freeblock cuts off the beginning of the cell up to and including the first serial type
+                3.) A freeblock cuts off the beginning of the cell but not the first serial type, and the header
+                    size/row id may still be somewhat intact (the payload must at best be partially overwritten)
+
+ """
+
+ # First check first byte against serial types but also parse freeblock size and check which is best
+ freeblock_size = unpack(b">H", data[self.serial_type_definition_start_offset - 2:
+ self.serial_type_definition_start_offset])[0]
+ freeblock_first_serial_type_min, freeblock_first_serial_type_max = \
+ calculate_serial_type_definition_content_length_min_max(None, 1)
+
+ header_byte_size_varint_length = 1
+ header_byte_size = header_byte_size_varint_length + self.serial_type_definition_size + 1
+
+ body_content_size_min = serial_type_definition_content_size + freeblock_first_serial_type_min
+ body_content_size_max = serial_type_definition_content_size + freeblock_first_serial_type_max
+
+ payload_size_min = header_byte_size + body_content_size_min
+ payload_size_max = header_byte_size + body_content_size_max
+
+ freeblock_size_valid = False
+ if freeblock_size >= payload_size_min and freeblock_size <= payload_size_max:
+ freeblock_size_valid = True
+
+ next_free_block_offset = None
+ if freeblock_size_valid and self.serial_type_definition_start_offset >= 4:
+ next_free_block_offset = unpack(b">H", data[self.serial_type_definition_start_offset - 4:
+ self.serial_type_definition_start_offset - 2])[0]
+ if next_free_block_offset >= page_size:
+ freeblock_size_valid = False
+
+ """
+
+ Check first serial types not in freeblock size first byte.
+
+ """
+
+                # Favor a valid freeblock size over checking the first serial type byte
+ if freeblock_size_valid:
+
+                    # Assume the 4 overwritten bytes (payload length, row id, header size, first serial type) were 1 byte each
+ header_byte_size_varint_length = 1
+ header_byte_size = header_byte_size_varint_length + self.serial_type_definition_size + 1
+ payload_byte_size = freeblock_size - 2
+ body_content_size = payload_byte_size - header_byte_size
+
+ first_serial_type_varint_length = 1
+ first_serial_type_content_size = body_content_size - serial_type_definition_content_size
+
+ if first_serial_type_content_size > int('1111111', 2):
+ warn("first serial type too big", RuntimeWarning)
+
+ matching_serial_types = []
+ for serial_type in self.first_column_serial_types:
+ if get_content_size(serial_type) == first_serial_type_content_size or serial_type in \
+ [BLOB_SIGNATURE_IDENTIFIER, TEXT_SIGNATURE_IDENTIFIER]:
+ matching_serial_types.append(serial_type)
+
+ if len(matching_serial_types) > 1:
+ warn("multiple matching, need to use probability")
+
+ elif len(matching_serial_types) == 1:
+
+ first_serial_type = matching_serial_types[0]
+
+ self.serial_type_signature += str(get_serial_type_signature(first_serial_type))
+
+ record_column_md5_hash_strings[column_index] = ""
+
+ self.serial_type_definition_size += first_serial_type_varint_length
+
+ first_carved_record_column = CarvedRecordColumn(column_index, first_serial_type,
+ first_serial_type_varint_length,
+ first_serial_type_content_size)
+ first_carved_record_column.truncated_first_serial_type = True
+ self.truncated_beginning = True
+ self.record_columns.append(first_carved_record_column)
+ column_index += 1
+ body_byte_size += first_serial_type_content_size
+
+ else:
+ warn("could not find matching serial types", RuntimeWarning)
+
+ else:
+
+ """
+
+ There are two main use cases here:
+ 1.) single byte varint 00-09
+ 2.) multi byte varint (if in signature)
+
+                    A possible third use case may be an inner freeblock.
+
+ """
+
+ simplified_variable_length_serial_types = [BLOB_SIGNATURE_IDENTIFIER, TEXT_SIGNATURE_IDENTIFIER]
+ text_or_blob_serial_type = \
+ any(i in first_column_serial_types for i in simplified_variable_length_serial_types)
+
+ if not text_or_blob_serial_type:
+
+ freeblock_size = None
+
+ # Check the previous two bytes if they exist:
+ if self.serial_type_definition_start_offset >= 3:
+ freeblock_size = unpack(b">H", data[self.serial_type_definition_start_offset - 3:
+ self.serial_type_definition_start_offset - 1])[0]
+
+ """
+
+                        Here the row id may have been a 2 byte varint (128 <= x <= 16383), or the payload length
+                        and/or header size may have been 2 or more varint bytes. These use cases need to be
+                        investigated further.
+
+ """
+
+ first_serial_type, first_serial_type_varint_length = \
+ decode_varint(data, self.serial_type_definition_start_offset - 1)
+
+ if first_serial_type_varint_length != 1:
+
+ """
+
+                            Note: Issues can occur here where the pattern matches something that is not actually
+                                  part of a serial type header. For instance, 000000900302 will match a simple
+                                  (freeblock) signature of [[02], [03]]: the [03] will match the 03 and 90 will be
+                                  detected as the first serial type, when it could instead be the size of the
+                                  freeblock in the form of 0090.
+
+ """
+
+ raise CellCarvingError("Invalid first serial type varint size determined. "
+ "Unable to carve due to probable false positive.")
+
+ if get_serial_type_signature(first_serial_type) in self.first_column_serial_types:
+
+ self.serial_type_definition_size += first_serial_type_varint_length
+
+ first_serial_type_content_size = get_content_size(first_serial_type)
+
+                            # Estimate the header size varint length, checking the larger threshold first so that
+                            # the two and three byte varint cases are both reachable.
+                            header_byte_size_varint_length = 1
+
+                            if self.serial_type_definition_size >= int('1111111' * 2, 2):
+                                header_byte_size_varint_length += 2
+                            elif self.serial_type_definition_size >= int('1111111' * 1, 2):
+                                header_byte_size_varint_length += 1
+
+ header_byte_size = self.serial_type_definition_size + header_byte_size_varint_length
+
+ body_content_size = serial_type_definition_content_size + first_serial_type_content_size
+
+ payload_byte_size = header_byte_size + body_content_size
+
+ # Add one since row id, payload, or serial type header (not) >= 1 varint
+ calculated_freeblock_size = payload_byte_size + 2 + 1
+ freeblock_size_valid = False
+ if freeblock_size == calculated_freeblock_size:
+ freeblock_size_valid = True
+
+ next_free_block_offset = None
+ if freeblock_size_valid and self.serial_type_definition_start_offset >= 5:
+ next_free_block_offset = unpack(b">H",
+ data[self.serial_type_definition_start_offset - 5:
+ self.serial_type_definition_start_offset - 3])[0]
+ if next_free_block_offset >= page_size:
+ freeblock_size_valid = False
+
+ self.serial_type_signature += str(get_serial_type_signature(first_serial_type))
+
+ record_column_md5_hash_strings[column_index] = \
+ data[self.serial_type_definition_start_offset - 1:
+ self.serial_type_definition_start_offset]
+
+ first_carved_record_column = CarvedRecordColumn(column_index, first_serial_type,
+ first_serial_type_varint_length,
+ first_serial_type_content_size)
+ self.record_columns.append(first_carved_record_column)
+
+ column_index += 1
+ body_byte_size += first_serial_type_content_size
+
+ else:
+ warn("unable to find serial type with 1 preceding", RuntimeWarning)
+
+ else:
+
+ first_serial_type = None
+ first_serial_type_varint_length = None
+ try:
+
+ first_serial_type, first_serial_type_varint_length = \
+ decode_varint_in_reverse(data, self.serial_type_definition_start_offset, 5)
+
+ except InvalidVarIntError:
+ pass
+
+ if self.first_column_serial_types and not len(self.record_columns):
+
+ first_serial_type = first_column_serial_types[0]
+ if signature.total_records == 0:
+ # Set as null for now
+ first_serial_type = 0
+ if len(first_column_serial_types) != 1:
+ simplified_probabilistic_signature = signature.simplified_probabilistic_signature
+ if simplified_probabilistic_signature:
+ # Found probability otherwise it is a schema without probability
+ first_probabilistic_column_serial_types = simplified_probabilistic_signature[0]
+ first_serial_type = max(first_probabilistic_column_serial_types,
+ key=lambda first_probabilistic_column_serial_type:
+ first_probabilistic_column_serial_type[1])[0]
+ first_serial_type_varint_length = 1
+ self.serial_type_signature += str(get_serial_type_signature(first_serial_type))
+ self.serial_type_definition_size += first_serial_type_varint_length
+                        # Use the zero length TEXT (13) and BLOB (12) serial types as placeholders for the
+                        # variable length signature identifiers.
+                        if first_serial_type == TEXT_SIGNATURE_IDENTIFIER:
+                            first_serial_type = 13
+                        if first_serial_type == BLOB_SIGNATURE_IDENTIFIER:
+                            first_serial_type = 12
+ first_serial_type_content_size = get_content_size(first_serial_type)
+ first_carved_record_column = CarvedRecordColumn(column_index, first_serial_type,
+ first_serial_type_varint_length,
+ first_serial_type_content_size)
+ first_carved_record_column.probabilistic_first_serial_type = True
+ first_carved_record_column.truncated_first_serial_type = True
+ self.truncated_beginning = True
+ self.record_columns.append(first_carved_record_column)
+ column_index += 1
+ body_byte_size += first_serial_type_content_size
+
+ """
+
+        We iterate through the header and generate all of the carved record columns from it. We have at least enough
+        information in the header to determine the types and size of the body, regardless of whether we have the body
+        or not. This is because the offsets sent in were determined from regular expressions which match the header,
+        with the possible exception of the first serial type which, if present, has already been handled above.
+
+ """
+
+ current_header_offset = self.serial_type_definition_start_offset
+ while current_header_offset < self.serial_type_definition_end_offset:
+
+ serial_type, serial_type_varint_length = decode_varint(data, current_header_offset)
+
+ serial_type_varint_end_offset = current_header_offset + serial_type_varint_length
+
+ if serial_type_varint_end_offset > self.serial_type_definition_end_offset:
+ raise CellCarvingError()
+
+ self.serial_type_signature += str(get_serial_type_signature(serial_type))
+
+ record_column_md5_hash_strings[column_index] = data[current_header_offset:serial_type_varint_end_offset]
+
+ content_size = get_content_size(serial_type)
+
+ carved_record_column = CarvedRecordColumn(column_index, serial_type, serial_type_varint_length,
+ content_size)
+ self.record_columns.append(carved_record_column)
+
+ current_header_offset += serial_type_varint_length
+ body_byte_size += content_size
+ column_index += 1
+
+ if len(self.record_columns) != number_of_columns:
+ raise CellCarvingError()
+
+ self.body_start_offset = self.serial_type_definition_end_offset
+ self.body_end_offset = self.serial_type_definition_end_offset + body_byte_size
+
+ if self.body_end_offset > len(data):
+ self.truncated_ending = True
+
+ """
+
+ Note: This does not currently work for multiple options in the first or variable length serial types.
+
+ """
+
+ # First truncated column field
+ current_body_offset = self.body_start_offset
+ for carved_record_column in self.record_columns:
+
+ if (current_body_offset + carved_record_column.content_size) > len(data):
+ carved_record_column.truncated_value = True
+ if current_body_offset < len(data):
+ carved_record_column.value = data[current_body_offset:]
+ record_column_md5_hash_strings[carved_record_column.index] += data[current_body_offset:]
+ carved_record_column.md5_hex_digest = \
+ get_md5_hash(record_column_md5_hash_strings[carved_record_column.index])
+
+ else:
+
+ """
+
+ This means that: offset + content_size <= len(data)
+
+ """
+
+ value_data = data[current_body_offset:current_body_offset + carved_record_column.content_size]
+ content_size, value = get_record_content(carved_record_column.serial_type, value_data)
+
+ if content_size != carved_record_column.content_size:
+ raise CellCarvingError()
+ carved_record_column.value = value
+ record_column_md5_hash_strings[carved_record_column.index] += value_data
+ carved_record_column.md5_hex_digest = \
+ get_md5_hash(record_column_md5_hash_strings[carved_record_column.index])
+
+ current_body_offset += carved_record_column.content_size
+
+ if self.body_end_offset != current_body_offset:
+ raise CellCarvingError()
+
+        # This assumes the header size varint is 1 byte (in most cases it will be; otherwise the serial type header would be greater than 127 bytes).
+ self.header_byte_size = self.serial_type_definition_size + 1
+
+ self.header_byte_size_varint = encode_varint(self.header_byte_size)
+ self.header_byte_size_varint_length = len(self.header_byte_size_varint)
+
+ self.payload_byte_size = self.header_byte_size + body_byte_size
+
+ self.payload_byte_size_varint = encode_varint(self.payload_byte_size)
+ self.payload_byte_size_varint_length = len(self.payload_byte_size_varint)
+
+ # Below is relative to the unallocated space. The "-1" is to account for the row id.
+ self.cell_start_offset = self.serial_type_definition_start_offset - self.record_columns[0].\
+ serial_type_varint_length - self.header_byte_size_varint_length - 1 - self.payload_byte_size_varint_length
+ self.cell_end_offset = self.body_end_offset
+
+
+class CarvedRecordColumn(RecordColumn):
+
+ def __init__(self, index, serial_type, serial_type_varint_length, content_size):
+
+ """
+
+ Constructor.
+
+        This method constructs the carved record column by calling its super constructor and then setting a few
+        additional fields for itself in reference to carving traits.
+
+        If this carved record column was truncated (i.e. the rest of the record was overwritten at some point), then
+        the truncated value flag will be set to True. If this is the case, the value may or may not be set depending
+        on whether this column was the actual column that got cut off. Past the first column that gets truncated, all
+        following carved record columns will not have the value set.
+
+        Keep in mind that the column value may be "None" if it was a NULL value in the database. However, this will
+        only be truly NULL if the field is not truncated. If the field is truncated and has a value of "None", that is
+        because the value was unable to be obtained.
+
+ The md5 hex digest will be the md5 of the found portions of the record column whether that just be the serial
+ type header, serial type header and value, or serial type header and truncated value.
+
+        It is also important to keep in mind that parts of the record could be overwritten without being detected,
+        resulting in unexpected values.
+
+ Note: For reference the RecordColumn super class has the following attributes:
+ 1.) index
+ 2.) serial_type
+ 3.) serial_type_varint_length
+ 4.) content_size
+ 5.) value
+ 6.) md5_hex_digest
+
+ :param index:
+ :param serial_type:
+ :param serial_type_varint_length:
+ :param content_size:
+
+ :return:
+
+ """
+
+ """
+
+ Call to the constructor of the super record column class but specify "None" for the value and
+ md5 hex digest since they aren't known at this time.
+
+ """
+
+ super(CarvedRecordColumn, self).__init__(index, serial_type, serial_type_varint_length, content_size,
+ None, None)
+
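+        # Simplify variable length serial types to the signature identifiers: even serial types >= 12 are BLOBs
+        # (-1) and odd serial types >= 13 are TEXT (-2); all other serial types are kept as is.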
+ self.simplified_serial_type = self.serial_type
+ if self.serial_type >= 12 and self.serial_type % 2 == 0:
+ self.simplified_serial_type = -1
+ elif self.serial_type >= 13 and self.serial_type % 2 == 1:
+ self.simplified_serial_type = -2
+
+ """
+
+ Note: The below values are set to defaults and expected to be updated by the calling class if intended for use.
+
+ """
+
+ self.value = None
+ self.md5_hex_digest = None
+
+ self.truncated_first_serial_type = False
+ self.truncated_value = False
+ self.probabilistic = False
+
+ def stringify(self, padding=""):
+ string = "\n" \
+ + padding + "Simplified Serial Type: {}\n" \
+ + padding + "Truncated First Serial Type: {}\n" \
+ + padding + "Truncated Value: {}\n" \
+ + padding + "Probabilistic: {}"
+ string = string.format(self.simplified_serial_type,
+ self.truncated_first_serial_type,
+ self.truncated_value,
+ self.probabilistic)
+ return super(CarvedRecordColumn, self).stringify(padding) + string
+
+ """
+
+    If we have the first column serial types set, then the full serial type definition (referring to the
+ payload header excepting the header size) was not determined previously. However, since freeblocks
+ overwrite the first four bytes, assuming there is a payload size, row id, and serial type header size
+ followed by the serial types (ie. a b-tree table leaf cell), at most the first serial type can be
+ overwritten, or the first varint byte of a varint serial type if it is more than 1 byte in length.
+ Again, this only accounts for b-tree table leaf cells and there is a TODO in reference to supporting
+ other cell types.
+
+ There are two use cases to address for the first column serial types:
+ 1.) Preceding bytes detected.
+ 2.) No Preceding bytes detected (or invalid varint from #1).
+
+ 1.) Preceding bytes detected:
+
+ If there are bytes preceding the serial type definition start offset in the data, then we may be able
+ to parse backwards in order to determine the first serial type and payload header size assuming the best
+ case scenario and a b-tree table leaf, index interior, or index leaf cell since b-tree table interiors do
+ not have payloads associated with their cells. We also have to assume that the preceding bytes were not
+ overwritten in some manner.
+
+ The way we will check the first column serial type will be to see what serial types are possible for it.
+ Remember that the first column serial types is an array of the different serial types that can exist and
+ will be a subset of: [-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9] where -2 and -1 are varint serial types
+    representing the TEXT and BLOB storage classes respectively.
+
+ If a varint serial type exists that can be more than 1 byte (TEXT or BLOB), we will call the
+ decode_varint_in_reverse function in order to retrieve it. However, before we do this we will AND the byte
+    with 0x80 to see if the most significant bit is set (i.e. msb_set = varint_serial_type_byte & 0x80).
+ If the most significant bit is set, then it is likely this is not the least significant byte of a serial
+ type since the least significant byte should never have the most significant bit set. Since all serial
+ types that are not multi-byte serial types will be within the range of 0x00 to 0x09, this will tell us a
+ few things. The first thing is if the first serial type is a single byte varint serial type
+ meaning it could be any of the serial types including TEXT and BLOB with the size of 57 or less with
+ regards to the first serial type least significant byte:
+
+ 1.) TEXT: Min single byte size: 0x0D = (13 - 13)/2 = 0
+ Max single byte size: 0x7F = (127 - 13)/2 = 57
+
+ Note: However, there may be additional, preceding bytes signifying a larger size.
+
+                 Note: The TEXT is "odd" and can be determined by checking if the byte >= 13 and
+                       the byte % 2 == 1. Similarly, if the byte >= 13 and byte & 0x01 is set,
+                       it is also TEXT.
+
+ 2.) BLOB: Min single byte size: 0x0C = (12 - 12)/2 = 0
+ Max single byte size: 0x7E = (126 - 12)/2 = 57
+
+ Note: However, there may be additional, preceding bytes signifying a larger size.
+
+                 Note: The BLOB is "even" and can be determined by checking if the byte >= 12 and
+                       the byte % 2 == 0. Similarly, if the byte >= 12 and byte & 0x01 is not set,
+                       it is also BLOB.
+
+ 3.) All other serial types are single byte varints where 0x00 <= serial_type <= 0x09.
+
+ Note: The bytes 0x0A and 0x0B are not used and are currently reserved for expansion. This in combination
+ of the above use cases and those where the most significant bit is set cover all use cases for
+ relating the preceding byte (the least significant byte of the possible multi-byte varint) to their
+ respective serial type. However, we still may not have the correct length of the serial types in
+ respect to the variable length multi-byte varints for TEXT and BLOB with a size greater than 57.
+          This will be determined by looking at preceding bytes, if they exist, and accuracy will depend
+          on how many bytes preceding this byte remain and whether they have been overwritten in any way.
+
+    If the byte is 0x0A or 0x0B, or msb_set is true (varint_serial_type_byte & 0x80), then we do not have a serial
+    type and we resort to the same use cases as #2 below since we have determined an invalid varint.
+
+ If we do have a serial type where the byte is between 0x0C and 0x7F, then we have to look at the preceding
+    bytes, if they exist, to determine whether it is a portion of a larger varint indicating a larger size
+ for that data type. In order to get the correct size of the serial type we call the
+ decode_varint_in_reverse function to parse backwards until we either hit the 9 byte maximum for varints or
+ find a most significant byte where the most significant bit is not set. However, there is a chance we will
+ run out of data in the array going backwards. In order to facilitate this, the decode_varint_in_reverse
+ returns three fields in the form of a tuple:
+ (unsigned_integer_value, varint_relative_offset, truncated)
+ Keep in mind that even if it was not truncated and found all bytes for the varint, the varint still may be
+ incorrect due to use cases where it was overwritten with bytes that may be mistaken for valid varint bytes.
+
+ If the variable length serial type turns out to be truncated, then we set that flag in the carved record
+ since we can not be certain if it is either partially carved or completely erroneous. We leave this in
+ order to be addressed as needed when parsing the first serial type data content from the body.
+
+ However, the function can also throw an InvalidVarIntError in which case the varint will be assumed to be
+ overwritten in some way and we will default to the process explained further below where we do not have
+    preceding bytes. This is also true if we find an invalid serial type on the first preceding byte.
+
+ Note: There is a chance of false positives being returned by this function and validation checks need to
+ be investigated in order to make this value more deterministic. A TODO has been placed at the top
+ of this script in reference to this issue.
+
+ Note: Also, 9 byte varints are not currently handled. There are TODOs in references to 9 byte varint
+ parsing in both this script and their respective parsing function scripts.
+
+ 2.) No Preceding bytes detected (or invalid varint from #1).
+
+ If there are no bytes preceding the serial type definition start offset, we will assume the field is the
+ one with the most probability.
+
+ """
+
+ """
+
+ 1.) Preceding bytes detected:
+
+        In order to check if we have preceding bytes and then parse backwards through them, we first check if
+        the serial type definition start offset is greater than or equal to one. If this is true, we know we have
+        at least one preceding byte whose serial type we can check.
+
+ Keep in mind that although this will give us a serial type, it may be a byte overwritten by something else
+ and is not completely deterministic.
+
+ """
diff --git a/sqlite_dissect/carving/carver.py b/sqlite_dissect/carving/carver.py
new file mode 100644
index 0000000..14ef80b
--- /dev/null
+++ b/sqlite_dissect/carving/carver.py
@@ -0,0 +1,593 @@
+from logging import getLogger
+from re import compile
+from warnings import warn
+from sqlite_dissect.carving.carved_cell import CarvedBTreeCell
+from sqlite_dissect.carving.utilities import generate_signature_regex
+from sqlite_dissect.constants import BLOB_SIGNATURE_IDENTIFIER
+from sqlite_dissect.constants import CELL_LOCATION
+from sqlite_dissect.constants import LOGGER_NAME
+from sqlite_dissect.constants import TEXT_SIGNATURE_IDENTIFIER
+from sqlite_dissect.exception import CarvingError
+from sqlite_dissect.exception import CellCarvingError
+
+"""
+
+carver.py
+
+This script holds carver objects for identifying and parsing out cells from unallocated and
+freeblock space in SQLite b-tree pages.
+
+This script holds the following object(s):
+SignatureCarver(Carver)
+
+"""
+
+
+class SignatureCarver(object):
+
+ @staticmethod
+ def carve_freeblocks(version, source, freeblocks, signature):
+
+ """
+
+ This function will carve the freeblocks list with the signature specified.
+
+ Note: The signature that will be used from the signature object will be the simplified signature unless
+ one does not exist (in the case where one was generated with no row entries), in which case the
+ simplified schema signature will be used.
+
+ Note: The serial type definition nomenclature does not include the serial type header size field in reference
+ to the offsets and may also not include the first (or first byte of a multi-byte varint) serial type and
+              is therefore dubbed "definition" instead of "header", signifying only a portion of the header.
+
+ :param version:
+ :param source:
+ :param freeblocks:
+ :param signature:
+
+ :return:
+
+ """
+
+ logger = getLogger(LOGGER_NAME)
+
+ number_of_columns = signature.number_of_columns
+
+ simplified_signature = signature.simplified_signature
+
+ if not simplified_signature:
+ simplified_signature = signature.recommended_schema_signature
+ logger.debug("Using recommended schema signature: {}.".format(simplified_signature))
+ else:
+ logger.debug("Using simplified signature: {}.".format(simplified_signature))
+
+ if not simplified_signature:
+ log_message = "No signature was found."
+ logger.error(log_message)
+ raise CarvingError(log_message)
+
+ """
+
+ Since we are carving freeblocks here, we will remove the first column serial type. This is due to the fact
+ that the freeblock header overwrites the first four bytes of the cell which usually overwrites the first
+ serial type in the header of the record since that is the fourth byte (assuming payload, row id, and header
+ length (where applicable) are all less than 1 varint).
+
+ """
+
+ first_column_serial_types = simplified_signature[0]
+
+ if BLOB_SIGNATURE_IDENTIFIER in first_column_serial_types or TEXT_SIGNATURE_IDENTIFIER in \
+ first_column_serial_types:
+ log_message = "A variable length serial type was found in the first column serial types: {} while" \
+ "carving freeblocks with signatures: {}. Signatures starting with variable length serial " \
+ "types are not fully implemented and may result in carving false positives."
+ log_message = log_message.format(first_column_serial_types, simplified_signature)
+ logger.warn(log_message)
+ warn(log_message, RuntimeWarning)
+
+ # Retrieve and compile the serial type definition signature pattern
+ serial_type_definition_signature_pattern = compile(generate_signature_regex(simplified_signature, True))
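+
+        # For illustration (hypothetical values): a simplified signature is a list of per column serial type
+        # lists, e.g. [[0, 1], [-2], [4]] for a three column table whose first column is NULL or a 1 byte
+        # integer, second column is TEXT, and third column is a 4 byte integer. Passing True above generates
+        # the pattern without the first column's serial types so that records whose first serial type was
+        # overwritten by a freeblock header can still be matched.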
+
+ # Initialize the carved cells
+ carved_cells = []
+
+ # Iterate through the freeblocks
+ for freeblock in freeblocks:
+
+ # Get the content for the current freeblock
+ freeblock_content = freeblock.content
+
+ # Initialize the list for the serial type definition match objects
+ serial_type_definition_match_objects = []
+
+ # Find all matches for the serial type definition signature pattern
+ for serial_type_definition_match in serial_type_definition_signature_pattern.finditer(freeblock_content):
+ serial_type_definition_match_objects.append(serial_type_definition_match)
+
+ """
+
+            In order to carve the freeblocks we have to start from the last match and move backwards through the
+            matches in the freeblock. This is due to the fact that once a freeblock is made, it can be reallocated
+            and then have that entry deleted again, expanding the freeblock back to its original size. When a
+            freeblock is reallocated, the space needed is taken from the end of the freeblock rather than from
+            the beginning. This means that the ending portion (usually the data) of the previous freeblock that was
+            in that spot will be overwritten. Therefore, there is a good chance we should be able to parse out the
+            last match successfully, but we will end up having truncated carvings "beneath" the last one.
+
+ As an example freeblocks are overwritten in the following pattern:
+ [Third Freeblock Entry .............]
+ [Second Freeblock Entry ................]
+ [First Freeblock Entry .........................]
+
+ This can also be in the following pattern though:
+ [Allocated Cell Entry ..............]
+ [Second Freeblock Entry ................]
+ [First Freeblock Entry .........................]
+
+ In the above example we have the possibility of losing all of the data and being unable to parse anything
+ but the header of the previous freeblocks.
+
+ """
+
+ """
+
+ The cutoff offset will be initialized to the length of the freeblock content and then be updated for
+ "beneath" freeblock entries to be the starting offset of the previous entry. There is some variation on
+ if this is the actual cutoff or not but will always be after the actual cutoff when done this way.
+ It is just important to keep in mind that the previous freeblocks may actually be cutoff before this offset
+ and the "above" freeblocks may go back that length for things like payload size, row id, serial type header
+ length and the first serial type depending on the use case.
+
+ """
+
+ cutoff_offset = len(freeblock_content)
+
+ page_offset = version.get_page_offset(freeblock.page_number)
+
+ # Iterate through the serial type definition matches in reverse
+ for serial_type_definition_match in reversed(serial_type_definition_match_objects):
+
+ """
+
+ For the serial type definition match objects returned from the iterator above, the match object has a
+ start and a end function to get the beginning offset and ending offset. This is done by calling
+ start(0) or end (0) with 0 being the group number. The ending offset is exclusive
+ ie. [start(0):end(0)).
+
+ """
+
+ serial_type_definition_start_offset = serial_type_definition_match.start(0)
+ serial_type_definition_end_offset = serial_type_definition_match.end(0)
+ file_offset = page_offset + freeblock.start_offset + serial_type_definition_start_offset
+
+ try:
+
+ # Create and append the carved b-tree cell to the carved cells list
+ carved_cells.append(CarvedBTreeCell(version, file_offset, source, freeblock.page_number,
+ CELL_LOCATION.FREEBLOCK,
+ freeblock.index, freeblock_content,
+ serial_type_definition_start_offset,
+ serial_type_definition_end_offset, cutoff_offset,
+ number_of_columns, signature,
+ first_column_serial_types, freeblock.byte_size))
+
+ # Update the cutoff offset
+ cutoff_offset = serial_type_definition_start_offset
+
+ except (CellCarvingError, ValueError):
+ log_message = "Carved b-tree cell creation failed at file offset: {} page number: {} " \
+ "cell source: {} in location: {} with partial serial type definition " \
+ "start offset: {} and partial serial type definition end offset: {} with " \
+ "cutoff offset of: {} number of columns: {} for master schema " \
+ "entry with name: {} and table name: {}."
+ log_message = log_message.format(file_offset, freeblock.page_number, source,
+                                                     CELL_LOCATION.FREEBLOCK,
+ serial_type_definition_start_offset,
+ serial_type_definition_end_offset, cutoff_offset,
+ number_of_columns, signature.name, signature.table_name)
+ logger.warn(log_message)
+ warn(log_message, RuntimeWarning)
+
+ # Return the cells carved from the freeblocks
+ return carved_cells
+
+ @staticmethod
+ def carve_unallocated_space(version, source, page_number, unallocated_space_start_offset,
+ unallocated_space, signature, page_offset=None):
+
+ """
+
+ This function will carve the unallocated space with the signature specified.
+
+ Note: The signature that will be used from the signature object will be the simplified signature unless
+ one does not exist (in the case where one was generated with no row entries), in which case the
+ simplified schema signature will be used.
+
+ Note: The serial type definition nomenclature does not include the serial type header size field in reference
+ to the offsets and may also not include the first (or first byte of a multi-byte varint) serial type and
+              is therefore dubbed "definition" instead of "header", signifying only a portion of the header.
+
+ :param version:
+ :param source:
+ :param page_number:
+ :param unallocated_space_start_offset:
+ :param unallocated_space:
+ :param signature:
+ :param page_offset: Page offset if needed to be specified. Currently only used for proof of concept
+ journal page parsing.
+
+ :return:
+
+ """
+
+ logger = getLogger(LOGGER_NAME)
+
+ number_of_columns = signature.number_of_columns
+
+ simplified_signature = signature.simplified_signature
+
+ if not simplified_signature:
+ simplified_signature = signature.recommended_schema_signature
+ logger.debug("Using recommended schema signature: {}.".format(simplified_signature))
+ else:
+ logger.debug("Using simplified signature: {}.".format(simplified_signature))
+
+ if not simplified_signature:
+ log_message = "No signature was found."
+ logger.error(log_message)
+ raise CarvingError(log_message)
+
+ # Retrieve and compile the serial type definition signature pattern
+ serial_type_definition_signature_pattern = compile(generate_signature_regex(simplified_signature))
+
+ """
+
+ In reference for supporting freeblocks and additional use cases in unallocated space:
+
+ Currently, unallocated space is carved using a full signature (not removing the first serial type) in order
+ to detect deleted entries. This can result in the following two use cases in reference to deleted entries
+ in the unallocated space:
+ 1.) Cell entries that were deleted or left over from a previous page being reused that ended up in the
+ unallocated space where the serial type header (excepting possibly the header size) of the payload
+ is in tact. Due to the way cells are inserted from the back of the page moving forward it is very
+ likely to have the beginning of the cell as well (but not a certainty).
+ 2.) Freeblocks that had either a payload, row id, or serial type header size that one or more of which were
+ either 2 byte or greater varints. This would push the serial type header (excepting possibly the header
+ size) into the main body of the freeblock. This is due to the fact that the freeblock overwrites the first
+ 4 bytes of the entry with the next freeblock offset and freeblock size. A freeblock needs at least 4 bytes
+ to exist, and if not, it is a fragment. Keep in mind this is also assuming a b-tree table leaf page and
+ may not be the case for b-tree index pages or b-tree table interiors.
+
+            In comparison to the undetected use case below, it is important to note that the first serial type may
+            also be a varint greater than 2 bytes in length and therefore still be detected when use case #1 below
+            is true, but the size of the varint would be incorrectly determined, causing issues parsing the body of
+            the cell. Additional research and handling of this use case is needed.
+
+ The use of a "full" signature will not detect:
+        1.) Freeblocks whose payload length, row id, and serial type header size varints are each 1 byte will end up
+            having the first serial type overwritten (excepting the use case defined in #2 above), which will result
+            in the entries not being carved unless the signature without the first serial type is checked, as is done
+            for freeblocks.
+
+ There are a few ways to do this (very similar to the freeblock carving code above) and needs to
+ be implemented.
+
+ Discussion: There are a few ways to determine freeblocks. One way is to calculate the size of the serial type
+ definition plus 1 byte for the header (depending on size) and compare that to the previous byte to
+                    see if it matches. If it does, the full header should be intact and the body content can be
+ calculated from the serial type definition. (The body content may still be able to be calculated
+ from the serial type definition without finding the serial type header length assuming the rest of
+ the serial types are all existent (the first serial type or portion of first multi-byte varint
+ serial type is not missing). Once the body content and header content are calculated, moving
+ backwards the bytes can be checked for the size of the freeblock + 4 adding on one byte for each
+ byte gone back that does not match the size (this is to account for larger than 1 byte varints for
+ payload or row id). If this is within the acceptable range of the varint sizes and matches the
+ size, there is a good chance this is a freeblock.
+
+ Pseudocode:
+
+ serial_type_header_size =
+ ord(unallocated_space[serial_type_definition_start_offset - 1:
+ serial_type_definition_start_offset])
+
+ if serial_type_header_size ==
+ serial_type_definition_end_offset - serial_type_definition_start_offset + 1
+ This is the serial type header size (1 is added for the one byte serial type header byte size).
+ else:
+ This is not the serial type header size or the first serial type may be a multi-byte varint
+ use case which would then cause this process to move back one byte and repeat or the serial
+ type header size may be a multi-byte varint.
+
+ However the third use case below should be predetermined in the above serial_type_header_size
+ setting statement based on the size between the serial_type_definition_end_offset and
+ serial_type_definition_start_offset.
+
+ After the above:
+
+ Given additional_serial_type_header_bytes is the amount of extra bytes for the header calculated
+ above and header_start_offset refers to the location the full header starts at:
+ calculated_payload_length = additional_serial_type_header_bytes +
+ serial_type_definition_end_offset -
+ serial_type_definition_start_offset + body_content_size + 4
+ if calculated_payload_length ==
+ unpack(b">H", unallocated_space[header_start_offset - 2:header_start_offset])[0]:
+ There is a freeblock possibility but may also be a payload to a b-tree index cell.
+ else:
+ This may be a table leaf cell where this first number would be the row id, in which we should
+ reverse parse out the varint and then check the next index for the size (excepting adding in the
+ size of the row id since the payload size is only the actual payload following the row id).
+
+ A similar process could be used for parsing out cells that are not freeblocks in order to determine
+ things such as payload size, row id, serial type header length, or missing (or partially missing
+ portion of a multi-byte varint) first serial type in actual cells. This will be left up to the
+ CarvedBTreeCell class to do and the above documentation may end up applying more to that class
+                    than here.
+
+ Note: Overflow still needs to be addressed.
+
+ Note: The above use cases have been determined from investigation into how SQLite stores data and may not be
+ a complete list.
+
+ """
+
+ # Initialize the list for the serial type definition match objects
+ serial_type_definition_match_objects = []
+
+ # Find all matches for the serial type definition signature pattern
+ for serial_type_definition_match in serial_type_definition_signature_pattern.finditer(unallocated_space):
+ serial_type_definition_match_objects.append(serial_type_definition_match)
+
+ # Initialize the carved cells
+ carved_cells = []
+
+ """
+
+ Like above, in the freeblock carving code, we find all of the matches for the signature and then work in reverse
+ through the unallocated space. The idea here is very similar to the freeblock carving (see the documentation
+ above) since cells are added from the unallocated space at the end of the page moving back towards the front
+ of the page much like how cells are added back into freeblocks from the end if there is enough space.
+
+ """
+
+ """
+
+ The cutoff offset will be initialized to the length of the unallocated space and then be updated for
+ entries that may have been overwritten previously by the entries at the end of the unallocated space.
+ There is some variation on if this is the actual cutoff or not but will always be after the actual cutoff
+ when done this way. It is just important to keep in mind that the previous entries (including possibly
+ freeblocks) may actually be cutoff before this offset and the entries overwritten on top of previous entries
+ may go back that length for things like payload size, row id, serial type header length and the first serial
+ type depending on the use case.
+
+ """
+
+ cutoff_offset = len(unallocated_space)
+
+        # Retrieve the page offset if it was not passed in as a parameter (it should only be passed in for
+        # proof of concept journal file parsing).
+ if page_offset is None:
+ page_offset = version.get_page_offset(page_number)
+
+ # Iterate through the serial type definition matches in reverse
+ for serial_type_definition_match in reversed(serial_type_definition_match_objects):
+
+ """
+
+ For the serial type definition match objects returned from the iterator above, the match object has a
+                start and an end function to get the beginning and ending offsets. This is done by calling
+                start(0) or end(0) with 0 being the group number. The ending offset is exclusive, i.e. [start(0):end(0)).
+
+ """
+
+ serial_type_definition_start_offset = serial_type_definition_match.start(0)
+ serial_type_definition_end_offset = serial_type_definition_match.end(0)
+ file_offset = page_offset + unallocated_space_start_offset + serial_type_definition_start_offset
+
+ try:
+
+ # Create and append the carved b-tree cell to the carved cells list
+ carved_cells.append(CarvedBTreeCell(version, file_offset, source, page_number,
+ CELL_LOCATION.UNALLOCATED_SPACE, 0, unallocated_space,
+ serial_type_definition_start_offset,
+ serial_type_definition_end_offset, cutoff_offset,
+ number_of_columns, signature))
+
+ # Update the cutoff offset
+ cutoff_offset = serial_type_definition_start_offset
+
+ except (CellCarvingError, ValueError):
+ log_message = "Carved b-tree cell creation failed at file offset: {} page number: {} " \
+ "cell source: {} in location: {} with partial serial type definition " \
+ "start offset: {} and partial serial type definition end offset: {} with " \
+ "cutoff offset of: {} number of columns: {} for master schema " \
+ "entry with name: {} and table name: {}."
+ log_message = log_message.format(file_offset, page_number, source,
+ CELL_LOCATION.UNALLOCATED_SPACE,
+ serial_type_definition_start_offset,
+ serial_type_definition_end_offset, cutoff_offset,
+ number_of_columns, signature.name, signature.table_name)
+ logger.warn(log_message)
+ warn(log_message, RuntimeWarning)
+
+ """
+
+ At this point we have carved all the "full signatures" in reference to the full serial type definition in
+        the cell headers that we found. However, although the above may be freeblocks in the unallocated space (in
+        the use case where the payload length, row id, and/or payload header size varints add up to 4 or
+        more bytes), the use case still remains where all three are 1 byte, as is the first serial type. In this case
+        we would only have the 2nd through Nth serial types, as in the freeblock carving code above. Therefore, we
+        recompute the signature removing the first serial type, recheck for patterns, and if they do not match the
+        patterns above, add them as well.
+
+
+        Note: If this matches, it does not mean this is necessarily a freeblock since it could just have been a
+ cell removed and then overwritten partially by another cell. Use cases like these should be addressed
+ in the carved cell classes.
+
+ """
+
+ # Reset the signature pattern removing the first serial type and compile
+ serial_type_definition_signature_pattern = compile(generate_signature_regex(simplified_signature, True))
+
+ # Initialize the list for the partial serial type definition match objects
+ partial_serial_type_definition_match_objects = []
+
+ # Find all matches for the partial serial type definition signature pattern
+ for serial_type_definition_match in serial_type_definition_signature_pattern.finditer(unallocated_space):
+ partial_serial_type_definition_match_objects.append(serial_type_definition_match)
+
+ """
+
+ The partial serial type definition match objects should now be a superset of the serial type definition match
+ objects above. We now go through these match objects and remove any of the data segments found above by
+ comparing the indices.
+
+ Note: This is done after instead of before the full serial type signature matching since it is more conclusive
+ to carve the whole cells rather than the ones without the full serial type header.
+
+ Note: The indices should be updated with the correct cutoff offset and beginning offset where found in the
+ carved cells from the match objects. Currently, these indices only reflect the serial type definition
+ header. This will further improve the validity of the result set. This will be done once the carved
+ cell class and use cases are fully handled.
+
+ """
+
+        # Create a list of (start, end) index pairs for the serial type definition match objects, sorted by start index
+ serial_type_definition_match_objects_indices = sorted([(match_object.start(0), match_object.end(0))
+ for match_object in serial_type_definition_match_objects],
+ key=lambda x: x[0])
+
+ unallocated_space_length = len(unallocated_space)
+ serial_type_definition_match_objects_indices_length = len(serial_type_definition_match_objects_indices)
+ uncarved_unallocated_space_indices = []
+
+ # If there were no serial type definition matches, we set the whole unallocated space to be checked
+ if not serial_type_definition_match_objects_indices:
+ uncarved_unallocated_space_indices.append((0, unallocated_space_length))
+
+ else:
+ last_offset = None
+ for index, match_object_index in enumerate(serial_type_definition_match_objects_indices):
+
+ if index == 0 and index != len(serial_type_definition_match_objects_indices) - 1:
+
+ """
+
+ Check if we are at the first index and there are additional indices in the match object list. If
+ this is the case, add the section of data from the beginning of the unallocated data to the
+ beginning of this index. This is only done if data is found. If there is no data (i.e. the first
+ index of the first match object is the first index of the unallocated data), then we do not add
+ an entry on this first iteration.
+
+ """
+
+ if match_object_index[0] != 0:
+ uncarved_unallocated_space_indices.append((0, match_object_index[0]))
+ last_offset = match_object_index[1]
+
+ elif index == 0 and index == serial_type_definition_match_objects_indices_length - 1:
+
+ """
+
+ Check if we are at the first index and if there are no additional indexes in the match object. If
+ this is the case, we add an index from the beginning of the unallocated data to the first index of
+ the first (and only) match index. If there is data between the ending index of the match we are
+ currently looking at and the end of the unallocated space, we add an index from the ending match
+ index to the ending of the unallocated data.
+
+ """
+
+ uncarved_unallocated_space_indices.append((0, match_object_index[0]))
+ if match_object_index[1] != len(unallocated_space):
+ uncarved_unallocated_space_indices.append((match_object_index[1], unallocated_space_length))
+ last_offset = match_object_index[1]
+
+ elif index != 0 and index != serial_type_definition_match_objects_indices_length - 1:
+
+ """
+
+ If we are not on the first index and there are more indices to come, we just add the data portion
+ between the ending offset of the previous match and the beginning index of the current match.
+
+ """
+
+ uncarved_unallocated_space_indices.append((last_offset, match_object_index[0]))
+ last_offset = match_object_index[1]
+
+ elif index != 0 and index == serial_type_definition_match_objects_indices_length - 1:
+
+ """
+
+ If we are not on the first index and this is the last index of the match objects, we add an entry
+ between the ending offset of the previous match and the first index of this match object. Then, if
+ there is data left in the unallocated space between the ending index of this match object and the
+ end of the unallocated space, we add a final entry between these indices.
+
+ """
+
+ uncarved_unallocated_space_indices.append((last_offset, match_object_index[0]))
+ if match_object_index[1] != len(unallocated_space):
+ uncarved_unallocated_space_indices.append((match_object_index[1], unallocated_space_length))
+ else:
+
+ log_message = "Found invalid use case while carving unallocated space for page number: {} " \
+ "starting from the unallocated space start offset: {} with signature: {}."
+ log_message = log_message.format(page_number, unallocated_space_start_offset, signature.name)
+ logger.error(log_message)
+ raise CarvingError(log_message)
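+
+ # Illustrative worked example (assumed values): if the unallocated space is 100 bytes long and full
+ # signature matches were found at index ranges (5, 12) and (30, 40), the loop above yields
+ # uncarved_unallocated_space_indices == [(0, 5), (12, 30), (40, 100)], i.e. only the gaps between (and
+ # around) the already carved matches remain to be checked with the partial signature.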
+
+ """
+
+ Iterate through the uncarved portions of the unallocated space and update the cutoff offset to be the
+ minimum of the previous partial cutoff offset and the ending offset of the current uncarved unallocated
+ space index.
+
+ """
+
+ partial_cutoff_offset = len(unallocated_space)
+ for partial_serial_type_definition_match in reversed(partial_serial_type_definition_match_objects):
+ for uncarved_allocated_space_index in reversed(uncarved_unallocated_space_indices):
+
+ cutoff_offset = min(uncarved_allocated_space_index[1], partial_cutoff_offset)
+
+ partial_serial_type_definition_start_offset = partial_serial_type_definition_match.start(0)
+ partial_serial_type_definition_end_offset = partial_serial_type_definition_match.end(0)
+
+ if partial_serial_type_definition_start_offset >= uncarved_allocated_space_index[0] and \
+ partial_serial_type_definition_end_offset <= uncarved_allocated_space_index[1]:
+
+ relative_offset = unallocated_space_start_offset + partial_serial_type_definition_start_offset
+ file_offset = page_offset + relative_offset
+ first_column_serial_types = simplified_signature[0]
+
+ try:
+
+ # Create and append the carved b-tree cell to the carved cells list
+ carved_cells.append(CarvedBTreeCell(version, file_offset, source, page_number,
+ CELL_LOCATION.UNALLOCATED_SPACE,
+ 0, unallocated_space,
+ partial_serial_type_definition_start_offset,
+ partial_serial_type_definition_end_offset,
+ cutoff_offset, number_of_columns, signature,
+ first_column_serial_types))
+
+ # Update the partial cutoff offset
+ partial_cutoff_offset = partial_serial_type_definition_start_offset
+
+ except (CellCarvingError, ValueError):
+ log_message = "Carved b-tree cell creation failed at file offset: {} page number: {} " \
+ "cell source: {} in location: {} with partial serial type definition " \
+ "start offset: {} and partial serial type definition end offset: {} with " \
+ "partial cutoff offset of: {} number of columns: {} for master schema " \
+ "entry with name: {} and table name: {}."
+ log_message = log_message.format(file_offset, page_number, source,
+ CELL_LOCATION.UNALLOCATED_SPACE,
+ partial_serial_type_definition_start_offset,
+ partial_serial_type_definition_end_offset,
+ partial_cutoff_offset, number_of_columns, signature.name,
+ signature.table_name)
+ logger.warn(log_message)
+ warn(log_message, RuntimeWarning)
+
+ # Return the cells carved from the unallocated space
+ return carved_cells
diff --git a/sqlite_dissect/carving/rollback_journal_carver.py b/sqlite_dissect/carving/rollback_journal_carver.py
new file mode 100644
index 0000000..7003455
--- /dev/null
+++ b/sqlite_dissect/carving/rollback_journal_carver.py
@@ -0,0 +1,124 @@
+from binascii import hexlify
+from logging import getLogger
+from struct import unpack
+from sqlite_dissect.constants import FILE_TYPE
+from sqlite_dissect.constants import LOGGER_NAME
+from sqlite_dissect.constants import PAGE_TYPE
+from sqlite_dissect.carving.carver import SignatureCarver
+from sqlite_dissect.version_history import Commit
+
+"""
+
+rollback_journal_carver.py
+
+This script carves through a rollback journal file with the specified master schema entry and
+signature and returns the entries.
+
+This script holds the following object(s):
+RollBackJournalCarver(Carver)
+
+"""
+
+
+class RollBackJournalCarver(object):
+
+ @staticmethod
+ def carve(rollback_journal, version, master_schema_entry, signature):
+
+ logger = getLogger(LOGGER_NAME)
+
+ """
+
+ Read the page size in from the version class (the base SQLite database). This will be used instead of checking
+ the journal header since that is overwritten with zeros in most cases. If there is no database file, other
+ means of determining the page size, such as analyzing the journal file itself, could be used. This is outside
+ the current scope of this project and could be followed up on in the future for standalone rollback
+ journal carving.
+
+ """
+
+ page_size = version.page_size
+
+ """
+
+ The sector size is currently hard coded to the value typically observed (512 bytes).
+ Some research was done and this value appeared to be hard coded in the SQLite C library.
+ Newer versions of the library should be checked; this was observed in version 3090200.
+
+ """
+
+ sector_size = 512
+
+ # The page record header and checksum sizes are fixed
+ page_record_header_size = 4
+ page_record_checksum_size = 4
+
+ page_record_size = page_record_header_size + page_size + page_record_checksum_size
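+ # Illustrative sketch (assumed page size): with a 4096 byte database page, each page record occupies
+ # 4 + 4096 + 4 = 4104 bytes, and the first record begins at the assumed sector size offset of 512, so the
+ # Nth record (zero based) starts at offset 512 + N * 4104.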
+
+ # Initialize the carved commits
+ carved_commits = []
+
+ logger.debug("Starting carving table: %s... " % master_schema_entry.name)
+
+ has_data = True
+ offset = sector_size
+ while has_data:
+
+ page_number = unpack(b">I", rollback_journal.file_handle.read_data(offset, page_record_header_size))[0]
+ page_content = rollback_journal.file_handle.read_data(offset + page_record_header_size, page_size)
+ page_type = hexlify(page_content[:1])
+ page_checksum = hexlify(rollback_journal.file_handle.read_data(offset + page_record_header_size +
+ page_size, page_record_checksum_size))
+
+ logger.debug("At offset: %s page Number: %s of type: %s has content with checksum of: %s"
+ % (offset, page_number, page_type, page_checksum))
+
+ if page_type in ["0d", "05"]:
+
+ page_type_string = PAGE_TYPE.B_TREE_TABLE_LEAF if page_type == "0d" else PAGE_TYPE.B_TREE_TABLE_INTERIOR
+ carved_cells = SignatureCarver.carve_unallocated_space(version, FILE_TYPE.ROLLBACK_JOURNAL, page_number,
+ 0, page_content, signature,
+ offset + page_record_header_size)
+
+ commit = Commit(master_schema_entry.name, FILE_TYPE.ROLLBACK_JOURNAL, -1,
+ version.database_text_encoding, page_type_string, -1, None)
+ commit.carved_cells.update({cell.md5_hex_digest: cell for cell in carved_cells})
+ carved_commits.append(commit)
+
+ offset += page_record_size
+
+ # Check if the next page record is a full page record size or not
+ if (offset + page_record_size) >= rollback_journal.file_handle.file_size:
+
+ # The page record is cut off since it goes beyond the end of the file
+ has_data = False
+
+ """
+
+ This accounts for the last incomplete block/frame of the journal file for carving.
+
+ Since this isn't a full page record, we do not care about the checksum since it should be cut off.
+
+ """
+
+ page_number = unpack(b">I", rollback_journal.file_handle.read_data(offset, 4))[0]
+ page_content = rollback_journal.file_handle.read_data(offset + page_record_header_size,
+ rollback_journal.file_handle.file_size -
+ page_record_header_size - offset)
+ page_type = hexlify(page_content[:1])
+
+ if page_type in ["0d", "05"]:
+
+ page_type_string = PAGE_TYPE.B_TREE_TABLE_LEAF if page_type == "0d" \
+ else PAGE_TYPE.B_TREE_TABLE_INTERIOR
+ carved_cells = SignatureCarver.carve_unallocated_space(version, FILE_TYPE.ROLLBACK_JOURNAL,
+ page_number, 0, page_content, signature,
+ offset + page_record_header_size)
+
+ commit = Commit(master_schema_entry.name, FILE_TYPE.ROLLBACK_JOURNAL, -1,
+ version.database_text_encoding, page_type_string, -1, None)
+ commit.carved_cells.update({cell.md5_hex_digest: cell for cell in carved_cells})
+ carved_commits.append(commit)
+
+ logger.debug("Finished carving table: %s... " % master_schema_entry.name)
+ return carved_commits
diff --git a/sqlite_dissect/carving/signature.py b/sqlite_dissect/carving/signature.py
new file mode 100644
index 0000000..812b449
--- /dev/null
+++ b/sqlite_dissect/carving/signature.py
@@ -0,0 +1,1628 @@
+from abc import ABCMeta
+from abc import abstractmethod
+from copy import copy
+from logging import getLogger
+from re import sub
+from warnings import warn
+from sqlite_dissect.carving.utilities import get_content_size
+from sqlite_dissect.constants import LOGGER_NAME
+from sqlite_dissect.constants import MASTER_SCHEMA_ROW_TYPE
+from sqlite_dissect.constants import STORAGE_CLASS
+from sqlite_dissect.constants import TYPE_AFFINITY
+from sqlite_dissect.file.database.utilities import aggregate_leaf_cells
+from sqlite_dissect.file.database.utilities import get_pages_from_b_tree_page
+from sqlite_dissect.file.schema.master import OrdinaryTableRow
+from sqlite_dissect.file.schema.master import VirtualTableRow
+from sqlite_dissect.file.version_parser import VersionParser
+from sqlite_dissect.exception import SignatureError
+
+"""
+
+signature.py
+
+This script holds the objects for the signature generation of SQLite table and index b-trees for carving.
+
+This script holds the following object(s):
+Signature(VersionParser)
+SchemaColumnSignature(object)
+TableColumnSignature(object)
+TableRowSignature(object)
+ColumnSignature(object)
+ColumnFixedLengthSignature(ColumnSignature)
+ColumnVariableLengthSignature(ColumnSignature)
+ColumnReducedVariableLengthSignature(ColumnVariableLengthSignature)
+ColumnNonReducedVariableLengthSignature(ColumnVariableLengthSignature)
+
+"""
+
+
+class Signature(VersionParser):
+
+ def __init__(self, version_history, master_schema_entry, version_number=None, ending_version_number=None):
+
+ """
+
+
+
+ Note: The schema and table column signatures will be lists ordered in relation to the index of the column
+ referred to in the table. The table row signatures will be a dictionary indexed by the serial type
+ signature from the record representing the unique combination of serial types for that row pointing
+ to the related table row signature.
+
+ Note: The above note is not true for "without rowid" tables. A warning will be raised if this
+ case is encountered.
+
+ Note: It is important to pay attention to the column breakdown in the usage of this class in the case of an
+ altered table. This class leaves it up to the user to check for these fields and make use of them
+ accordingly.
+
+ :param version_history:
+ :param master_schema_entry:
+ :param version_number:
+ :param ending_version_number:
+
+ :return:
+
+ :raise:
+
+ """
+
+ # Call to the super class
+ super(Signature, self).__init__(version_history, master_schema_entry, version_number, ending_version_number)
+
+ logger = getLogger(LOGGER_NAME)
+
+ """
+
+ Since the index signatures have not been fully investigated, a warning is printed here to alert of this.
+
+ """
+
+ if master_schema_entry.row_type == MASTER_SCHEMA_ROW_TYPE.INDEX:
+ log_message = "An index row type was found for signature which is not fully supported for master " \
+ "schema entry root page number: {} row type: {} name: {} table name: {} and sql: {}."
+ log_message = log_message.format(master_schema_entry.root_page_number,
+ master_schema_entry.row_type, master_schema_entry.name,
+ master_schema_entry.table_name, master_schema_entry.sql)
+ logger.warn(log_message)
+ warn(log_message, RuntimeWarning)
+
+ if master_schema_entry.internal_schema_object:
+ log_message = "An internal schema object index row type was found for the version parser which is " \
+ "not fully supported for master schema entry root page number: {} type: {} name: {} " \
+ "table name: {} and sql: {} and may result in erroneous cells."
+ log_message = log_message.format(master_schema_entry.root_page_number,
+ master_schema_entry.row_type, master_schema_entry.name,
+ master_schema_entry.table_name, master_schema_entry.sql)
+ logger.warn(log_message)
+ warn(log_message, RuntimeWarning)
+
+ log_message = "Creating signature for master schema entry with name: {} table name: {} row type: {} and " \
+ "sql: {} for version number: {} and ending version number: {}."
+ log_message = log_message.format(self.name, self.table_name, self.row_type, self.sql,
+ self.parser_starting_version_number, self.parser_ending_version_number)
+ logger.debug(log_message)
+
+ """
+
+ Create and initialize the variables for the signature
+
+ The schema column signatures and table column signatures will be in order that the fields are in the table. The
+ table row signatures will be in a dictionary keyed off of the record serial type signature.
+
+ """
+
+ self.schema_column_signatures = []
+ self.table_row_signatures = {}
+ self.table_column_signatures = []
+
+ """
+
+ Below variables are declared for total records and unique records. These are counters to determine the number
+ of total rows reviewed across all versions (including duplicates) and the unique rows (non-duplicated) between
+ all versions. This is due to the fact that we can have multiple pages with the same data and only minor
+ additions/subtractions to that data. Therefore, total records will record the running total of all records
+ regardless of uniqueness and unique records will be the total number of records with no duplicates included.
+
+ Note: We include the row id into the uniqueness. This way similar signatures between different rows will
+ build up a more accurate probability.
+
+ """
+
+ self.total_records = 0
+ self.unique_records = 0
+
+ """
+
+ Derive the schema column signatures from the SQL statements in the master schema for the
+ table and index types.
+
+ Note: The order of column definitions will match the columns as defined in the schema SQL statement.
+
+ Note: IndexRow master schema entries do not have column definitions at this time, so we need to make sure
+ the object is an OrdinaryTableRow object. (VirtualTableRow objects and OrdinaryTableRow objects that
+ are "without rowid" tables do not have column definitions at this time either.) This results in only
+ normal tables currently having signatures. Warnings have already been raised above regarding these
+ use cases.
+
+ """
+
+ if isinstance(master_schema_entry, OrdinaryTableRow) and not master_schema_entry.without_row_id:
+ for column_definition in master_schema_entry.column_definitions:
+ self.schema_column_signatures.append(SchemaColumnSignature(column_definition))
+
+ if isinstance(master_schema_entry, VirtualTableRow):
+
+ """
+
+ Below we initialize variables for the signature to prevent issues with the stringify method. After that,
+ a warning message is printed and the application continues on since the virtual tables in SQLite are not
+ currently supported. All fields are set to the defaults (False and/or None/Empty values).
+
+ """
+ self.altered_columns = False
+ self.column_breakdown = {}
+
+ log_message = "Virtual table found in signature for master schema entry with name: {} table name: {} " \
+ "row type: {} and sql: {} for version number: {} and ending version number: {}. A " \
+ "signature will not be generated since virtual tables are not fully supported yet."
+ log_message = log_message.format(self.name, self.table_name, self.row_type, self.sql,
+ self.parser_starting_version_number, self.parser_ending_version_number)
+ getLogger(LOGGER_NAME).warn(log_message)
+ warn(log_message, RuntimeWarning)
+
+ elif self.parser_starting_version_number is not None and self.parser_ending_version_number is not None:
+
+ # Get the versions
+ versions = version_history.versions
+
+ """
+
+ Below the column definitions are pulled from the initial, base version, master schema. Since these columns
+ will stay the same across all updates to the master schema entry, it is safe to set them here. The only field
+ that can be updated in the master schema entry without causing a new master schema entry is the root page
+ number.
+
+ """
+
+ # Set the column definitions
+ column_definitions = master_schema_entry.column_definitions
+
+ # Create a set for accounted cells so we don't account for the same record twice across versions
+ accounted_for_cell_digests = set()
+
+ # Initialize the b-tree page numbers
+ root_b_tree_page_numbers = []
+
+ # Iterate through the versions in reference to this master schema entry
+ for version_number in range(self.parser_starting_version_number,
+ self.parser_ending_version_number + 1):
+
+ version = versions[version_number]
+ root_page_number = self.root_page_number_version_index[version_number]
+
+ b_tree_updated = False
+
+ # Check if this is the first version to be investigated
+ if version_number == self.parser_starting_version_number:
+ b_tree_updated = True
+
+ # Check if the root page number changed
+ elif root_page_number != self.root_page_number_version_index[version_number - 1]:
+ b_tree_updated = True
+
+ # Check if any of the non-root pages changed
+ elif [page_number for page_number in root_b_tree_page_numbers
+ if page_number in version.updated_b_tree_page_numbers]:
+ b_tree_updated = True
+
+ # Parse the b-tree page structure if it was updated
+ if b_tree_updated:
+
+ # Get the root page and root page numbers from the first version
+ root_page = version.get_b_tree_root_page(root_page_number)
+ root_b_tree_page_numbers = [b_tree_page.number for b_tree_page
+ in get_pages_from_b_tree_page(root_page)]
+
+ """
+
+ Below we aggregate the records together. This function returns the total number of records and
+ a dictionary of records indexed by their cell md5 hex digest. This dictionary may hold fewer
+ records than the total since records may have already been accounted for in previous versions
+ and are ignored when their cell md5 hex digests are already in the accounted for cell digests set.
+
+ Note: The number of unique records reflects the total of all records in terms of uniqueness
+ regardless of the number of columns that are reflected in each row.
+
+ """
+
+ total, records = aggregate_leaf_cells(root_page, accounted_for_cell_digests, True)
+
+ # Add the totals to the counts
+ self.total_records += total
+ self.unique_records += len(records)
+
+ """
+
+ The column definitions in the master schema entry are parsed in order. Therefore, the order of the
+ column definitions should be in the same order as the columns in the record. These orders are
+ assumed to be equivalent to each other.
+
+ Note: In SQLite, it is not possible to rename or remove columns, but columns can be added.
+ Therefore, some records may have fewer entries in them than the number of column definitions,
+ and the table row signatures may have a different number of columns (less than or equal to
+ the number of column definitions) in them.
+
+ """
+
+ # Iterate through each of the records
+ for cell_md5_hex_digest, record in records.iteritems():
+
+ """
+
+ Note: The serial type signature is a series of serial types in a string to determine the
+ structure of that record. For variable length columns, -2 is used for strings and
+ -1 is used for blobs. The variable length signatures are similar to Epilog.
+
+ """
+
+ # Check if the serial type signature of the record is not already in the row signatures
+ if record.serial_type_signature not in self.table_row_signatures:
+
+ # Create and add a new table row signature
+ table_row_signature = TableRowSignature(column_definitions, record)
+ self.table_row_signatures[record.serial_type_signature] = table_row_signature
+
+ # The signature already exists
+ else:
+
+ # Update the table row signature
+ self.table_row_signatures[record.serial_type_signature].update(record)
+
+ """
+
+ Iterate through each of the table row signatures and update the total number of records that were parsed
+ in order to create probability statistics for that row.
+
+ We also track the count of each row and then match that against the accounted for records for additional
+ validation.
+
+ """
+
+ total_table_row_signature_count = 0
+
+ # Iterate through the table row signatures and set the total rows and increment the count
+ for serial_type_signature, table_row_signature in self.table_row_signatures.iteritems():
+ table_row_signature.number_of_rows = self.unique_records
+ total_table_row_signature_count += table_row_signature.count
+
+ # Make sure the count of records match
+ if total_table_row_signature_count != self.unique_records:
+ log_message = "The total table row signature count: {} does not match the number of unique " \
+ "records: {} for master schema entry row type: {} with root page number: {} name: {} " \
+ "table name: {} and sql: {}."
+ log_message = log_message.format(total_table_row_signature_count, self.unique_records,
+ master_schema_entry.row_type, master_schema_entry.root_page_number,
+ master_schema_entry.name, master_schema_entry.table_name,
+ master_schema_entry.sql)
+ logger.error(log_message)
+ raise SignatureError(log_message)
+
+ """
+
+ Below we have to account for the use case of altered tables.
+
+ In order to do this we have an altered columns boolean that is set to true if this is detected. We also
+ create a dictionary to represent the breakdown of the columns:
+
+ column_breakdown[NUMBER_OF_COLUMNS] = (NUMBER_OF_ROWS, PROBABILITY)
+
+ where NUMBER_OF_ROWS is the number of rows that has exactly the NUMBER_OF_COLUMNS in it, and
+ where PROBABILITY is the NUMBER_OF_ROWS divided by the number of unique records.
+
+ Additionally, there may be no entries for the last modification to the table. For example, there may be
+ 5 rows with 10 columns, but the latest SQL/schema for the table shows that it has 11 columns. This can
+ occur if no rows are inserted after the last alter statement. In order to account for this, the number
+ of columns found for the schema is checked against the column breakdown dictionary and, if the number of
+ columns is not found, it is added to the dictionary with 0 NUMBER_OF_ROWS and 0 PROBABILITY. It is
+ important to note that it is only added if the number of columns in the SQL/schema is greater than the
+ number of columns in the rows. If the number of columns in the SQL/schema is less, then an exception
+ will be raised.
+
+ In the case that there are no entries in the table itself, the NUMBER_OF_ROWS and PROBABILITY will both
+ be set to 0 for the SQL/schema number of columns in the column breakdown.
+
+ It is up to the user of the signature class to check against the column breakdown in order to determine the
+ best way to carve the data they are looking at. This class merely supplies the information and leaves it up
+ to the user on how to make use of it.
+
+ Also, in regards to probability, the column signatures created have probability based off of the number of
+ rows that column appeared in. Therefore, columns added later through alter table statements will have
+ probability calculated based off of only the number of rows that had those columns in them. In order to
+ calculate the probability of a column signature across all rows, the probability of that signature should
+ be multiplied by the probability that the column shows up, which can be derived through the column breakdown
+ based off of its column index. A better way to do this may be found moving forward.
+
+ Note: The altered columns flag is not 100% deterministic. It can only be determined when:
+ 1.) The number of columns differs across rows
+ 2.) The number of columns in the SQL/schema is greater than the number of columns in the rows
+
+ Note: It may be better to find a way to correlate the altered columns flag to a master schema associated
+ class.
+
+ """
+
+ # Instantiate the altered columns flag and the column breakdown
+ self.altered_columns = False
+ self.column_breakdown = {}
+
+ # Iterate through all of the table row signatures and add up the counts of each one based on column count
+ for table_row_signature in self.table_row_signatures.values():
+ column_signature_length = len(table_row_signature.column_signatures)
+ if column_signature_length in self.column_breakdown:
+ self.column_breakdown[column_signature_length] += table_row_signature.count
+ else:
+ self.column_breakdown[column_signature_length] = table_row_signature.count
+
+ # Get the number of columns in the schema and add it to the column breakdown if not already added
+ schema_column_length = len(self.schema_column_signatures)
+ if schema_column_length not in self.column_breakdown:
+ self.column_breakdown[schema_column_length] = 0
+
+ # Iterate through the column breakdown and compute probabilities
+ for column_count in self.column_breakdown:
+ row_count = self.column_breakdown[column_count]
+ probability = float(row_count) / self.unique_records if self.unique_records else 0
+ self.column_breakdown[column_count] = (row_count, probability)
+
+ # The columns have been altered if there is more than one entry in the column breakdown
+ if len(self.column_breakdown) > 1:
+ self.altered_columns = True
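+
+ # Illustrative sketch (assumed values, not derived from any particular database): if 4 of 10 unique
+ # records had 9 columns and the remaining 6 had 10 columns, the breakdown computed above would be
+ # column_breakdown == {9: (4, 0.4), 10: (6, 0.6)} and altered_columns would be True, indicating the
+ # table was most likely altered at some point.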
+
+ """
+
+ At this point we have iterated through all the versions and found all of the table row signatures to each
+ unique row structure that we found. If there was no root page or no rows found in any of the pages, then
+ the table row signatures will be empty. Below we parse through each of the table row signatures and create
+ column signatures across them inverting the data so we can see the signatures in two ways. First, across
+ the rows, and second, across the columns.
+
+ """
+
+ # Check if there were table row signatures found
+ if self.table_row_signatures:
+
+ """
+
+ Next, we create a table row column dictionary with the column index as the key and the value an array
+ of serial types aggregated across all of the table row signatures of that column index. Once we get
+ the table row column serial type arrays, we create the table column signatures.
+
+ This process basically inverts the table row signatures in order to generate the table
+ column signatures.
+
+ Note: The column definitions in the master schema entry are parsed in order. Therefore, the order of
+ the column definitions should be in the same order as the columns in the record. Also, since the
+ table row signatures are created off of the record columns and definitions the columns in the
+ table row signature will also be in the same order. Previously, the column definition size was
+ used to iterate through each row with to get the columns pertaining to the column index of the
+ column definition. However, every row may not have every column and therefore the length of the
+ column signatures for each row being iterated through is used. This will occur if multiple
+ variations of columns occur in the row indicating a table that has been altered at some point.
+
+ Note: The indices of the column signatures should match the indices of the record columns and the
+ columns in the table row signatures since they are all derived originally from the master schema.
+ Below, the index in the range of column definitions size is used for the table row columns
+ creation and the column signatures in the table row signatures.
+
+ """
+
+ table_row_columns = {}
+
+ # Iterate through the table row signatures and create the table row columns dictionary
+ for table_row_md5_hex_digest, table_row_signature in self.table_row_signatures.iteritems():
+
+ # Iterate through all of the column signatures in the current table row signature
+ for column_index in range(len(table_row_signature.column_signatures)):
+
+ # Add or append the column signature in the table row columns dictionary
+ if column_index in table_row_columns:
+ table_row_columns[column_index].append(table_row_signature.column_signatures[column_index])
+ else:
+ table_row_columns[column_index] = [table_row_signature.column_signatures[column_index]]
+
+ # Iterate through the table row columns and create the table column signatures
+ for table_row_column_index, table_row_column_serial_type_array in table_row_columns.iteritems():
+ column_name = column_definitions[table_row_column_index].column_name
+ self.table_column_signatures.append(TableColumnSignature(table_row_column_index, column_name,
+ table_row_column_serial_type_array))
+
+ # No table row signatures were found
+ else:
+
+ """
+
+ Note: Both of these should be 0 if no table row signatures were found. Checking the total records
+ should actually be enough for this check but both are checked for additional validity.
+
+ """
+
+ # Make sure no records were found
+ if self.total_records or self.unique_records:
+ log_message = "The total records: {} and unique records: {} are both not 0 as expected for " \
+ "master schema entry row type: {} with root page number: {} name: {} table " \
+ "name: {} and sql: {}."
+ log_message = log_message.format(self.total_records, self.unique_records,
+ master_schema_entry.row_type, master_schema_entry.root_page_number,
+ master_schema_entry.name, master_schema_entry.table_name,
+ master_schema_entry.sql)
+ logger.error(log_message)
+ raise SignatureError(log_message)
+
+ """
+
+ At this point we now have two sets of signatures depending on the way you want to view the table signatures.
+ 1.) self.table_row_signatures: Each unique row of the table in relation to serial types with probability of
+ each row and column serial type if it is a string or blob.
+ 2.) self.table_column_signatures: Each column of the table with the serial types realized across all the
+ rows along with probability of each serial type in respect to that
+ column.
+
+ """
+
+ """
+
+ Since we may not have records, and may possibly not have a schema to parse schema column signatures from
+ (depending on whether it is a virtual table, internal schema object, etc.), we check the lengths of the schema
+ column signatures and table column signatures so that if both signatures exist, the column lengths must
+ be equal. We take the max of the two lengths as the number of columns.
+
+ """
+
+ schema_column_signatures_length = len(self.schema_column_signatures)
+ table_column_signatures_length = len(self.table_column_signatures)
+
+ if schema_column_signatures_length and table_column_signatures_length:
+ if schema_column_signatures_length != table_column_signatures_length:
+ log_message = "The schema column signatures length: {} is not equal to the table column signatures " \
+ "length: {} for master schema entry row type: {} with root page number: {} name: {} " \
+ "table name: {} and sql: {}."
+ log_message = log_message.format(schema_column_signatures_length, table_column_signatures_length,
+ master_schema_entry.row_type, master_schema_entry.root_page_number,
+ master_schema_entry.name, master_schema_entry.table_name,
+ master_schema_entry.sql)
+ logger.error(log_message)
+ raise SignatureError(log_message)
+
+ self.number_of_columns = max(schema_column_signatures_length, table_column_signatures_length)
+
+ def stringify(self, padding="", print_table_row_signatures=True, print_schema_column_signatures=True,
+ print_table_column_signatures=True, print_column_signatures=True):
+ string = "\n" \
+ + padding + "Number of Columns: {}\n" \
+ + padding + "Total Records: {}\n" \
+ + padding + "Unique Records: {}\n" \
+ + padding + "Altered Columns: {}\n" \
+ + padding + "Column Breakdown: {}\n" \
+ + padding + "Schema Column Signatures Length: {}\n" \
+ + padding + "Table Row Signatures Length: {}\n" \
+ + padding + "Table Column Signatures Length: {}\n" \
+ + padding + "Recommended Schema Column Signature: {}\n" \
+ + padding + "Complete Schema Column Signature: {}\n" \
+ + padding + "Focused Signature: {}\n" \
+ + padding + "Simplified Signature: {}\n" \
+ + padding + "Focused Probability Signature: {}\n" \
+ + padding + "Simplified Probability Signature: {}\n" \
+ + padding + "Epilog Schema Signature: {}\n" \
+ + padding + "Epilog Focused Signature: {}\n" \
+ + padding + "Epilog Simplified Signature: {}"
+ string = string.format(self.number_of_columns,
+ self.total_records,
+ self.unique_records,
+ self.altered_columns,
+ self.column_breakdown,
+ len(self.schema_column_signatures),
+ len(self.table_row_signatures),
+ len(self.table_column_signatures),
+ self.recommended_schema_signature,
+ self.complete_schema_signature,
+ self.focused_signature,
+ self.simplified_signature,
+ self.focused_probabilistic_signature,
+ self.simplified_probabilistic_signature,
+ self.epilog_schema_signature,
+ self.epilog_focused_signature,
+ self.epilog_simplified_signature)
+ if print_schema_column_signatures:
+ for schema_column_signature in self.schema_column_signatures:
+ signature_string = "\n" + padding + "Schema Column Signature: {}"
+ signature_string = signature_string.format(schema_column_signature.stringify("\t"))
+ string += signature_string
+ if print_table_row_signatures:
+ for table_row_md5_hex_digest, table_row_signature in self.table_row_signatures.iteritems():
+ signature_string = "\n" + padding + "Table Row Signature:\n{}"
+ signature_string = signature_string.format(table_row_signature.stringify("\t", print_column_signatures))
+ string += signature_string
+ if print_table_column_signatures:
+ for table_column_signature in self.table_column_signatures:
+ signature_string = "\n" + padding + "Table Column Signature: {}"
+ signature_string = signature_string.format(table_column_signature.stringify("\t",
+ print_column_signatures))
+ string += signature_string
+ return super(Signature, self).stringify(padding) + string
+
+ @property
+ def epilog_focused_signature(self):
+
+ epilog_focused_signature = []
+
+ for column_signature in self.focused_signature:
+
+ # Copy the column signature as a base
+ epilog_column_signature = copy(column_signature)
+
+ """
+
+ Epilog does not log the 8 and 9 serial types in the focused schema. Instead it uses serial type 1 for
+ 8 and 9.
+
+ In order to represent 8 and 9 serial types in epilog column signatures, after epilog replaces the 8 or 9
+ with a 1, it sets the min and max fields appropriately for that field, for example setting max = 1.
+
+ More investigation needs to go into the use of epilog signatures with 8 and 9.
+
+ """
+
+ insert_single_byte_integer = False
+
+ if 8 in epilog_column_signature:
+ epilog_column_signature.remove(8)
+ insert_single_byte_integer = True
+
+ if 9 in epilog_column_signature:
+ epilog_column_signature.remove(9)
+ insert_single_byte_integer = True
+
+ if insert_single_byte_integer and 1 not in epilog_column_signature:
+ epilog_column_signature.append(1)
+
+ epilog_focused_signature.append(sorted(epilog_column_signature, key=int))
+
+ return epilog_focused_signature
+
+ @property
+ def epilog_schema_signature(self):
+
+ epilog_schema_signature = []
+
+ for schema_column_signature in self.schema_column_signatures:
+
+ """
+
+ Note: The recommended signature is used here instead of the complete signature since this seems more in
+ line with the epilog signatures themselves, along with reducing a lot of serial types in the complete
+ signature that may not apply.
+
+ """
+
+ # Copy the recommended signature from this particular schema column signature as a base
+ epilog_column_signature = copy(schema_column_signature.recommended_signature)
+
+ # Append a null value as epilog does if it is not in the column signature already
+ if 0 not in epilog_column_signature:
+ epilog_column_signature.append(0)
+
+ epilog_schema_signature.append(sorted(epilog_column_signature, key=int))
+
+ return epilog_schema_signature
+
+ @property
+ def epilog_simplified_signature(self):
+
+ epilog_simplified_signature = []
+
+ for column_signature in self.simplified_signature:
+
+ # Copy over the like serial types between this column signature and the epilog column signature
+ epilog_column_signature = [x for x in column_signature if x in [-2, -1, 0, 7]]
+
+ """
+
+ Check if any of the integer serial types are in the column signature and add all integer serial
+ types if any of them exist since this is how epilog seems to do it. However, there may be use
+ cases in regards to 8 and 9 being used for non-integer storage classes.
+
+ """
+
+ integer_serial_types = [1, 2, 3, 4, 5, 6, 8, 9]
+ if len(set(integer_serial_types).intersection(set(column_signature))):
+ epilog_column_signature.extend(integer_serial_types)
+
+ epilog_simplified_signature.append(sorted(epilog_column_signature, key=int))
+
+ return epilog_simplified_signature
+
+ @property
+ def complete_schema_signature(self):
+ simplified_signatures = []
+ for schema_column_signature in self.schema_column_signatures:
+ simplified_signatures.append(schema_column_signature.complete_signature)
+ return simplified_signatures
+
+ @property
+ def focused_probabilistic_signature(self):
+ focused_signatures = []
+ for table_column_signature in self.table_column_signatures:
+ focused_signatures.append(table_column_signature.focused_probabilistic_signature)
+ return focused_signatures
+
+ @property
+ def focused_signature(self):
+ focused_signatures = []
+ for table_column_signature in self.table_column_signatures:
+ focused_signatures.append(table_column_signature.focused_signature)
+ return focused_signatures
+
+ @property
+ def recommended_schema_signature(self):
+ simplified_signatures = []
+ for schema_column_signature in self.schema_column_signatures:
+ simplified_signatures.append(schema_column_signature.recommended_signature)
+ return simplified_signatures
+
+ @property
+ def simplified_probabilistic_signature(self):
+ simplified_signatures = []
+ for table_column_signature in self.table_column_signatures:
+ simplified_signatures.append(table_column_signature.simplified_probabilistic_signature)
+ return simplified_signatures
+
+ @property
+ def simplified_signature(self):
+ simplified_signatures = []
+ for table_column_signature in self.table_column_signatures:
+ simplified_signatures.append(table_column_signature.simplified_signature)
+ return simplified_signatures
+
+
+class SchemaColumnSignature(object):
+
+ """
+
+ SchemaColumnSignature
+
+ This class will take a column definition and create a schema column signature from it. This is mostly useful
+ in the case where there are no row entries in the table and a signature has to be built directly off the data
+ types in the column definition. Otherwise, the table column signature or table row signature would be recommended.
+ This is due to the fact that this signature cannot validate that the fields will be the types derived from the data
+ types of the column due to the way SQLite works with storage classes and type affinities. This class will retrieve
+ the type affinity derived from the column data type (if specified) and base the signatures off of those affinities.
+ Due to this, there will be two signatures in this class that can be retrieved:
+
+ 1.) Recommended Signature: The recommended signature for what is most likely to be seen in the columns based on the
+ type affinity.
+
+ The recommended signature will be based off the data type and recommended storage class used for that data type,
+ if specified. The following serial types are used for the following type affinities:
+
+ Type Affinity Serial Type Signature
+ INTEGER [1, 2, 3, 4, 5, 6, 8, 9]
+ REAL [1, 2, 3, 4, 5, 6, 7, 8, 9]
+ NUMERIC [1, 2, 3, 4, 5, 6, 7, 8, 9]
+ TEXT [-2]
+ BLOB (or if not specified) [-1]
+
+ 2.) Complete Signature: The full possibility of what can be seen in the columns based on the type affinity.
+
+ Unfortunately, almost every type affinity can be stored as any storage class with the exception of the TEXT
+ type affinity. The storage class is derived from the combination of the type affinity and the actual value.
+ Therefore the complete signature will include all storage classes for every type affinity except TEXT, which
+ will only include the TEXT, BLOB, and NULL storage classes. (The TEXT, BLOB and NULL storage classes can be
+ used for all type affinities.)
+
+ Type Affinity Storage Class
+ INTEGER INTEGER, REAL, TEXT, BLOB, NULL
+ REAL INTEGER, REAL, TEXT, BLOB, NULL
+ NUMERIC INTEGER, REAL, TEXT, BLOB, NULL
+ TEXT TEXT, BLOB, NULL
+ BLOB (or if not specified) INTEGER, REAL, TEXT, BLOB, NULL
+
+ Due to this, similar to above, there is also recommended storage class and possible storage class array for
+ what the storage classes of the particular column may be.
+
+ However, the REAL type affinity may use the INTEGER storage class to store its values in the file but
+ reads them back out as REAL even though that is not what is in the file. This conversion is done behind
+ the scenes in SQLite and therefore the possible storage classes for REAL can be updated as:
+
+ REAL REAL, TEXT, BLOB, NULL
+
+ This is a very important (hidden) use case to keep in mind.
+
+ This results in all type affinities having a complete signature of: [-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+ except for the TEXT type affinity, which has a complete signature of: [-2, -1, 0].
+
+ Since many storage classes are possible for each data type, the possible storage classes are set in an array and
+ are as specified above.
+
+ Note: Serial types 8 and 9 are used in all recommended signatures (except TEXT and BLOB) since these two types
+ are the 0 and 1 constants, which are used frequently in order to reserve space in the SQLite file.
+
+ Note: In the column definition, the derived data type name may be None if no data type was specified in the
+ SQL. If this is the case, the data type will be invalid and the type affinity will be BLOB per the
+ way affinities and storage classes are related depending on data type to the SQLite documentation.
+
+ Note: Since TEXT and BLOB are variable length data types, -1 will be used to represent a BLOB and -2 will be used
+ to represent a string. This is similar to Epilog's handling of variable length data types in signatures.
+
+ Note: There may be the possibility that columns were added causing inconsistencies between previous versions of the
+ row data that may not be picked up if solely going off of a schema based signature. However, if there is no
+ data to derive a signature from, we have no other recourse but to use the schema signature. In the future,
+ signature files may be imported and exported for this purpose based on OS, application, and version.
+
+ """
+
+ def __init__(self, column_definition):
+
+ self.derived_data_type_name = column_definition.derived_data_type_name
+ self.data_type = column_definition.data_type
+ self.type_affinity = column_definition.type_affinity
+
+ if self.type_affinity == TYPE_AFFINITY.INTEGER:
+
+ self.recommended_storage_class = STORAGE_CLASS.INTEGER
+ self.possible_storage_classes = [STORAGE_CLASS.INTEGER, STORAGE_CLASS.REAL, STORAGE_CLASS.TEXT,
+ STORAGE_CLASS.BLOB, STORAGE_CLASS.NULL]
+
+ self.recommended_signature = [1, 2, 3, 4, 5, 6, 8, 9]
+ self.complete_signature = [-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+
+ elif self.type_affinity == TYPE_AFFINITY.REAL:
+
+ self.recommended_storage_class = STORAGE_CLASS.REAL
+ self.possible_storage_classes = [STORAGE_CLASS.REAL, STORAGE_CLASS.TEXT,
+ STORAGE_CLASS.BLOB, STORAGE_CLASS.NULL]
+
+ self.recommended_signature = [1, 2, 3, 4, 5, 6, 7, 8, 9]
+ self.complete_signature = [-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+
+ elif self.type_affinity == TYPE_AFFINITY.TEXT:
+
+ self.recommended_storage_class = TYPE_AFFINITY.TEXT
+ self.possible_storage_classes = [STORAGE_CLASS.TEXT, STORAGE_CLASS.BLOB, STORAGE_CLASS.NULL]
+
+ self.recommended_signature = [-2]
+ self.complete_signature = [-2, -1, 0]
+
+ elif self.type_affinity == TYPE_AFFINITY.BLOB:
+
+ self.recommended_storage_class = TYPE_AFFINITY.BLOB
+ self.possible_storage_classes = [STORAGE_CLASS.INTEGER, STORAGE_CLASS.REAL, STORAGE_CLASS.TEXT,
+ STORAGE_CLASS.BLOB, STORAGE_CLASS.NULL]
+
+ self.recommended_signature = [-1]
+ self.complete_signature = [-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+
+ elif self.type_affinity == TYPE_AFFINITY.NUMERIC:
+
+ self.recommended_storage_class = TYPE_AFFINITY.NUMERIC
+ self.possible_storage_classes = [STORAGE_CLASS.INTEGER, STORAGE_CLASS.REAL, STORAGE_CLASS.TEXT,
+ STORAGE_CLASS.BLOB, STORAGE_CLASS.NULL]
+
+ self.recommended_signature = [1, 2, 3, 4, 5, 6, 7, 8, 9]
+ self.complete_signature = [-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+
+ else:
+
+ log_message = "Invalid type affinity found: {}.".format(self.type_affinity)
+ getLogger(LOGGER_NAME).error(log_message)
+ raise SignatureError(log_message)
+
+ def __repr__(self):
+ return self.__str__().encode("hex")
+
+ def __str__(self):
+ return sub("\t", "", sub("\n", " ", self.stringify()))
+
+ def stringify(self, padding=""):
+ string = padding + "Derived Data Type Name: {}\n" \
+ + padding + "Data Type: {}\n" \
+ + padding + "Type Affinity: {}\n" \
+ + padding + "Recommended Storage Class: {}\n" \
+ + padding + "Possible Storage Classes: {}\n" \
+ + padding + "Recommended Signature: {}\n" \
+ + padding + "Complete Signature: {}"
+ string = string.format(self.derived_data_type_name,
+ self.data_type,
+ self.type_affinity,
+ self.recommended_storage_class,
+ self.possible_storage_classes,
+ self.recommended_signature,
+ self.complete_signature)
+ return string
+
+
+class TableColumnSignature(object):
+
+ def __init__(self, index, name, column_signatures):
+
+ self._logger = getLogger(LOGGER_NAME)
+
+ self.count = 0
+ self.index = index
+ self.name = name
+ self.column_signatures = {}
+
+ for column_signature in column_signatures:
+
+ if column_signature.index != self.index:
+ log_message = "Invalid column signature index: {} found for table column signature with index: {} " \
+ "and name: {}."
+ log_message = log_message.format(column_signature.index, self.index, self.name)
+ self._logger.error(log_message)
+ raise SignatureError(log_message)
+
+ if column_signature.name != self.name:
+ log_message = "Invalid column signature name: {} found for table column signature with name: {} " \
+ "and name: {}."
+ log_message = log_message.format(column_signature.name, self.index, self.name)
+ self._logger.error(log_message)
+ raise SignatureError(log_message)
+
+ self.count += column_signature.count
+
+ if column_signature.serial_type in self.column_signatures:
+
+ if isinstance(column_signature, ColumnFixedLengthSignature):
+ updated_column_signature = self.column_signatures[column_signature.serial_type]
+ updated_column_signature.update(column_signature.serial_type, column_signature.count)
+
+ elif isinstance(column_signature, ColumnVariableLengthSignature):
+ updated_column_signature = self.column_signatures[column_signature.serial_type]
+ updated_column_signature.update(column_signature.serial_type, column_signature.count,
+ column_signature.variable_length_serial_types)
+
+ else:
+ log_message = "Invalid column signature type: {} found for table column signature with index: {} " \
+ "and name: {}."
+ log_message = log_message.format(type(column_signature), self.index, self.name)
+ self._logger.error(log_message)
+ raise SignatureError(log_message)
+
+ else:
+
+ if isinstance(column_signature, ColumnFixedLengthSignature):
+ new_column_signature = ColumnFixedLengthSignature(index, column_signature.name,
+ column_signature.serial_type,
+ column_signature.count)
+ self.column_signatures[column_signature.serial_type] = new_column_signature
+
+ elif isinstance(column_signature, ColumnVariableLengthSignature):
+ new_column_signature = ColumnReducedVariableLengthSignature(index, column_signature.name,
+ column_signature.serial_type,
+ column_signature.count,
+ column_signature.variable_length_serial_types)
+ self.column_signatures[column_signature.serial_type] = new_column_signature
+
+ else:
+ log_message = "Invalid column signature type: {} found for table column signature with index: {} " \
+ "and name: {}."
+ log_message = log_message.format(type(column_signature), self.index, self.name)
+ self._logger.error(log_message)
+ raise SignatureError(log_message)
+
+ for column_signature_index, column_signature in self.column_signatures.iteritems():
+ column_signature.number_of_rows = self.count
+
+ def __repr__(self):
+ return self.__str__().encode("hex")
+
+ def __str__(self):
+ return sub("\t", "", sub("\n", " ", self.stringify()))
+
+ def stringify(self, padding="", print_column_signatures=True):
+ string = padding + "Index: {}\n" \
+ + padding + "Name: {}\n" \
+ + padding + "Count: {}\n" \
+ + padding + "Focused Signature: {}\n" \
+ + padding + "Simple Signature: {}\n" \
+ + padding + "Column Signature Length: {}"
+ string = string.format(self.index,
+ self.name,
+ self.count,
+ self.focused_signature,
+ self.simplified_signature,
+ len(self.column_signatures))
+ if print_column_signatures:
+ for column_signature_index, column_signature in self.column_signatures.iteritems():
+ string += "\n" + padding + "Column Signature:\n{}".format(column_signature.stringify(padding + "\t"))
+ return string
+
+ @property
+ def focused_probabilistic_signature(self):
+ focused_signatures = []
+ for column_signature_index, column_signature in self.column_signatures.iteritems():
+ if isinstance(column_signature, ColumnVariableLengthSignature):
+ for serial_type in column_signature.variable_length_serial_types:
+ serial_type_probability = column_signature.get_variable_length_serial_type_probability(serial_type)
+ focused_signatures.append((serial_type, serial_type_probability))
+ elif isinstance(column_signature, ColumnFixedLengthSignature):
+ focused_signatures.append((column_signature.serial_type, column_signature.probability))
+ else:
+ log_message = "Invalid column signature type: {} found for table column signature with index: {} " \
+ "and name: {}."
+ log_message = log_message.format(type(column_signature), self.index, self.name)
+ self._logger.error(log_message)
+ raise ValueError(log_message)
+ return sorted(focused_signatures, key=lambda x: x[0])
+
+ @property
+ def focused_signature(self):
+ focused_signatures = []
+ for column_signature_index, column_signature in self.column_signatures.iteritems():
+ if isinstance(column_signature, ColumnVariableLengthSignature):
+ focused_signatures.extend(column_signature.variable_length_serial_types.keys())
+ elif isinstance(column_signature, ColumnFixedLengthSignature):
+ focused_signatures.append(column_signature.serial_type)
+ else:
+ log_message = "Invalid column signature type: {} found for table column signature with index: {} " \
+ "and name: {}."
+ log_message = log_message.format(type(column_signature), self.index, self.name)
+ self._logger.error(log_message)
+ raise ValueError(log_message)
+ return sorted(focused_signatures, key=int)
+
+ @property
+ def simplified_probabilistic_signature(self):
+ simplified_signatures = []
+ for column_signature_index, column_signature in self.column_signatures.iteritems():
+ simplified_signatures.append((column_signature.serial_type, column_signature.probability))
+ return sorted(simplified_signatures, key=lambda x: x[0])
+
+ @property
+ def simplified_signature(self):
+ simplified_signatures = []
+ for column_signature_index, column_signature in self.column_signatures.iteritems():
+ simplified_signatures.append(column_signature.serial_type)
+ return sorted(simplified_signatures, key=int)
+
+
+class TableRowSignature(object):
+
+ """
+
+ TableRowSignature
+
+ This class represents a signature of a particular row in a table. The idea is that each table has similar rows
+ in respect to their serial type ordering (storage classes and type affinities). An array is made of these
+ representing all signatures in a table and then can be inverted to represent the column signatures of a table.
+
+ Note: The number of columns in a table row signature may be equal to or less than the number of column definitions
+ since columns can be added over time. However, columns cannot be removed or renamed in SQLite.
+
+ Note: ColumnFixedLengthSignature column signatures will always have a probability of 1 in table row signatures,
+ since this is identifying a unique combination of column signatures (serial types). The
+ ColumnVariableLengthSignature column signatures will have a similar probability of 1 in reference to TEXT
+ and BLOB storage classes but may differ in the variable lengths themselves. Due to this, there are no
+ probabilistic signatures for table row signatures as there are for table column signatures.
+
+ """
+
+ def __init__(self, column_definitions, record):
+
+ """
+
+ Constructor.
+
+ Note: Table row signatures are determined from the record serial type signature. Rows with the same serial
+ type signature for records will be grouped into individual table row signatures and "counted".
+
+ Note: The column definitions array and the record columns in the record are relative to each other in terms
+ of order since the column definitions are pulled from the master schema.
+
+ :param column_definitions:
+ :param record:
+
+ :return:
+
+ """
+
+ self._logger = getLogger(LOGGER_NAME)
+
+ # Get the record columns
+ record_columns = record.record_columns
+
+ self.count = 1
+ self.column_signatures = {}
+ self.record_serial_type_signature = record.serial_type_signature
+
+ """
+
+ Below we check to make sure the number of record columns for this table row signature is less than or equal to
+ the number of column definitions. Since columns can be added, but not removed or renamed, the number of record
+ columns can be less than the number of column definitions. However, added columns are always appended to the
+ table and therefore the column definitions will align up to the number of record columns that are found.
+
+ We raise an exception if we find that the number of record columns is greater than the number of column
+ definitions. If we find that the number of record columns is less than the number of column definitions, we
+ print a debug message.
+
+ """
+
+ # Check the length of the column definitions to the record columns
+ if len(column_definitions) != len(record_columns):
+
+ # Check if the number of column definitions is less than the number of record columns
+ if len(column_definitions) < len(record_columns):
+ log_message = "The length of column definitions: {} is less than the record column length: {} " \
+ "for table row signature with record serial type signature: {}."
+ log_message = log_message.format(len(column_definitions), len(record_columns),
+ self.record_serial_type_signature)
+ self._logger.error(log_message)
+ raise ValueError(log_message)
+
+ # The number of column definitions is greater than the number of record columns
+ else:
+ log_message = "The length of column definitions: {} is greater than the record column length: {} " \
+ "for table row signature with record serial type signature: {}."
+ log_message = log_message.format(len(column_definitions), len(record_columns),
+ self.record_serial_type_signature)
+ self._logger.debug(log_message)
+
+ """
+
+        Note: The count is the number of rows that were found with this serial type signature whereas the number of
+              rows is the total number of rows in the table this table row signature is being derived from.
+              Therefore, the probability of this table row signature occurring in the table is the count/total.
+
+ """
+
+ self._number_of_rows = None
+
+ for index in range(len(record_columns)):
+
+ column_name = column_definitions[index].column_name
+ serial_type = record_columns[index].serial_type
+
+ if 0 <= serial_type <= 9:
+ self.column_signatures[index] = ColumnFixedLengthSignature(index, column_name, serial_type)
+ elif serial_type >= 12:
+ self.column_signatures[index] = ColumnNonReducedVariableLengthSignature(index, column_name, serial_type)
+ else:
+ log_message = "Invalid serial type: {} for table row signature with record serial type signature: {}."
+ log_message = log_message.format(serial_type, self.record_serial_type_signature)
+ self._logger.error(log_message)
+ raise SignatureError(log_message)
+
+ def __repr__(self):
+ return self.__str__().encode("hex")
+
+ def __str__(self):
+ return sub("\t", "", sub("\n", " ", self.stringify()))
+
+ def stringify(self, padding="", print_column_signatures=True):
+ string = padding + "Record Serial Type Signature: {}\n" \
+ + padding + "Count: {}\n" \
+ + padding + "Number of Rows: {}\n" \
+ + padding + "Probability: {}\n" \
+ + padding + "Focused Signature: {}\n" \
+ + padding + "Simple Signature: {}\n" \
+ + padding + "Column Signature Length: {}"
+ string = string.format(self.record_serial_type_signature,
+ self.count,
+ self.number_of_rows,
+ self.probability,
+ self.focused_signature,
+ self.simplified_signature,
+ len(self.column_signatures))
+ if print_column_signatures:
+ for column_signature_index, column_signature in self.column_signatures.iteritems():
+ string += "\n" + padding + "Column Signature:\n{}".format(column_signature.stringify(padding + "\t"))
+ return string
+
+ @property
+ def focused_signature(self):
+ focused_signatures = []
+ for column_signature_index, column_signature in self.column_signatures.iteritems():
+ if isinstance(column_signature, ColumnVariableLengthSignature):
+ focused_signatures.append(sorted(column_signature.variable_length_serial_types.keys(), key=int))
+ elif isinstance(column_signature, ColumnFixedLengthSignature):
+ focused_signatures.append([column_signature.serial_type])
+ else:
+ log_message = "Invalid column signature type: {} found for table row signature with record serial " \
+ "type signature: {}."
+ log_message = log_message.format(type(column_signature), self.record_serial_type_signature)
+ self._logger.error(log_message)
+ raise ValueError(log_message)
+ return focused_signatures
+
+ @property
+ def number_of_rows(self):
+
+ """
+
+
+
+ Note: A value of None will be returned if the number of rows is not set.
+
+ :return:
+
+ """
+
+ return self._number_of_rows
+
+ @number_of_rows.setter
+ def number_of_rows(self, number_of_rows):
+
+ if number_of_rows <= 0 or number_of_rows < self.count:
+ log_message = "Invalid number of rows: {} for table row signature with record serial type signature: {}."
+ log_message = log_message.format(number_of_rows, self.record_serial_type_signature)
+ self._logger.error(log_message)
+ raise ValueError(log_message)
+
+ self._number_of_rows = number_of_rows
+
+ for column_signature_index, column_signature in self.column_signatures.iteritems():
+ column_signature.number_of_rows = number_of_rows
+
+ @property
+ def probability(self):
+
+ """
+
+
+
+ Note: A value of None will be returned if the number of rows is not set.
+
+ :return:
+
+ """
+
+ if self._number_of_rows:
+ return float(self.count) / self._number_of_rows
+ return None
+
+ @property
+ def simplified_signature(self):
+ simplified_signatures = []
+ for column_signature_index, column_signature in self.column_signatures.iteritems():
+ simplified_signatures.append([column_signature.serial_type])
+ return simplified_signatures
+
+ def update(self, record):
+
+ self.count += 1
+
+ record_columns = record.record_columns
+
+        # Check the length of each (we assume the order relative to each other is the same)
+ if len(self.column_signatures) != len(record_columns):
+ log_message = "The length of column signatures: {} does not match record column length from record: {} " \
+ "for table row signature with record serial type signature: {}."
+ log_message = log_message.format(len(self.column_signatures), len(record_columns),
+ self.record_serial_type_signature)
+ self._logger.error(log_message)
+ raise ValueError(log_message)
+
+ for index in self.column_signatures:
+
+ serial_type = record_columns[index].serial_type
+ column_signature = self.column_signatures[index]
+
+ if isinstance(column_signature, ColumnFixedLengthSignature):
+
+ if column_signature.serial_type != serial_type:
+ log_message = "Column signature serial type: {} does not match record serial type: {} " \
+ "for table row signature with record serial type signature: {}."
+ log_message = log_message.format(column_signature.serial_type, serial_type,
+ self.record_serial_type_signature)
+ self._logger.error(log_message)
+ raise SignatureError(log_message)
+
+ column_signature.update(serial_type)
+
+ elif isinstance(column_signature, ColumnVariableLengthSignature):
+
+ if serial_type >= 12 and serial_type % 2 == 0:
+ if column_signature.serial_type != -1:
+ log_message = "Column signature serial type: {} does not equate to record column variable " \
+ "length serial type: {} for table row signature with record serial " \
+ "type signature: {}."
+ log_message = log_message.format(column_signature.serial_type, serial_type,
+ self.record_serial_type_signature)
+ self._logger.error(log_message)
+ raise SignatureError(log_message)
+
+ elif serial_type >= 13 and serial_type % 2 == 1:
+ if column_signature.serial_type != -2:
+ log_message = "Column signature serial type: {} does not equate to record column variable " \
+ "length serial type: {} for table row signature with record serial " \
+ "type signature: {}."
+ log_message = log_message.format(column_signature.serial_type, serial_type,
+ self.record_serial_type_signature)
+ self._logger.error(log_message)
+ raise SignatureError(log_message)
+
+ else:
+ log_message = "Invalid serial type: {} for column variable length signature " \
+ "for table row signature with record serial type signature: {}."
+ log_message = log_message.format(serial_type, self.record_serial_type_signature)
+ self._logger.error(log_message)
+ raise SignatureError(log_message)
+
+ column_signature.update(serial_type)
+
+ else:
+
+ log_message = "Invalid column signature type: {} found for table row signature with record serial " \
+ "type signature: {}."
+ log_message = log_message.format(type(column_signature), self.record_serial_type_signature)
+ self._logger.error(log_message)
+ raise SignatureError(log_message)
+
+
+class ColumnSignature(object):
+
+ __metaclass__ = ABCMeta
+
+ def __init__(self, index, name, serial_type, count=1):
+
+ """
+
+ Constructor.
+
+        Note: All columns within a signature may have different counts. This is because columns can be added in
+              SQLite. If this occurs, then columns towards the end of the rows may have fewer entries (if any) than
+              earlier columns.
+
+ :param index:
+ :param name:
+ :param serial_type:
+ :param count:
+
+ """
+
+ self._logger = getLogger(LOGGER_NAME)
+
+ self.index = index
+ self.name = name
+ self.serial_type = serial_type
+ self.count = count
+
+ """
+
+ Note: The count is the number of specific rows that were found with this serial type whereas the number of
+              rows is the total number of rows in the table this column signature is being derived from. Therefore,
+ the probability of this column signature with this serial type occurring in the particular column of
+ the table is the count/total.
+
+ """
+
+ self._number_of_rows = None
+
+ # These values are reserved and should not be found in SQLite files
+ if self.serial_type == 10 or self.serial_type == 11:
+ log_message = "Invalid serial type: {} found for column signature index: {} and name: {}."
+ log_message = log_message.format(self.serial_type, self.index, self.name)
+ self._logger.error(log_message)
+ raise ValueError(log_message)
+
+ def __repr__(self):
+ return self.__str__().encode("hex")
+
+ def __str__(self):
+ return sub("\t", "", sub("\n", " ", self.stringify()))
+
+ def stringify(self, padding=""):
+ string = padding + "Index: {}\n" \
+ + padding + "Name: {}\n" \
+ + padding + "Serial Type: {}\n" \
+ + padding + "Count: {}\n" \
+ + padding + "Number of Rows: {}\n" \
+ + padding + "Probability: {}"
+ return string.format(self.index,
+ self.name,
+ self.serial_type,
+ self.count,
+ self.number_of_rows,
+ self.probability)
+
+ @property
+ def number_of_rows(self):
+
+ """
+
+
+
+ Note: A value of None will be returned if the number of rows is not set.
+
+ :return:
+
+ """
+
+ return self._number_of_rows
+
+ @number_of_rows.setter
+ def number_of_rows(self, number_of_rows):
+ if number_of_rows <= 0 or number_of_rows < self.count:
+ log_message = "Invalid number of rows: {} for column signature index: {} and name: {}"
+ log_message = log_message.format(number_of_rows, self.index, self.name)
+ self._logger.error(log_message)
+ raise ValueError(log_message)
+ self._number_of_rows = number_of_rows
+
+ @property
+ def probability(self):
+
+ """
+
+
+
+ Note: A value of None will be returned if the number of rows is not set.
+
+ :return:
+
+ """
+
+ if self._number_of_rows:
+ return float(self.count) / self._number_of_rows
+ return None
+
+ @abstractmethod
+ def update(self, serial_type, count=None, variable_length_serial_types=None):
+ raise NotImplementedError("The abstract method update was called directly and is not implemented.")
+
+
+class ColumnFixedLengthSignature(ColumnSignature):
+
+ def __init__(self, index, name, serial_type, count=1):
+
+ super(ColumnFixedLengthSignature, self).__init__(index, name, serial_type, count)
+
+ if serial_type not in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
+ log_message = "Invalid serial type for column fixed-length signature index: {} and name: {}"
+ log_message = log_message.format(serial_type, self.index, self.name)
+ self._logger.error(log_message)
+ raise ValueError(log_message)
+
+ self.content_size = get_content_size(self.serial_type)
+
+ def stringify(self, padding=""):
+ string = "\n" + padding + "Content Size: {}"
+ string = string.format(self.content_size)
+ return super(ColumnFixedLengthSignature, self).stringify(padding) + string
+
+ def update(self, serial_type, count=1, variable_length_serial_types=None):
+
+ if serial_type != self.serial_type:
+ log_message = "Specified serial type: {} does not match column fixed-length signature serial type: {} " \
+ "index: {} and name: {}"
+ log_message = log_message.format(serial_type, self.serial_type, self.index, self.name)
+ self._logger.error(log_message)
+ raise ValueError(log_message)
+
+ if variable_length_serial_types:
+ log_message = "Variable length serial types: {} specified for column fixed-length signature " \
+ "index: {} and name: {}"
+ log_message = log_message.format(variable_length_serial_types, self.index, self.name)
+ self._logger.error(log_message)
+ raise ValueError(log_message)
+
+ self.count += count
+
+
+class ColumnVariableLengthSignature(ColumnSignature):
+
+ __metaclass__ = ABCMeta
+
+ def __init__(self, index, name, serial_type, count=1):
+
+ super(ColumnVariableLengthSignature, self).__init__(index, name, serial_type, count)
+
+ """
+
+ Note: The variable length serial types is a dictionary where:
+ variable_length_serial_types[variable length serial type] = count of variable length serial type in column
+
+ """
+
+ self.variable_length_serial_types = None
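+        # Illustrative example (assumed values): {25: 4, 37: 1} would record four TEXT values of serial type 25
+        # ((25 - 13) / 2 = 6 bytes) and one of serial type 37 ((37 - 13) / 2 = 12 bytes) observed in this column.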
+
+ def stringify(self, padding=""):
+ string = "\n" + padding + "Variable Length Serial Types: {}"
+ string = string.format(self.variable_length_serial_types)
+ return super(ColumnVariableLengthSignature, self).stringify(padding) + string
+
+ def get_variable_length_serial_type_probability(self, variable_length_serial_type):
+
+ """
+
+
+
+ Note: A value of None will be returned if the number of rows is not set.
+
+ :param variable_length_serial_type:
+
+ :return:
+
+ """
+
+ if self._number_of_rows:
+ return float(self.variable_length_serial_types[variable_length_serial_type]) / self._number_of_rows
+ return None
+
+
+class ColumnReducedVariableLengthSignature(ColumnVariableLengthSignature):
+
+ """
+
+ ColumnReducedVariableLengthSignature
+
+
+
+ Note: This class is used where the serial types for variable length signatures are reduced and therefore
+ are either -1 (for BLOB) or -2 (for TEXT).
+
+ """
+
+ def __init__(self, index, name, serial_type, count, variable_length_serial_types):
+
+ if serial_type not in [-2, -1]:
+ log_message = "Invalid serial type: {} for column reduced variable length signature index: {} and name: {}"
+            log_message = log_message.format(serial_type, index, name)
+            getLogger(LOGGER_NAME).error(log_message)
+ raise ValueError(log_message)
+
+ if not count:
+ log_message = "Count not specified for column reduced variable length signature index: {} and name: {} " \
+ "for serial type: {} and variable length serial types: {}."
+ log_message = log_message.format(index, name, serial_type, variable_length_serial_types)
+            getLogger(LOGGER_NAME).error(log_message)
+ raise ValueError(log_message)
+
+ if not variable_length_serial_types:
+ log_message = "Variable length serial types not specified for column reduced variable length signature " \
+ "index: {} and name: {} for serial type: {} and count: {}."
+ log_message = log_message.format(index, name, serial_type, count)
+            getLogger(LOGGER_NAME).error(log_message)
+ raise ValueError(log_message)
+
+ super(ColumnReducedVariableLengthSignature, self).__init__(index, name, serial_type, count)
+
+ self.variable_length_serial_types = variable_length_serial_types
+
+ def update(self, serial_type, count=None, variable_length_serial_types=None):
+
+ if serial_type != self.serial_type:
+ log_message = "Specified serial type: {} does not match column reduced variable length signature serial " \
+ "type: {} index: {} and name: {}"
+ log_message = log_message.format(serial_type, self.serial_type, self.index, self.name)
+ self._logger.error(log_message)
+ raise ValueError(log_message)
+
+ if not count:
+ log_message = "Count not specified for column reduced variable length signature index: {} and name: {} " \
+ "for serial type: {} and variable length serial types: {}."
+ log_message = log_message.format(self.index, self.name, serial_type, variable_length_serial_types)
+ self._logger.error(log_message)
+ raise ValueError(log_message)
+
+ if not variable_length_serial_types:
+ log_message = "Variable length serial types not specified for column reduced variable length signature " \
+ "index: {} and name: {} for serial type: {} and count: {}."
+ log_message = log_message.format(self.index, self.name, serial_type, count)
+ self._logger.error(log_message)
+ raise ValueError(log_message)
+
+ self.count += count
+
+ for variable_length_serial_type, variable_length_serial_type_count in variable_length_serial_types.iteritems():
+ if variable_length_serial_type in self.variable_length_serial_types:
+ self.variable_length_serial_types[variable_length_serial_type] += variable_length_serial_type_count
+ else:
+ self.variable_length_serial_types[variable_length_serial_type] = variable_length_serial_type_count
+
+
+class ColumnNonReducedVariableLengthSignature(ColumnVariableLengthSignature):
+
+ """
+
+ ColumnNonReducedVariableLengthSignature
+
+
+
+ Note: This class is used where the serial types for variable length signatures are not reduced and therefore
+          are greater than or equal to 12.
+
+ """
+
+ def __init__(self, index, name, serial_type):
+
+ if serial_type < 12:
+ log_message = "Invalid serial type: {} for column non-reduced variable length signature index: {} " \
+ "and name: {}"
+            log_message = log_message.format(serial_type, index, name)
+            getLogger(LOGGER_NAME).error(log_message)
+ raise ValueError(log_message)
+
+ super(ColumnNonReducedVariableLengthSignature, self).__init__(index, name, serial_type)
+
+ self.variable_length_serial_types = {}
+
+ # A BLOB that is (N-12)/2 bytes in length
+ if self.serial_type >= 12 and self.serial_type % 2 == 0:
+ self.variable_length_serial_types[self.serial_type] = 1
+ self.serial_type = -1
+
+ # A string in the database encoding and is (N-13)/2 bytes in length (The nul terminator is omitted)
+ elif self.serial_type >= 13 and self.serial_type % 2 == 1:
+ self.variable_length_serial_types[self.serial_type] = 1
+ self.serial_type = -2
+
+ else:
+ log_message = "Invalid serial type: {} for column non-reduced variable length signature index: {} and " \
+ "name: {}"
+ log_message = log_message.format(serial_type, self.index, self.name)
+ self._logger.error(log_message)
+ raise ValueError(log_message)
+
+ def update(self, serial_type, count=None, variable_length_serial_types=None):
+
+ if serial_type < 12:
+ log_message = "Invalid serial type: {} for column non-reduced variable length signature index: {} " \
+ "and name: {}"
+ log_message = log_message.format(serial_type, self.index, self.name)
+ self._logger.error(log_message)
+ raise ValueError(log_message)
+
+ if count:
+ log_message = "Count specified for column non-reduced variable length signature index: {} and name: {} " \
+ "for serial type: {} and variable length serial types: {}."
+ log_message = log_message.format(self.index, self.name, serial_type, variable_length_serial_types)
+ self._logger.error(log_message)
+ raise ValueError(log_message)
+
+ if variable_length_serial_types:
+ log_message = "Variable length serial types specified for column non-reduced variable length signature " \
+ "index: {} and name: {} for serial type: {} and count: {}."
+ log_message = log_message.format(self.index, self.name, serial_type, count)
+ self._logger.error(log_message)
+ raise ValueError(log_message)
+
+ self.count += 1
+
+ # A BLOB that is (N-12)/2 bytes in length
+ if serial_type >= 12 and serial_type % 2 == 0:
+
+ if self.serial_type != -1:
+ log_message = "Specified serial type: {} does not equate to column non-reduced variable length " \
+ "signature serial type: {} index: {} and name: {}"
+ log_message = log_message.format(serial_type, self.serial_type, self.index, self.name)
+ self._logger.error(log_message)
+ raise ValueError(log_message)
+
+ # A string in the database encoding and is (N-13)/2 bytes in length (The nul terminator is omitted)
+ elif serial_type >= 13 and serial_type % 2 == 1:
+
+ if self.serial_type != -2:
+ log_message = "Specified serial type: {} does not equate to column non-reduced variable length " \
+ "signature serial type: {} index: {} and name: {}"
+ log_message = log_message.format(serial_type, self.serial_type, self.index, self.name)
+ self._logger.error(log_message)
+ raise ValueError(log_message)
+
+ else:
+
+ log_message = "Invalid serial type: {} for column non-reduced variable length signature index: {} and " \
+ "name: {}"
+ log_message = log_message.format(serial_type, self.index, self.name)
+ self._logger.error(log_message)
+ raise ValueError(log_message)
+
+ if serial_type in self.variable_length_serial_types:
+ self.variable_length_serial_types[serial_type] += 1
+ else:
+ self.variable_length_serial_types[serial_type] = 1
diff --git a/sqlite_dissect/carving/utilities.py b/sqlite_dissect/carving/utilities.py
new file mode 100644
index 0000000..78a3481
--- /dev/null
+++ b/sqlite_dissect/carving/utilities.py
@@ -0,0 +1,387 @@
+from binascii import hexlify
+from binascii import unhexlify
+from logging import getLogger
+from sqlite_dissect.constants import BLOB_SIGNATURE_IDENTIFIER
+from sqlite_dissect.constants import LOGGER_NAME
+from sqlite_dissect.constants import TEXT_SIGNATURE_IDENTIFIER
+from sqlite_dissect.exception import CarvingError
+from sqlite_dissect.exception import InvalidVarIntError
+from sqlite_dissect.utilities import decode_varint
+
+"""
+
+utilities.py
+
+This script holds carving utility functions for reference by the SQLite carving module.
+
+This script holds the following function(s):
+decode_varint_in_reverse(byte_array, offset, max_varint_length=9)
+calculate_body_content_size(serial_type_header)
+calculate_serial_type_definition_content_length_min_max(simplified_serial_types=None, allowed_varint_length=5)
+calculate_serial_type_varint_length_min_max(simplified_serial_types)
+generate_regex_for_simplified_serial_type(simplified_serial_type)
+generate_signature_regex(signature, skip_first_serial_type=False)
+get_content_size(serial_type)
+
+"""
+
+
+def decode_varint_in_reverse(byte_array, offset, max_varint_length=9):
+
+ """
+
+    This function will move backwards through a byte array trying to decode a varint in reverse. An
+    InvalidVarIntError will be raised if a varint is not found by the algorithm used in this function. The calling
+    logic should handle this case, which is likely to be encountered in the context of carving.
+
+ Note: This cannot determine if the field being parsed was originally a varint or not and may give false positives.
+ Please keep this in mind when calling this function.
+
+ Note: If the array runs out of bytes while parsing in reverse, the currently determined varint will be returned.
+
+ Note: Since the parsing starts from the left of the offset specified, the resulting byte string that represents
+ this varint can be determined by byte_array[varint_relative_offset:offset]. The length of the varint
+ in bytes can be determined likewise either from the len() of the above or offset - varint_relative_offset.
+
+ :param byte_array: bytearray The byte array to parse for the varint in reverse.
+ :param offset: int The offset to move backwards from. The offset specified is not included in the parsing and the
+ algorithm starts with the last byte of the varint at offset - 1. If you want to start at the
+ end of the byte array then the offset should be the length of the byte array (where the offset
+ would refer to a non-existing index in the array).
+ :param max_varint_length: int The maximum number of varint bytes to go back in reverse. The default is 9 since
+ this is the maximum number of bytes a varint can be.
+
+ :return:
+
+ :raise: InvalidVarIntError: If a varint is not determined while parsing the byte array in reverse using the
+ algorithm in this function. This error is not logged as an error but rather a
+ debug statement since it is very likely to occur during carving and should be handled
+ appropriately.
+
+ """
+
+ if offset > len(byte_array):
+ log_message = "The offset: {} is greater than the size of the byte array: {} for the bytes: {}."
+ log_message = log_message.format(offset, len(byte_array), hexlify(byte_array))
+ getLogger(LOGGER_NAME).error(log_message)
+ raise ValueError(log_message)
+
+ unsigned_integer_value = 0
+ varint_inverted_relative_offset = 0
+
+ varint_byte = ord(byte_array[offset - 1 - varint_inverted_relative_offset:offset - varint_inverted_relative_offset])
+ varint_byte &= 0x7f
+ unsigned_integer_value |= varint_byte
+ varint_inverted_relative_offset += 1
+
+ while offset - varint_inverted_relative_offset - 1 >= 0:
+
+ if varint_inverted_relative_offset > max_varint_length:
+
+ """
+
+            Since this exception is not considered an important exception to log as an error, it will be logged
+            as a debug statement. There is a good chance of this use case occurring and it is even expected during
+            carving.
+
+ """
+
+ log_message = "A varint was not determined from byte array: {} starting at offset: {} in reverse."
+ log_message = log_message.format(byte_array, offset)
+ getLogger(LOGGER_NAME).debug(log_message)
+            raise InvalidVarIntError(log_message)
+
+ varint_byte = ord(byte_array[offset - 1 - varint_inverted_relative_offset:
+ offset - varint_inverted_relative_offset])
+ msb_set = varint_byte & 0x80
+ if msb_set:
+ varint_byte &= 0x7f
+ varint_byte <<= (7 * varint_inverted_relative_offset)
+ unsigned_integer_value |= varint_byte
+ varint_inverted_relative_offset += 1
+ else:
+ break
+
+ varint_relative_offset = offset - varint_inverted_relative_offset
+
+ return unsigned_integer_value, varint_relative_offset
+
+
+def calculate_body_content_size(serial_type_header):
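+    # Sums the content sizes of every serial type varint in the given serial type header. Illustrative example
+    # (assumed input): the header b'\x02\x1b' describes a 2-byte integer followed by a TEXT value of
+    # (0x1B - 13) / 2 = 7 bytes, so this returns 2 + 7 = 9.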
+ body_content_size = 0
+ start_offset = 0
+ while start_offset < len(serial_type_header):
+ serial_type, serial_type_varint_length = decode_varint(serial_type_header, start_offset)
+ body_content_size += get_content_size(serial_type)
+ start_offset += serial_type_varint_length
+ if start_offset > len(serial_type_header):
+ log_message = "Invalid start offset: {} retrieved from serial type header of length: {}: {}."
+ log_message = log_message.format(start_offset, len(serial_type_header), hexlify(serial_type_header))
+ getLogger(LOGGER_NAME).error(log_message)
+ raise CarvingError(log_message)
+ return body_content_size
+
+
+def calculate_serial_type_definition_content_length_min_max(simplified_serial_types=None, allowed_varint_length=5):
+
+ content_max_length = int('1111111' * allowed_varint_length, 2)
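+    # int('1111111' * n, 2) equals 2 ** (7 * n) - 1, the largest value an n-byte varint can encode, since each
+    # varint byte contributes 7 payload bits.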
+
+ if not simplified_serial_types:
+ return 0, content_max_length
+
+ serial_type_definition_content_length_min = content_max_length
+ serial_type_definition_content_length_max = 0
+
+ for simplified_serial_type in simplified_serial_types:
+ if simplified_serial_type in [BLOB_SIGNATURE_IDENTIFIER, TEXT_SIGNATURE_IDENTIFIER]:
+ serial_type_definition_content_length_min = min(serial_type_definition_content_length_min, 1)
+ serial_type_definition_content_length_max = max(serial_type_definition_content_length_max,
+ content_max_length)
+ else:
+ serial_type_content_length = get_content_size(simplified_serial_type)
+ serial_type_definition_content_length_min = min(serial_type_definition_content_length_min,
+ serial_type_content_length)
+ serial_type_definition_content_length_max = max(serial_type_definition_content_length_max,
+ serial_type_content_length)
+
+ return serial_type_definition_content_length_min, serial_type_definition_content_length_max
+
+
+def calculate_serial_type_varint_length_min_max(simplified_serial_types):
+
+ serial_type_varint_length_min = 5
+ serial_type_varint_length_max = 1
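+    # The reduced BLOB (-1) and TEXT (-2) identifiers stand in for variable length serial types, whose serial type
+    # varint is treated by this function as spanning 1 to 5 bytes; every other simplified serial type fits in a
+    # single varint byte.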
+
+ for simplified_serial_type in simplified_serial_types:
+
+ if simplified_serial_type in [BLOB_SIGNATURE_IDENTIFIER, TEXT_SIGNATURE_IDENTIFIER]:
+ serial_type_varint_length_min = min(serial_type_varint_length_min, 1)
+            serial_type_varint_length_max = max(serial_type_varint_length_max, 5)
+ else:
+ serial_type_varint_length_min = min(serial_type_varint_length_min, 1)
+            serial_type_varint_length_max = max(serial_type_varint_length_max, 1)
+
+ return serial_type_varint_length_min, serial_type_varint_length_max
+
+
+def generate_regex_for_simplified_serial_type(simplified_serial_type):
+
+ """
+
+
+
+ Note: Right now 9 byte varints are not supported in the regular expressions generated for blob and text storage
+ classes.
+
+ :param simplified_serial_type:
+
+ :return:
+
+ """
+
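+    # The TEXT (-2) and BLOB (-1) patterns below match a serial type varint as either a single byte at or above the
+    # one-byte minimum for that storage class (0x0D for TEXT, 0x0C for BLOB) or a multi-byte varint: one to seven
+    # continuation bytes with the high bit set followed by a terminating byte.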
+    if simplified_serial_type == -2:
+        return "(?:[\x0D-\x7F]|[\x80-\xFF]{1,7}[\x00-\x7F])"
+    elif simplified_serial_type == -1:
+        return "(?:[\x0C-\x7F]|[\x80-\xFF]{1,7}[\x00-\x7F])"
+ elif 0 <= simplified_serial_type <= 9:
+ return unhexlify("0{}".format(simplified_serial_type))
+ else:
+ log_message = "Unable to generate regular expression for simplified serial type: {}."
+ log_message = log_message.format(simplified_serial_type)
+ getLogger(LOGGER_NAME).error(log_message)
+ raise CarvingError(log_message)
+
+
+def generate_signature_regex(signature, skip_first_serial_type=False):
+
+ """
+
+ This function will generate the regular expression for a particular signature sent in derived from a Signature
+ class. For instance, the signature should be in list form as the simplified signature, simplified schema
+ signature, etc.
+
+    The skip first serial type field will omit the first serial type from the regular expression. This better
+    supports carving of freeblocks since the first 4 bytes of the entry are overwritten and the fourth of those bytes
+    could contain the first serial type byte in the header. Leaving this serial type out provides better accuracy for
+    determining deleted entries in freeblocks.
+
+ Note: There may be issues if there is only one field either in the signature or left in the signature after the
+ first serial type is skipped, if specified.
+
+ Note: There is also the case of the first serial type being a varint which needs to be addressed.
+
+ :param signature:
+ :param skip_first_serial_type:
+
+ :return:
+
+ """
+
+ regex = ""
+
+ if skip_first_serial_type:
+ signature = signature[1:]
+
+ for column_serial_type_array in signature:
+
+ number_of_possible_serial_types = len(column_serial_type_array)
+
+ if number_of_possible_serial_types == 1:
+
+ serial_type = column_serial_type_array[0]
+ serial_type_regex = generate_regex_for_simplified_serial_type(serial_type)
+ regex += serial_type_regex
+
+ elif 1 < number_of_possible_serial_types < 13:
+
+ """
+
+            The number of possible serial types is in the range of 1 to 12. Since the case of just a single serial
+            type is handled above, this portion accounts for between 2 and 12 possible serial types for a column.
+            These can be the following 12 serial types: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -2.
+
+ """
+
+ basic_serial_type_regex = ""
+ blob_regex = ""
+ text_regex = ""
+
+ for column_serial_type in column_serial_type_array:
+ if column_serial_type == -1:
+ blob_regex = generate_regex_for_simplified_serial_type(column_serial_type)
+ elif column_serial_type == -2:
+ text_regex = generate_regex_for_simplified_serial_type(column_serial_type)
+ else:
+ basic_serial_type_regex += generate_regex_for_simplified_serial_type(column_serial_type)
+
+ if blob_regex or text_regex:
+
+ if basic_serial_type_regex:
+ basic_serial_type_regex = "[{}]".format(basic_serial_type_regex)
+
+ if blob_regex and not text_regex:
+
+ if not basic_serial_type_regex:
+ log_message = "No basic serial type regular expression found when multiple column serial " \
+ "types were defined with a blob regular expression of: {} and no text regular " \
+ "expression in the signature: {} where the skip first serial type was set to: {}."
+ log_message = log_message.format(blob_regex, signature, skip_first_serial_type)
+ getLogger(LOGGER_NAME).error(log_message)
+ raise CarvingError(log_message)
+
+ regex += "(?:{}|{})".format(basic_serial_type_regex, blob_regex)
+
+ elif not blob_regex and text_regex:
+
+ if not basic_serial_type_regex:
+ log_message = "No basic serial type regular expression found when multiple column serial " \
+ "types were defined with no blob regular expression and a text regular " \
+ "expression of: {} in the signature: {} where the skip first serial type " \
+ "was set to: {}."
+ log_message = log_message.format(text_regex, signature, skip_first_serial_type)
+ getLogger(LOGGER_NAME).error(log_message)
+ raise CarvingError(log_message)
+
+ regex += "(?:{}|{})".format(basic_serial_type_regex, text_regex)
+
+ elif blob_regex and text_regex:
+
+ var_length_regex = blob_regex + "|" + text_regex
+ if basic_serial_type_regex:
+ regex += "(?:{}|{})".format(basic_serial_type_regex, var_length_regex)
+ else:
+ regex += "(?:{})".format(var_length_regex)
+
+ else:
+ log_message = "No appropriate regular expressions were found for basic serial type, blob, or " \
+ "text column signature types in the signature: {} where the skip first serial type " \
+ "was set to: {}."
+                    log_message = log_message.format(signature, skip_first_serial_type)
+ getLogger(LOGGER_NAME).error(log_message)
+ raise CarvingError(log_message)
+
+ else:
+
+ """
+
+ Since a blob or text regex was not found, the signatures must only be basic serial types (which are
+ considered non-variable length serial types).
+
+ """
+
+ if not basic_serial_type_regex:
+ log_message = "No basic serial type regular expression found when no variable length serial " \
+ "types were determined in the signature: {} where the skip first serial type was " \
+ "set to: {}."
+ log_message = log_message.format(signature, skip_first_serial_type)
+ getLogger(LOGGER_NAME).error(log_message)
+ raise CarvingError(log_message)
+
+ regex += "[{}]".format(basic_serial_type_regex)
+
+ else:
+
+ log_message = "Invalid number of columns in the signature: {} to generate a regular expression from " \
+ "where the skip first serial type was set to: {}."
+ log_message = log_message.format(signature, skip_first_serial_type)
+ getLogger(LOGGER_NAME).error(log_message)
+ raise CarvingError(log_message)
+
+ return regex
+
+
+def get_content_size(serial_type):
+
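+    # Maps a SQLite record serial type to the size of its content in bytes per the SQLite file format documentation.
+    # Illustrative examples: get_content_size(1) == 1, get_content_size(7) == 8, and get_content_size(18) == 3 since
+    # 18 is an even serial type >= 12 (a BLOB of (18 - 12) / 2 bytes).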
+ # NULL
+ if serial_type == 0:
+ return 0
+
+ # 8-bit twos-complement integer
+ elif serial_type == 1:
+ return 1
+
+ # Big-endian 16-bit twos-complement integer
+ elif serial_type == 2:
+ return 2
+
+ # Big-endian 24-bit twos-complement integer
+ elif serial_type == 3:
+ return 3
+
+ # Big-endian 32-bit twos-complement integer
+ elif serial_type == 4:
+ return 4
+
+ # Big-endian 48-bit twos-complement integer
+ elif serial_type == 5:
+ return 6
+
+ # Big-endian 64-bit twos-complement integer
+ elif serial_type == 6:
+ return 8
+
+ # Big-endian IEEE 754-2008 64-bit floating point number
+ elif serial_type == 7:
+ return 8
+
+ # Integer constant 0 (schema format == 4)
+ elif serial_type == 8:
+ return 0
+
+ # Integer constant 1 (schema format == 4)
+ elif serial_type == 9:
+ return 0
+
+ # A BLOB that is (N-12)/2 bytes in length
+ elif serial_type >= 12 and serial_type % 2 == 0:
+ return (serial_type - 12) / 2
+
+ # A string in the database encoding and is (N-13)/2 bytes in length. The nul terminator is omitted
+ elif serial_type >= 13 and serial_type % 2 == 1:
+ return (serial_type - 13) / 2
+
+ else:
+ log_message = "Invalid serial type: {}."
+ log_message = log_message.format(serial_type)
+ getLogger(LOGGER_NAME).error(log_message)
+ raise ValueError(log_message)
diff --git a/sqlite_dissect/constants.py b/sqlite_dissect/constants.py
new file mode 100644
index 0000000..e6f4a57
--- /dev/null
+++ b/sqlite_dissect/constants.py
@@ -0,0 +1,288 @@
+from collections import MutableMapping
+from logging import getLogger
+from re import compile
+from sys import maxunicode
+
+"""
+
+constants.py
+
+This script holds constants defined for reference by the sqlite carving library. Additionally, a class has been
+added to this script for constant enumerations.
+
+This script holds the following object(s):
+Enum(MutableMapping)
+
+"""
+
+
+LOGGER_NAME = "sqlite_dissect"
+
+
+class Enum(MutableMapping):
+
+ def __init__(self, data):
+ if isinstance(data, list):
+ self._store = {value: value for value in data}
+ elif isinstance(data, dict):
+ self._store = data
+ else:
+ log_message = "Unable to initialize enumeration for: {} with type: {}.".format(data, type(data))
+ getLogger(LOGGER_NAME).error(log_message)
+ raise ValueError(log_message)
+
+ def __getattr__(self, key):
+ return self._store[key]
+
+ def __getitem__(self, key):
+ return self._store[key]
+
+ def __setitem__(self, key, value):
+ self._store[key] = value
+
+ def __delitem__(self, key):
+ del self._store[key]
+
+ def __contains__(self, key):
+        return key in self._store
+
+ def __iter__(self):
+ return iter(self._store)
+
+ def __len__(self):
+ return len(self._store)
+
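+# Illustrative usage (assumed example): for an enumeration initialized from a list, each value maps to itself, so
+# Enum(["DATABASE", "WAL"]).WAL == "WAL" and "WAL" in Enum(["DATABASE", "WAL"]) both hold; dictionary-initialized
+# enumerations map each key to its specified value instead.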
+
+UTF_8 = "utf-8"
+UTF_16BE = "utf-16-be"
+UTF_16LE = "utf-16-le"
+
+ENDIANNESS = Enum(["BIG_ENDIAN", "LITTLE_ENDIAN"])
+
+# Supported file types
+FILE_TYPE = Enum(["DATABASE", "WAL", "WAL_INDEX", "ROLLBACK_JOURNAL"])
+
+SQLITE_3_7_0_VERSION_NUMBER = 3007000
+
+PAGE_TYPE_LENGTH = 1
+
+MASTER_PAGE_HEX_ID = b'\x53'
+TABLE_LEAF_PAGE_HEX_ID = b'\x0d'
+TABLE_INTERIOR_PAGE_HEX_ID = b'\x05'
+INDEX_LEAF_PAGE_HEX_ID = b'\x0a'
+INDEX_INTERIOR_PAGE_HEX_ID = b'\x02'
+
+PAGE_TYPE = Enum(["LOCK_BYTE", "FREELIST_TRUNK", "FREELIST_LEAF", "B_TREE_TABLE_INTERIOR", "B_TREE_TABLE_LEAF",
+ "B_TREE_INDEX_INTERIOR", "B_TREE_INDEX_LEAF", "OVERFLOW", "POINTER_MAP"])
+
+LOCK_BYTE_PAGE_START_OFFSET = 1073741824
+LOCK_BYTE_PAGE_END_OFFSET = 1073742336
+
+SQLITE_DATABASE_HEADER_LENGTH = 100
+MAGIC_HEADER_STRING = "SQLite format 3\000"
+MAGIC_HEADER_STRING_ENCODING = UTF_8
+MAXIMUM_PAGE_SIZE_INDICATOR = 1
+MINIMUM_PAGE_SIZE_LIMIT = 512
+MAXIMUM_PAGE_SIZE_LIMIT = 32768
+MAXIMUM_PAGE_SIZE = 65536
+ROLLBACK_JOURNALING_MODE = 1
+WAL_JOURNALING_MODE = 2
+MAXIMUM_EMBEDDED_PAYLOAD_FRACTION = 64
+MINIMUM_EMBEDDED_PAYLOAD_FRACTION = 32
+LEAF_PAYLOAD_FRACTION = 32
+VALID_SCHEMA_FORMATS = [1, 2, 3, 4]
+UTF_8_DATABASE_TEXT_ENCODING = 1
+UTF_16LE_DATABASE_TEXT_ENCODING = 2
+UTF_16BE_DATABASE_TEXT_ENCODING = 3
+DATABASE_TEXT_ENCODINGS = [UTF_8_DATABASE_TEXT_ENCODING,
+ UTF_16LE_DATABASE_TEXT_ENCODING,
+ UTF_16BE_DATABASE_TEXT_ENCODING]
+RESERVED_FOR_EXPANSION_REGEX = "^0{40}$"
+
+FREELIST_NEXT_TRUNK_PAGE_LENGTH = 4
+FREELIST_LEAF_PAGE_POINTERS_LENGTH = 4
+FREELIST_LEAF_PAGE_NUMBER_LENGTH = 4
+FREELIST_HEADER_LENGTH = FREELIST_NEXT_TRUNK_PAGE_LENGTH + FREELIST_LEAF_PAGE_POINTERS_LENGTH # ptr+num size
+LEAF_PAGE_HEADER_LENGTH = 8
+INTERIOR_PAGE_HEADER_LENGTH = 12
+RIGHT_MOST_POINTER_OFFSET = 8
+RIGHT_MOST_POINTER_LENGTH = 4
+CELL_POINTER_BYTE_LENGTH = 2
+LEFT_CHILD_POINTER_BYTE_LENGTH = 4
+FREEBLOCK_HEADER_LENGTH = 4
+NEXT_FREEBLOCK_OFFSET_LENGTH = 2
+FREEBLOCK_BYTE_LENGTH = 2
+PAGE_FRAGMENT_LIMIT = 60
+FIRST_OVERFLOW_PAGE_NUMBER_LENGTH = 4
+OVERFLOW_HEADER_LENGTH = 4 # This is the next overflow page number but we call it a header here
+POINTER_MAP_ENTRY_LENGTH = 5
+
+PAGE_HEADER_MODULE = "sqlite_dissect.file.database.header"
+PAGE_MODULE = "sqlite_dissect.file.database.page"
+CELL_MODULE = "sqlite_dissect.file.database.page"
+
+INTERIOR_PAGE_HEADER_CLASS = "InteriorPageHeader"
+LEAF_PAGE_HEADER_CLASS = "LeafPageHeader"
+
+INDEX_INTERIOR_PAGE_CLASS = "IndexInteriorPage"
+INDEX_LEAF_PAGE_CLASS = "IndexLeafPage"
+TABLE_INTERIOR_PAGE_CLASS = "TableInteriorPage"
+TABLE_LEAF_PAGE_CLASS = "TableLeafPage"
+INDEX_INTERIOR_CELL_CLASS = "IndexInteriorCell"
+INDEX_LEAF_CELL_CLASS = "IndexLeafCell"
+TABLE_INTERIOR_CELL_CLASS = "TableInteriorCell"
+TABLE_LEAF_CELL_CLASS = "TableLeafCell"
+
+FIRST_OVERFLOW_PARENT_PAGE_NUMBER = 0
+FIRST_OVERFLOW_PAGE_INDEX = 0
+FIRST_FREELIST_TRUNK_PARENT_PAGE_NUMBER = 0
+FIRST_FREELIST_TRUNK_PAGE_INDEX = 0
+
+CELL_LOCATION = Enum({"ALLOCATED_SPACE": "Allocated Space",
+ "UNALLOCATED_SPACE": "Unallocated Space",
+ "FREEBLOCK": "Freeblock"})
+
+CELL_SOURCE = Enum({"B_TREE": "B-Tree",
+ "DISPARATE_B_TREE": "Disparate B-Tree",
+ "FREELIST": "Freelist"})
+
+BLOB_SIGNATURE_IDENTIFIER = -1
+TEXT_SIGNATURE_IDENTIFIER = -2
+
+ZERO_BYTE = b'\x00'
+ALL_ZEROS_REGEX = "^0*$"
+
+SQLITE_MASTER_SCHEMA_ROOT_PAGE = 1
+MASTER_SCHEMA_COLUMN = Enum({"TYPE": 0, "NAME": 1, "TABLE_NAME": 2, "ROOT_PAGE": 3, "SQL": 4})
+MASTER_SCHEMA_ROW_TYPE = Enum({"TABLE": "table", "INDEX": "index", "VIEW": "view", "TRIGGER": "trigger"})
+MASTER_SCHEMA_NUMBER_OF_COLUMNS = 5
+
+COLUMN_DEFINITION = Enum(["COLUMN_NAME", "DATA_TYPE_NAME", "COLUMN_CONSTRAINT"])
+STORAGE_CLASS = Enum(["NULL", "INTEGER", "REAL", "TEXT", "BLOB"])
+TYPE_AFFINITY = Enum(["TEXT", "NUMERIC", "INTEGER", "REAL", "BLOB"])
+DATA_TYPE = Enum(["INT", "INTEGER", "TINYINT", "SMALLINT", "MEDIUMINT", "BIGINT",
+ "UNSIGNED_BIG_INT", "INT2", "INT8",
+ "CHARACTER_20", "VARCHAR_255", "VARYING_CHARACTER_255", "NCHAR_55",
+ "NATIVE_CHARACTER_70", "NVARCHAR_100", "TEXT", "CLOB",
+ "BLOB", "NOT_SPECIFIED",
+ "REAL", "DOUBLE", "DOUBLE_PRECISION", "FLOAT",
+ "NUMERIC", "DECIMAL_10_5", "BOOLEAN", "DATE", "DATETIME",
+ "INVALID"])
+
+CREATE_TABLE_CLAUSE = "CREATE TABLE"
+ORDINARY_TABLE_AS_CLAUSE = "AS"
+CREATE_VIRTUAL_TABLE_CLAUSE = "CREATE VIRTUAL TABLE"
+VIRTUAL_TABLE_USING_CLAUSE = "USING"
+
+CREATE_INDEX_CLAUSE = "CREATE INDEX"
+CREATE_UNIQUE_INDEX_CLAUSE = "CREATE UNIQUE INDEX"
+INDEX_ON_COMMAND = "ON"
+INDEX_WHERE_CLAUSE = "WHERE"
+
+INTERNAL_SCHEMA_OBJECT_PREFIX = "sqlite_"
+INTERNAL_SCHEMA_OBJECT_INDEX_PREFIX = "sqlite_autoindex_"
+
+COLUMN_CONSTRAINT_TYPES = Enum(["PRIMARY_KEY", "NOT NULL", "UNIQUE", "CHECK", "DEFAULT",
+ "COLLATE", "FOREIGN_KEY"])
+
+COLUMN_CONSTRAINT_PREFACES = ["CONSTRAINT", "PRIMARY", "NOT", "UNIQUE", "CHECK", "DEFAULT", "COLLATE", "REFERENCES"]
+TABLE_CONSTRAINT_PREFACES = ["CONSTRAINT", "PRIMARY", "UNIQUE", "CHECK", "FOREIGN"]
+
+"""
+
+Note: For TABLE_CONSTRAINT_TYPE, the PRIMARY_KEY and UNIQUE should be handled the same in respect to this library.
+
+"""
+
+TABLE_CONSTRAINT_TYPES = Enum(["PRIMARY_KEY", "UNIQUE", "CHECK", "FOREIGN_KEY"])
+
+POINTER_MAP_B_TREE_ROOT_PAGE_TYPE = b'\x01'
+POINTER_MAP_FREELIST_PAGE_TYPE = b'\x02'
+POINTER_MAP_OVERFLOW_FIRST_PAGE_TYPE = b'\x03'
+POINTER_MAP_OVERFLOW_FOLLOWING_PAGE_TYPE = b'\x04'
+POINTER_MAP_B_TREE_NON_ROOT_PAGE_TYPE = b'\x05'
+POINTER_MAP_PAGE_TYPES = [POINTER_MAP_B_TREE_ROOT_PAGE_TYPE,
+ POINTER_MAP_FREELIST_PAGE_TYPE,
+ POINTER_MAP_OVERFLOW_FIRST_PAGE_TYPE,
+ POINTER_MAP_OVERFLOW_FOLLOWING_PAGE_TYPE,
+ POINTER_MAP_B_TREE_NON_ROOT_PAGE_TYPE]
+
+WAL_FILE_POSTFIX = "-wal"
+WAL_HEADER_LENGTH = 32
+WAL_MAGIC_NUMBER_BIG_ENDIAN = 0x377F0683
+WAL_MAGIC_NUMBER_LITTLE_ENDIAN = 0x377F0682
+WAL_FILE_FORMAT_VERSION = 3007000
+WAL_FRAME_HEADER_LENGTH = 24
+
+WAL_INDEX_POSTFIX = "-shm"
+WAL_INDEX_FILE_FORMAT_VERSION = 3007000
+WAL_INDEX_NUMBER_OF_SUB_HEADERS = 2
+WAL_INDEX_SUB_HEADER_LENGTH = 48
+WAL_INDEX_CHECKPOINT_INFO_LENGTH = 24
+WAL_INDEX_LOCK_RESERVED_LENGTH = 16
+WAL_INDEX_HEADER_LENGTH = WAL_INDEX_NUMBER_OF_SUB_HEADERS * WAL_INDEX_SUB_HEADER_LENGTH + \
+ WAL_INDEX_CHECKPOINT_INFO_LENGTH + WAL_INDEX_LOCK_RESERVED_LENGTH
+WAL_INDEX_NUMBER_OF_FRAMES_BACKFILLED_IN_DATABASE_LENGTH = 4
+
+"""
+
+Note: The reader mark size is referred to as the Maximum xShmLock index (SQLITE_SHM_NLOCK) - 3 in the sqlite code.
+
+"""
+WAL_INDEX_READER_MARK_SIZE = 5
+WAL_INDEX_READER_MARK_LENGTH = 4
+
+ROLLBACK_JOURNAL_ALL_CONTENT_UNTIL_END_OF_FILE = -1
+ROLLBACK_JOURNAL_POSTFIX = "-journal"
+ROLLBACK_JOURNAL_HEADER_LENGTH = 28
+ROLLBACK_JOURNAL_HEADER_HEX_STRING = 'd9d505f920a163d7'
+ROLLBACK_JOURNAL_HEADER_ALL_CONTENT = 'ffffffff'
+
+BASE_VERSION_NUMBER = 0
+COMMIT_RECORD_BASE_VERSION_NUMBER = BASE_VERSION_NUMBER + 1
+
+"""
+
+The DATABASE_HEADER_VERSIONED_FIELDS covers all fields that may change from database header to database header
+throughout the write ahead log. This may not be a definitive list of fields that can change.
+
+"""
+DATABASE_HEADER_VERSIONED_FIELDS = Enum({"FILE_CHANGE_COUNTER": "file_change_counter",
+ "VERSION_VALID_FOR_NUMBER": "version_valid_for_number",
+ "DATABASE_SIZE_IN_PAGES": "database_size_in_pages",
+ "FIRST_FREELIST_TRUNK_PAGE_NUMBER": "first_freelist_trunk_page_number",
+ "NUMBER_OF_FREE_LIST_PAGES": "number_of_freelist_pages",
+ "LARGEST_ROOT_B_TREE_PAGE_NUMBER": "largest_root_b_tree_page_number",
+ "SCHEMA_COOKIE": "schema_cookie",
+ "SCHEMA_FORMAT_NUMBER": "schema_format_number",
+ "DATABASE_TEXT_ENCODING": "database_text_encoding",
+ "USER_VERSION": "user_version",
+ "MD5_HEX_DIGEST": "md5_hex_digest"})
+
+"""
+
+The types of output that are supported by this package.
+
+"""
+EXPORT_TYPES = Enum(["TEXT", "CSV", "SQLITE", "XLSX"])
+
+"""
+
+Below we compile a regular expression, ILLEGAL_XML_CHARACTER_PATTERN, used to check for illegal XML characters.
+
+"""
+
+_illegal_xml_characters = [(0x00, 0x08), (0x0B, 0x0C), (0x0E, 0x1F), (0x7F, 0x84), (0x86, 0x9F),
+ (0xD800, 0xDFFF), (0xFDD0, 0xFDDF), (0xFFFE, 0xFFFF)]
+
+if maxunicode >= 0x10000:
+ _illegal_xml_characters.extend([(0x1FFFE, 0x1FFFF), (0x2FFFE, 0x2FFFF), (0x3FFFE, 0x3FFFF),
+ (0x4FFFE, 0x4FFFF), (0x5FFFE, 0x5FFFF), (0x6FFFE, 0x6FFFF),
+ (0x7FFFE, 0x7FFFF), (0x8FFFE, 0x8FFFF), (0x9FFFE, 0x9FFFF),
+ (0xAFFFE, 0xAFFFF), (0xBFFFE, 0xBFFFF), (0xCFFFE, 0xCFFFF),
+ (0xDFFFE, 0xDFFFF), (0xEFFFE, 0xEFFFF), (0xFFFFE, 0xFFFFF),
+ (0x10FFFE, 0x10FFFF)])
+
+_illegal_xml_ranges = ["%s-%s" % (unichr(low), unichr(high)) for (low, high) in _illegal_xml_characters]
+ILLEGAL_XML_CHARACTER_PATTERN = compile(u'[%s]' % u''.join(_illegal_xml_ranges))
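+# Illustrative usage (an assumed example): ILLEGAL_XML_CHARACTER_PATTERN.sub(u"", value) strips characters that
+# cannot be written to XML-backed output such as xlsx files.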
diff --git a/sqlite_dissect/exception.py b/sqlite_dissect/exception.py
new file mode 100644
index 0000000..8cfbc39
--- /dev/null
+++ b/sqlite_dissect/exception.py
@@ -0,0 +1,110 @@
+
+"""
+
+exception.py
+
+This script holds the custom exceptions used in this library.
+
+This script holds the following object(s):
+SqliteError(Exception)
+ParsingError(SqliteError)
+HeaderParsingError(ParsingError)
+MasterSchemaParsingError(ParsingError)
+MasterSchemaRowParsingError(MasterSchemaParsingError)
+PageParsingError(ParsingError)
+BTreePageParsingError(PageParsingError)
+CellParsingError(BTreePageParsingError)
+RecordParsingError(CellParsingError)
+VersionParsingError(ParsingError)
+DatabaseParsingError(VersionParsingError)
+WalParsingError(VersionParsingError)
+WalFrameParsingError(WalParsingError)
+WalCommitRecordParsingError(WalParsingError)
+SignatureError(SqliteError)
+CarvingError(SqliteError)
+CellCarvingError(CarvingError)
+InvalidVarIntError(CarvingError)
+OutputError(SqliteError)
+ExportError(SqliteError)
+
+"""
+
+
+class SqliteError(Exception):
+ pass
+
+
+class ParsingError(SqliteError):
+ pass
+
+
+class HeaderParsingError(ParsingError):
+ pass
+
+
+class MasterSchemaParsingError(ParsingError):
+ pass
+
+
+class MasterSchemaRowParsingError(MasterSchemaParsingError):
+ pass
+
+
+class PageParsingError(ParsingError):
+ pass
+
+
+class BTreePageParsingError(PageParsingError):
+ pass
+
+
+class CellParsingError(BTreePageParsingError):
+ pass
+
+
+class RecordParsingError(CellParsingError):
+ pass
+
+
+class VersionParsingError(ParsingError):
+ pass
+
+
+class DatabaseParsingError(VersionParsingError):
+ pass
+
+
+class WalParsingError(VersionParsingError):
+ pass
+
+
+class WalFrameParsingError(WalParsingError):
+ pass
+
+
+class WalCommitRecordParsingError(WalParsingError):
+ pass
+
+
+class SignatureError(SqliteError):
+ pass
+
+
+class CarvingError(SqliteError):
+ pass
+
+
+class CellCarvingError(CarvingError):
+ pass
+
+
+class InvalidVarIntError(CarvingError):
+ pass
+
+
+class OutputError(SqliteError):
+ pass
+
+
+class ExportError(SqliteError):
+ pass
diff --git a/sqlite_dissect/export/README.md b/sqlite_dissect/export/README.md
new file mode 100644
index 0000000..5fa84ce
--- /dev/null
+++ b/sqlite_dissect/export/README.md
@@ -0,0 +1,165 @@
+
+# sqlite_dissect.export
+
+This package will have scripts for writing results from the SQLite carving framework to files such
+as csv, sqlite, and so on.
+
+- csv_export.py
+- sqlite_export.py
+- text_export.py
+- xlsx_export.py
+
+TODO items for the "export" package:
+
+- [ ] Finish UML class diagrams.
+- [ ] Create an interface/super class for exporters to extend in order to simplify interaction with them.
+- [ ] Redo the exporters to allow multiple exports instead of having to re-parse the file each time.
+- [ ] Incorporate a base export class that takes in a version history and set of exporters.
+- [ ] Normalize the inputs of the exporters so that they address postfix and file names similarly (ex. .csv postfix).
+- [ ] Check inconsistencies among exporters on overwriting or renaming files (also enter/exit methodology).
+- [ ] Investigate pyexcel as a possible alternative to openpyxl for writing xlsx files and possibly csv files.
+
+
+
+### csv_export.py
+
+This script holds the objects used for exporting results of the SQLite carving framework to csv files.
+
+This script holds the following object(s):
+- VersionCsvExporter(object)
+- CommitCsvExporter(object)
+
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+- [ ] Better exception handling when working with python and SQLite carving objects.
+- [ ] Address superclass/subclass structure.
+- [ ] Augment classes to not have to continuously open and close the file (maybe by using the "with" syntax).
+- [ ] Work on fixing up column headers and hard coded values in columns.
+- [ ] Fix the "column definitions" for names once implemented in b-tree index pages.
+- [ ] Use cases if empty tables and no carvable rows which result in no files?
+- [ ] Use of "iso_" like in the sqlite_export for internal schema object indexes?
+- [ ] Figure out naming conventions (or how to handle) the "Row ID" vs the integer primary key which is NULL.
+- [ ] Do not overwrite files but instead move them to a different name as in the SQLite and text exporters?
+- [ ] Investigate how other applications handle different database text encodings in reference to output.
+- [ ] Investigate decoding and re-encoding effects on carved entries.
+- [ ] Handle the "=" use case better than just replacing with a space.
+- [ ] Investigate why blob objects show up as isinstance of str objects.
+ ##### VersionCsvExporter Class
+ - [ ] Check virtual table rows for any use cases that could cause errors when writing.
+ - [ ] Address use cases with files, directories, multiple files, etc.
+ - [ ] Check if file or directory exists, etc.
+ - [ ] Figure out a better way to handle the carved records.
+ - [ ] Check the carved records dictionary that all carved records are accounted for.
+ - [ ] Fix the carved records once the carving package has been fixed.
+ - [ ] Address the located/carved/status status of the entries.
+ - [ ] Figure out a better way to calculate absolute offsets in write functions better.
+ - [ ] Fix the "Unknown" status of freeblocks and unallocated space carved entries.
+ - [ ] Either note or let the user control overwrite/append mode functionality
+ - [ ] Handle issues with truncation of carved entries (partial records).
+ - [ ] Account for truncated carved entries (status?) and remove NULL for values if truncated.
+ - [ ] _write_b_tree_index_leaf_records: Check how index interior/leaf pages work with records.
+ ##### CommitCsvExporter Class
+ - [ ] _write_cells: Address the use of "NULL" vs None in SQLite for cells.
+ - [ ] write_commit: Remove the master schema entry argument?
+ - [ ] write_commit: Handle the b-tree table interior page better since it is only for journal files.
+
+
+
+### sqlite_export.py
+
+This script holds the objects used for exporting results of the SQLite carving framework to SQLite files.
+
+>Note:
+>
+> During development this script was written testing and using SQLite version 3.9.2. The pysqlite version
+> was 2.6.0. Keep in mind that sqlite3.version gives version information on the pysqlite sqlite interface code,
+> whereas sqlite3.sqlite_version gives the actual version of the SQLite driver that is used.
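+>
+> A quick way to check both at runtime (illustrative snippet, not part of this package):
+>
+>     import sqlite3
+>     print(sqlite3.version)         # version of the pysqlite interface code
+>     print(sqlite3.sqlite_version)  # version of the underlying SQLite library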
+
+This script holds the following object(s):
+- CommitSqliteExporter(object)
+
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+- [ ] Better exception handling when working with python and SQLite carving objects.
+- [ ] Implement a version form similar to the VersionCsvExporter.
+- [ ] Work on fixing up column headers and hard coded values in columns.
+- [ ] Fix the "column definitions" for names once implemented in b-tree index pages.
+- [ ] Use cases if empty tables and no carvable rows which result in no files?
+- [ ] Investigate differences in efficiency in respect to inserting one or many cells (rows) at a time.
+- [ ] Figure out number of columns instead of pulling out the length of each cell over and over again.
+- [ ] Empty tables or those with no "updated commits" do not show up in the file. Should empty tables be created?
+- [ ] Create a constant for "iso_" for internal schema object indexes?
+- [ ] Figure out naming conventions (or how to handle) the "Row ID" vs the integer primary key which is NULL.
+- [ ] Investigate how other applications handle different database text encodings in reference to output.
+- [ ] Consolidate documentation information so that it is not repeated.
+ ##### CommitSqliteExporter Class:
+ -[ ] _write_cells: Address the use of "NULL" vs None in SQLite for cells.
+ -[ ] _write_cells: Address the use case above with the advent of tables with added columns.
+ -[ ] _write_cells: Clean up coding of the for loop for writing cell record column values.
+ -[ ] _write_cells: Handle the failing "str" encodings instead of just setting in a buffer.
+ -[ ] write_commit: Remove the master schema entry argument?
+ -[ ] write_commit: Figure out a way to handle additional columns other than a "sd_" preface.
+ -[ ] write_commit: Address issues that may be caused by prefacing additional columns with "sd_".
+
+
+
+### text_export.py
+
+This script holds the objects used for exporting results of the SQLite carving framework to text files.
+
+This script holds the following object(s):
+- CommitConsoleExporter(object)
+- CommitTextExporter(object)
+
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+- [ ] Better exception handling when working with python and SQLite carving objects.
+- [ ] Implement a version form similar to the VersionCsvExporter.
+- [ ] Work on fixing up column headers and hard coded values in columns.
+- [ ] Fix the "column definitions" for names once implemented in b-tree index pages.
+- [ ] Use cases if empty tables and no carvable rows which result in no files?
+- [ ] Use of "iso_" like in the sqlite_export for internal schema object indexes?
+- [ ] Figure out naming conventions (or how to handle) the "Row ID" vs the integer primary key which is NULL.
+- [ ] Investigate how other applications handle different database text encodings in reference to output.
+- [ ] Empty tables or those with no "updated commits" do not show up in the file. Should empty tables be ignored?
+ ##### CommitTextExporter Class:
+ -[ ] _write_cells: Address the use of "NULL" vs None in SQLite for cells.
+ -[ ] write_header: Remove the master schema entry argument?
+
+
+
+### xlsx_export.py
+
+This script holds the objects used for exporting results of the SQLite carving framework to xlsx files.
+
+This script holds the following object(s):
+- CommitXlsxExporter(object)
+
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+- [ ] Better exception handling when working with python and SQLite carving objects.
+- [ ] Address superclass/subclass structure (the CommitXlsxExporter shares a lot with the CommitCsvExporter).
+- [ ] Implement a version form similar to the VersionCsvExporter.
+- [ ] Work on fixing up column headers and hard coded values in columns.
+- [ ] Fix the "column definitions" for names once implemented in b-tree index pages.
+- [ ] Use cases if empty tables and no carvable rows which result in no files?
+- [ ] Use of "iso_" like in the sqlite_export for internal schema object indexes?
+- [ ] Figure out naming conventions (or how to handle) the "Row ID" vs the integer primary key which is NULL.
+- [ ] Investigate decoding and re-encoding effects on carved entries.
+- [ ] Investigate how other applications handle different database text encodings in reference to output.
+ ##### CommitXlsxExporter Class:
+ - [ ] Document and address issues with encoding of unicode.
+ - [ ] Document and address issues with the 31 max length sheet names (ie. the max 10 similar names).
+ - [ ] write_commit: Remove the master schema entry argument?
+ - [ ] _write_cells: Address the use of "NULL" vs None in SQLite for cells.
+ - [ ] _write_cells: Handle the "=" use case better than just replacing with a space.
+ - [ ] _write_cells: Investigate why blob objects show up as isinstance of str objects.
+ - [ ] _write_cells: Check the operation is "Carved" when decoding text values with "replace".
diff --git a/sqlite_dissect/export/__init__.py b/sqlite_dissect/export/__init__.py
new file mode 100644
index 0000000..7bb811a
--- /dev/null
+++ b/sqlite_dissect/export/__init__.py
@@ -0,0 +1,11 @@
+
+"""
+
+__init__.py
+
+This init script will initialize any needed logic for this package.
+
+This package will have scripts for writing results from the SQLite carving framework to files such
+as csv, sqlite, and so on.
+
+"""
diff --git a/sqlite_dissect/export/csv_export.py b/sqlite_dissect/export/csv_export.py
new file mode 100644
index 0000000..e7a0efd
--- /dev/null
+++ b/sqlite_dissect/export/csv_export.py
@@ -0,0 +1,674 @@
+from csv import QUOTE_ALL
+from csv import writer
+from logging import DEBUG
+from logging import getLogger
+from os.path import basename
+from os.path import normpath
+from os.path import sep
+from re import sub
+from sqlite_dissect.constants import ILLEGAL_XML_CHARACTER_PATTERN
+from sqlite_dissect.constants import LOGGER_NAME
+from sqlite_dissect.constants import MASTER_SCHEMA_ROW_TYPE
+from sqlite_dissect.constants import PAGE_TYPE
+from sqlite_dissect.constants import UTF_8
+from sqlite_dissect.exception import ExportError
+from sqlite_dissect.file.database.utilities import aggregate_leaf_cells
+
+"""
+
+csv_export.py
+
+This script holds the objects used for exporting results of the SQLite carving framework to csv files.
+
+This script holds the following object(s):
+VersionCsvExporter(object)
+CommitCsvExporter(object)
+
+"""
+
+
+class VersionCsvExporter(object):
+
+ @staticmethod
+ def write_version(csv_file_name, export_directory, version, master_schema_entry_carved_records=None):
+
+ logger = getLogger(LOGGER_NAME)
+
+ if not master_schema_entry_carved_records:
+ master_schema_entry_carved_records = {}
+
+ for master_schema_entry in version.master_schema.master_schema_entries:
+
+ """
+
+ Here we only care about the master schema entries that have a root page number since ones that either
+ do not have a root page number or have a root page number of 0 do not have correlating b-trees in the
+ SQLite file and are instead either trigger types, view types, or special cases of table types such as
+ virtual tables.
+
+ """
+
+ if master_schema_entry.root_page_number:
+
+                fixed_file_name = basename(normpath(csv_file_name))
+                fixed_master_schema_name = sub(" ", "_", master_schema_entry.name)
+
+                # Build the per-table file name in a separate variable so csv_file_name is not overwritten
+                # and later master schema entries do not keep appending to an already expanded path.
+                master_schema_csv_file_name = export_directory + sep + fixed_file_name + "-" + \
+                                              fixed_master_schema_name + ".csv"
+
+                logger.info("Writing CSV file: {}.".format(master_schema_csv_file_name))
+
+                with open(master_schema_csv_file_name, "wb") as csv_file_handle:
+
+ csv_writer = writer(csv_file_handle, delimiter=',', quotechar="\"", quoting=QUOTE_ALL)
+
+ b_tree_root_page = version.get_b_tree_root_page(master_schema_entry.root_page_number)
+
+ """
+
+ Retrieve the carved records for this particular master schema entry.
+
+ """
+
+ carved_cells = []
+ if master_schema_entry.name in master_schema_entry_carved_records:
+ carved_cells = master_schema_entry_carved_records[master_schema_entry.name]
+
+ """
+
+ Below we have to account for how the pages are stored.
+
+ For the table master schema entry row type:
+ 1.) If the table is not a "without rowid" table, it will be stored on a table b-tree page with
+ row ids.
+ 2.) If the table is a "without rowid" table, it will be stored on an index b-tree page with no
+ row ids.
+
+ For the index master schema entry row type:
+ 1.) It will be stored on an index b-tree page with no row ids.
+
+ Different functions are created to write records for both table and index b-tree pages. Keep in
+                    mind that a table master schema row type may be stored on an index b-tree page depending on whether
+                    it is specified as a "without rowid" table.  All index master schema row types are stored on index
+ specified as a "without rowid" table. All index master schema row types are stored on index
+ b-tree pages.
+
+ """
+
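+                    # For example (illustrative schema statements):
+                    #   CREATE TABLE example (id INTEGER PRIMARY KEY, name TEXT)     -> table b-tree leaf pages
+                    #   CREATE TABLE example (name TEXT PRIMARY KEY) WITHOUT ROWID   -> index b-tree leaf pages
+                    #   CREATE INDEX example_name_index ON example (name)            -> index b-tree leaf pages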
+ if master_schema_entry.row_type == MASTER_SCHEMA_ROW_TYPE.TABLE:
+
+ if not master_schema_entry.without_row_id:
+
+ VersionCsvExporter._write_b_tree_table_leaf_records(csv_writer, version,
+ master_schema_entry,
+ b_tree_root_page, carved_cells)
+
+ else:
+
+ VersionCsvExporter._write_b_tree_index_leaf_records(csv_writer, version,
+ master_schema_entry,
+ b_tree_root_page, carved_cells)
+
+ elif master_schema_entry.row_type == MASTER_SCHEMA_ROW_TYPE.INDEX:
+
+ VersionCsvExporter._write_b_tree_index_leaf_records(csv_writer, version, master_schema_entry,
+ b_tree_root_page, carved_cells)
+
+ else:
+
+ log_message = "Invalid master schema entry row type: {} found for csv export on master " \
+ "schema entry name: {} table name: {} sql: {}."
+ log_message = log_message.format(master_schema_entry.row_type, master_schema_entry.name,
+ master_schema_entry.table_name, master_schema_entry.sql)
+
+ logger.warn(log_message)
+ raise ExportError(log_message)
+
+ @staticmethod
+ def _write_b_tree_index_leaf_records(csv_writer, version, master_schema_entry, b_tree_root_page, carved_cells):
+
+ """
+
+        This function will write the list of cells sent in to the csv file specified, including the metadata
+        regarding the file type, page type, and operation.
+
+ Note: The types of the data in the values can prove to be an issue here. We want to write the value out as
+ a string similarly as the text and csv outputs do for example even though it may contain invalid
+ characters. When data is sent into the openpyxl library to be written to the xml xlsx, if it is a
+ string, it is encoded into the default encoding and then checked for xml illegal characters that may
+ pose an issue when written to the xml. In order to properly check the values and write them accordingly
+ through the openpyxl library we address the following use cases for the value in order:
+              1.) If the value is None, we leave it as None.  We used to replace the None value with the string
+                  "NULL", but issues can be seen when carving cells where the value is None not because it
+                  was NULL originally in the database, but because it was unable to be parsed out when it may have
+                  actually had a value (when it was truncated).  Distinction is needed between these two use cases.
+ 2.) If the value is a bytearray (most likely originally a blob object) or a string value, we want to
+ write the value as a string. However, in order to do this for blob objects or strings that may
+ have a few bad characters in them from carving, we need to do our due diligence and make sure
+ there are no bad unicode characters and no xml illegal characters that may cause issues with
+ writing to the xlsx. In order to do this we do the following:
+ a.) We first convert the value to string if the affinity was not text, otherwise we decode
+ the value in the database text encoding. When we decode using the database text encoding,
+ we specify to "replace" characters it does not recognize in order to compensate for carved
+ rows.
+ b.) We then test encoding it to UTF-8.
+ i.) If the value successfully encodes as UTF-8 we set that as the value.
+ ii.) If the value throws an exception encoding, we have illegal unicode characters in the
+ string that need to be addressed. In order to escape these, we decode the string
+ as UTF-8 using the "replace" method to replace any illegal unicode characters
+ with '\ufffd' and set this back as the value after encoding again.
+ c.) After we have successfully set the value back to a UTF-8 compliant value, we need to check
+ the value for xml illegal characters. If any of these xml illegal characters are found,
+                              they are replaced with a space.  This behaviour may differ between the xlsx output and
+                              the text/csv output in reference to xml illegal characters, since the xlsx is written
+                              to xml and additional rules apply for certain characters.
+ d.) After all the illegal characters are removed, due to the way openpyxl determines data types
+ of particular cells, if a cell starts with "=", it is determined to be a formula and set as
+ that in the data type field for that cell. This causes issues when opening the file in excel.
+ Microsoft Excel recommends prefacing the string with a single quote character, however,
+ this only seems to be within Excel itself. You can specify the data type of the cell in
+ openpyxl, but not in the write-only mode that is being used here. In order to work around
+ this, we check if the first character of a string or bytearray is a "=" character and preface
+ that string with a space. There may be better ways to handle this such as not using the
+ write-only mode.
+ Note: Additionally to the "=" character, the "-" character has similar issues in excel.
+ However, openpyxl explicitly checks on the "=" character being the first character
+ and setting that cell to a formula and does not handle the use case of a cell starting
+ with the "-" character, so this use case is ignored.
+ 3.) If the value does not fall in one of the above use cases, we leave it as is and write it to the
+ xlsx without any modifications.
+
+        Note:  It was noticed that blob objects are typically detected as instances of str here and strings as
+               bytearray objects.  Why exactly blob objects are coming out as str objects still needs to be
+               investigated.
+
+        Note:  A comparison should be done of how other applications handle different database text encodings
+               in reference to their output.
+
+ Note: The decoding of the value in the database text encoding should only specify replace on a carved entry.
+
+ :param csv_writer:
+ :param version:
+ :param master_schema_entry:
+ :param b_tree_root_page:
+ :param carved_cells:
+
+ :return:
+
+ """
+
+ logger = getLogger(LOGGER_NAME)
+
+ number_of_cells, cells = aggregate_leaf_cells(b_tree_root_page)
+
+ if logger.isEnabledFor(DEBUG):
+ master_schema_entry_string = "The {} b-tree page with {} row type and name: {} with sql: {} " \
+                                         "has {} intact rows:"
+ master_schema_entry_string = master_schema_entry_string.format(b_tree_root_page.page_type,
+ master_schema_entry.row_type,
+ master_schema_entry.name,
+ master_schema_entry.sql, number_of_cells)
+ logger.debug(master_schema_entry_string)
+
+ """
+
+ Note: The index master schema entries are currently not fully parsed and therefore we do not have column
+ definitions in order to derive the column names from.
+
+ """
+
+ column_headers = []
+ column_headers.extend(["File Source", "Version", "Page Version", "Cell Source", "Page Number", "Location",
+ "Carved", "Status", "File Offset"])
+ logger.debug("Column Headers: {}".format(" , ".join(column_headers)))
+
+ csv_writer.writerow(column_headers)
+
+ for cell in cells.values():
+
+ cell_record_column_values = []
+
+ for record_column in cell.payload.record_columns:
+ serial_type = record_column.serial_type
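+                # Per the SQLite record format, odd serial types >= 13 denote TEXT values
+                # ((N - 13) / 2 bytes); even serial types >= 12 denote BLOB values.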
+ text_affinity = True if serial_type >= 13 and serial_type % 2 == 1 else False
+ value = record_column.value
+ if value is None:
+ pass
+ elif isinstance(value, (bytearray, str)):
+ value = value.decode(version.database_text_encoding, "replace") if text_affinity else str(value)
+                    try:
+                        # Assign the encoded value back so the csv writer receives a UTF-8 byte string,
+                        # matching the handling in the table leaf and commit exporters.
+                        value = value.encode(UTF_8)
+                    except UnicodeDecodeError:
+                        value = value.decode(UTF_8, "replace").encode(UTF_8)
+                    value = ILLEGAL_XML_CHARACTER_PATTERN.sub(" ", value)
+                    if value.startswith("="):
+                        value = ' ' + value
+ cell_record_column_values.append(value)
+
+ row = [version.file_type, cell.version_number, cell.page_version_number, cell.source, cell.page_number,
+ cell.location, False, "Complete", cell.file_offset]
+ row.extend(cell_record_column_values)
+ csv_writer.writerow(row)
+
+ if logger.isEnabledFor(DEBUG):
+ for cell in cells.values():
+ cell_record_column_values = [str(record_column.value) if record_column.value else "NULL"
+ for record_column in cell.payload.record_columns]
+ log_message = "File source: {} version: {} page version: {} cell source: {} page: {} located: {} " \
+ "carved: {} status: {} at file offset: {}: "
+ log_message = log_message.format(version.file_type, cell.version_number, cell.page_version_number,
+ cell.source, cell.page_number, cell.location, False,
+ "Complete", cell.file_offset)
+ log_message += "(" + ", ".join(cell_record_column_values) + ")"
+ logger.debug(log_message)
+
+ VersionCsvExporter._write_b_tree_table_master_schema_carved_records(csv_writer, version, carved_cells, False)
+
+ @staticmethod
+ def _write_b_tree_table_leaf_records(csv_writer, version, master_schema_entry, b_tree_root_page, carved_cells):
+
+ """
+
+        This function will write the list of cells sent in to the csv file specified, including the metadata
+        regarding the file type, page type, and operation.
+
+ Note: The types of the data in the values can prove to be an issue here. We want to write the value out as
+ a string similarly as the text and csv outputs do for example even though it may contain invalid
+ characters. When data is sent into the openpyxl library to be written to the xml xlsx, if it is a
+ string, it is encoded into the default encoding and then checked for xml illegal characters that may
+ pose an issue when written to the xml. In order to properly check the values and write them accordingly
+ through the openpyxl library we address the following use cases for the value in order:
+              1.) If the value is None, we leave it as None.  We used to replace the None value with the string
+                  "NULL", but issues can be seen when carving cells where the value is None not because it
+                  was NULL originally in the database, but because it was unable to be parsed out when it may have
+                  actually had a value (when it was truncated).  Distinction is needed between these two use cases.
+ 2.) If the value is a bytearray (most likely originally a blob object) or a string value, we want to
+ write the value as a string. However, in order to do this for blob objects or strings that may
+ have a few bad characters in them from carving, we need to do our due diligence and make sure
+ there are no bad unicode characters and no xml illegal characters that may cause issues with
+ writing to the xlsx. In order to do this we do the following:
+ a.) We first convert the value to string if the affinity was not text, otherwise we decode
+ the value in the database text encoding. When we decode using the database text encoding,
+ we specify to "replace" characters it does not recognize in order to compensate for carved
+ rows.
+ b.) We then test encoding it to UTF-8.
+ i.) If the value successfully encodes as UTF-8 we set that as the value.
+ ii.) If the value throws an exception encoding, we have illegal unicode characters in the
+ string that need to be addressed. In order to escape these, we decode the string
+ as UTF-8 using the "replace" method to replace any illegal unicode characters
+ with '\ufffd' and set this back as the value after encoding again.
+ c.) After we have successfully set the value back to a UTF-8 compliant value, we need to check
+ the value for xml illegal characters. If any of these xml illegal characters are found,
+                              they are replaced with a space.  This behaviour may differ between the xlsx output and
+                              the text/csv output in reference to xml illegal characters, since the xlsx is written
+                              to xml and additional rules apply for certain characters.
+ d.) After all the illegal characters are removed, due to the way openpyxl determines data types
+ of particular cells, if a cell starts with "=", it is determined to be a formula and set as
+ that in the data type field for that cell. This causes issues when opening the file in excel.
+ Microsoft Excel recommends prefacing the string with a single quote character, however,
+ this only seems to be within Excel itself. You can specify the data type of the cell in
+ openpyxl, but not in the write-only mode that is being used here. In order to work around
+ this, we check if the first character of a string or bytearray is a "=" character and preface
+ that string with a space. There may be better ways to handle this such as not using the
+ write-only mode.
+ Note: Additionally to the "=" character, the "-" character has similar issues in excel.
+ However, openpyxl explicitly checks on the "=" character being the first character
+ and setting that cell to a formula and does not handle the use case of a cell starting
+ with the "-" character, so this use case is ignored.
+ 3.) If the value does not fall in one of the above use cases, we leave it as is and write it to the
+ xlsx without any modifications.
+
+        Note:  It was noticed that blob objects are typically detected as instances of str here and strings as
+               bytearray objects.  Why exactly blob objects are coming out as str objects still needs to be
+               investigated.
+
+        Note:  A comparison should be done of how other applications handle different database text encodings
+               in reference to their output.
+
+ Note: The decoding of the value in the database text encoding should only specify replace on a carved entry.
+
+ :param csv_writer:
+ :param version:
+ :param master_schema_entry:
+ :param b_tree_root_page:
+ :param carved_cells:
+
+ :return:
+
+ """
+
+ logger = getLogger(LOGGER_NAME)
+
+ number_of_cells, cells = aggregate_leaf_cells(b_tree_root_page)
+
+ if logger.isEnabledFor(DEBUG):
+ master_schema_entry_string = "The {} b-tree page with {} row type and name: {} with sql: {} " \
+                                         "has {} intact rows:"
+ master_schema_entry_string = master_schema_entry_string.format(b_tree_root_page.page_type,
+ master_schema_entry.row_type,
+ master_schema_entry.name,
+ master_schema_entry.sql, number_of_cells)
+ logger.debug(master_schema_entry_string)
+
+ column_headers = []
+ column_headers.extend(["File Source", "Version", "Page Version", "Cell Source", "Page Number", "Location",
+ "Carved", "Status", "File Offset", "Row ID"])
+ column_headers.extend([column_definition.column_name
+ for column_definition in master_schema_entry.column_definitions])
+
+ logger.debug("Column Headers: {}".format(" , ".join(column_headers)))
+
+ csv_writer.writerow(column_headers)
+
+ sorted_cells = sorted(cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id)
+
+ for cell in sorted_cells:
+
+ cell_record_column_values = []
+
+ for record_column in cell.payload.record_columns:
+ serial_type = record_column.serial_type
+ text_affinity = True if serial_type >= 13 and serial_type % 2 == 1 else False
+ value = record_column.value
+ if value is None:
+ pass
+ elif isinstance(value, (bytearray, str)):
+ value = value.decode(version.database_text_encoding, "replace") if text_affinity else str(value)
+ try:
+ value = value.encode(UTF_8)
+ except UnicodeDecodeError:
+ value = value.decode(UTF_8, "replace").encode(UTF_8)
+ value = ILLEGAL_XML_CHARACTER_PATTERN.sub(" ", value)
+ if value.startswith("="):
+ value = ' ' + value
+ value = str(value)
+ cell_record_column_values.append(value)
+
+ row = [version.file_type, cell.version_number, cell.page_version_number, cell.source, cell.page_number,
+ cell.location, False, "Complete", cell.file_offset, cell.row_id]
+ row.extend(cell_record_column_values)
+ csv_writer.writerow(row)
+
+ if logger.isEnabledFor(DEBUG):
+ for cell in sorted_cells:
+ cell_record_column_values = [str(record_column.value) if record_column.value else "NULL"
+ for record_column in cell.payload.record_columns]
+ log_message = "File source: {} version: {} page version: {} cell source: {} page: {} location: {} " \
+ "carved: {} status: {} at file offset: {} for row id: {}: "
+ log_message = log_message.format(version.file_type, cell.version_number, cell.page_version_number,
+ cell.source, cell.page_number, cell.location, False, "Complete",
+ cell.file_offset, cell.row_id)
+ log_message += "(" + ", ".join(cell_record_column_values) + ")"
+ logger.debug(log_message)
+
+ VersionCsvExporter._write_b_tree_table_master_schema_carved_records(csv_writer, version, carved_cells, True)
+
+ @staticmethod
+ def _write_b_tree_table_master_schema_carved_records(csv_writer, version, carved_cells, has_row_ids):
+
+ logger = getLogger(LOGGER_NAME)
+
+ for carved_cell in carved_cells:
+
+ cell_record_column_values = []
+
+ for record_column in carved_cell.payload.record_columns:
+ serial_type = record_column.serial_type
+ text_affinity = True if serial_type >= 13 and serial_type % 2 == 1 else False
+ value = record_column.value
+ if value is None:
+ pass
+ elif isinstance(value, (bytearray, str)):
+ value = value.decode(version.database_text_encoding, "replace") if text_affinity else str(value)
+ try:
+ value = value.encode(UTF_8)
+ except UnicodeDecodeError:
+ value = value.decode(UTF_8, "replace").encode(UTF_8)
+ value = ILLEGAL_XML_CHARACTER_PATTERN.sub(" ", value)
+ if value.startswith("="):
+ value = ' ' + value
+ value = str(value)
+ cell_record_column_values.append(value)
+
+ row = [version.file_type, carved_cell.version_number, carved_cell.page_version_number,
+ carved_cell.source, carved_cell.page_number, carved_cell.location, True, "Unknown",
+ carved_cell.file_offset]
+ if has_row_ids:
+ row.append("")
+ row.extend(cell_record_column_values)
+ csv_writer.writerow(row)
+
+ if logger.isEnabledFor(DEBUG):
+ for carved_cell in carved_cells:
+ cell_record_column_values = [str(record_column.value) if record_column.value else "NULL"
+ for record_column in carved_cell.payload.record_columns]
+                log_message = "File source: {} version: {} page version: {} cell source: {} page: {} location: {} " \
+ "carved: {} status: {} at file offset: {}"
+ log_message = log_message.format(version.file_type, carved_cell.version_number,
+ carved_cell.page_version_number, carved_cell.source,
+ carved_cell.page_number, carved_cell.location, True,
+ "Unknown", carved_cell.file_offset)
+ if has_row_ids:
+ log_message += " for row id: {}:".format("")
+ log_message += "(" + ", ".join(cell_record_column_values) + ")"
+ logger.debug(log_message)
+
+
+class CommitCsvExporter(object):
+
+ def __init__(self, export_directory, file_name_prefix=""):
+ self._export_directory = export_directory
+ self._file_name_prefix = file_name_prefix
+ self._csv_file_names = {}
+
+ def write_commit(self, master_schema_entry, commit):
+
+ """
+
+        This function will write the specified commit record for the master schema entry to its corresponding csv file.
+
+ Note: This function only writes the commit record if the commit record was updated.
+
+ :param master_schema_entry:
+ :param commit:
+
+ :return:
+
+ """
+
+ if not commit.updated:
+ return
+
+ logger = getLogger(LOGGER_NAME)
+
+ mode = "ab"
+ csv_file_name = self._csv_file_names[commit.name] if commit.name in self._csv_file_names else None
+ write_headers = False
+
+ if not csv_file_name:
+ mode = "wb"
+ commit_name = sub(" ", "_", commit.name)
+ csv_file_name = self._export_directory + sep + self._file_name_prefix + "-" + commit_name + ".csv"
+ self._csv_file_names[commit.name] = csv_file_name
+ write_headers = True
+
+ with open(csv_file_name, mode) as csv_file_handle:
+
+ csv_writer = writer(csv_file_handle, delimiter=',', quotechar="\"", quoting=QUOTE_ALL)
+
+ """
+
+ Below we have to account for how the pages are stored.
+
+ For the table master schema entry row type:
+ 1.) If the table is not a "without rowid" table, it will be stored on a table b-tree page with
+ row ids.
+ 2.) If the table is a "without rowid" table, it will be stored on an index b-tree page with no
+ row ids.
+
+ For the index master schema entry row type:
+ 1.) It will be stored on an index b-tree page with no row ids.
+
+ The commit object handles this by having a page type to make this distinction easier. Therefore, we only
+ need to check on the page type here.
+
+ """
+
+ column_headers = []
+ if write_headers:
+ column_headers.extend(["File Source", "Version", "Page Version", "Cell Source", "Page Number",
+ "Location", "Operation", "File Offset"])
+
+ if commit.page_type == PAGE_TYPE.B_TREE_INDEX_LEAF:
+
+ """
+
+ Note: The index master schema entries are currently not fully parsed and therefore we do not have
+ column definitions in order to derive the column names from.
+
+ """
+
+                # Only write the column headers when the csv file is first being created
+                if write_headers:
+                    csv_writer.writerow(column_headers)
+
+ CommitCsvExporter._write_cells(csv_writer, commit.file_type, commit.database_text_encoding,
+ commit.page_type, commit.added_cells.values(), "Added")
+ CommitCsvExporter._write_cells(csv_writer, commit.file_type, commit.database_text_encoding,
+ commit.page_type, commit.updated_cells.values(), "Updated")
+ CommitCsvExporter._write_cells(csv_writer, commit.file_type, commit.database_text_encoding,
+ commit.page_type, commit.deleted_cells.values(), "Deleted")
+ CommitCsvExporter._write_cells(csv_writer, commit.file_type, commit.database_text_encoding,
+ commit.page_type, commit.carved_cells.values(), "Carved")
+
+ elif commit.page_type == PAGE_TYPE.B_TREE_TABLE_LEAF or commit.page_type == PAGE_TYPE.B_TREE_TABLE_INTERIOR:
+
+ if write_headers:
+ column_headers.append("Row ID")
+ column_headers.extend([column_definition.column_name
+ for column_definition in master_schema_entry.column_definitions])
+ csv_writer.writerow(column_headers)
+
+ # Sort the added, updated, and deleted cells by the row id
+ sorted_added_cells = sorted(commit.added_cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id)
+ CommitCsvExporter._write_cells(csv_writer, commit.file_type, commit.database_text_encoding,
+ commit.page_type, sorted_added_cells, "Added")
+ sorted_updated_cells = sorted(commit.updated_cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id)
+ CommitCsvExporter._write_cells(csv_writer, commit.file_type, commit.database_text_encoding,
+ commit.page_type, sorted_updated_cells, "Updated")
+ sorted_deleted_cells = sorted(commit.deleted_cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id)
+ CommitCsvExporter._write_cells(csv_writer, commit.file_type, commit.database_text_encoding,
+ commit.page_type, sorted_deleted_cells, "Deleted")
+
+ # We will not sort the carved cells since row ids are not deterministic even if parsed
+ CommitCsvExporter._write_cells(csv_writer, commit.file_type, commit.database_text_encoding,
+ commit.page_type, commit.carved_cells.values(), "Carved")
+
+ else:
+
+ log_message = "Invalid commit page type: {} found for csv export on master " \
+ "schema entry name: {} while writing to csv file name: {}."
+ log_message = log_message.format(commit.page_type, commit.name, csv_file_name)
+ logger.warn(log_message)
+ raise ExportError(log_message)
+
+ @staticmethod
+ def _write_cells(csv_writer, file_type, database_text_encoding, page_type, cells, operation):
+
+ """
+
+        This function will write the list of cells sent in to the csv writer specified, including the metadata
+        regarding the file type, page type, and operation.
+
+ Note: The types of the data in the values can prove to be an issue here. We want to write the value out as
+ a string similarly as the text and csv outputs do for example even though it may contain invalid
+ characters. When data is sent into the openpyxl library to be written to the xml xlsx, if it is a
+ string, it is encoded into the default encoding and then checked for xml illegal characters that may
+ pose an issue when written to the xml. In order to properly check the values and write them accordingly
+ through the openpyxl library we address the following use cases for the value in order:
+ 1.) If the value is a bytearray (most likely originally a blob object) or a string value, we want to
+ write the value as a string. However, in order to do this for blob objects or strings that may
+ have a few bad characters in them from carving, we need to do our due diligence and make sure
+ there are no bad unicode characters and no xml illegal characters that may cause issues with
+ writing to the xlsx. In order to do this we do the following:
+ a.) We first convert the value to string if the affinity was not text, otherwise we decode
+ the value in the database text encoding. When we decode using the database text encoding,
+ we specify to "replace" characters it does not recognize in order to compensate for carved
+ rows.
+ b.) We then test encoding it to UTF-8.
+ i.) If the value successfully encodes as UTF-8 we set that as the value.
+ ii.) If the value throws an exception encoding, we have illegal unicode characters in the
+ string that need to be addressed. In order to escape these, we decode the string
+ as UTF-8 using the "replace" method to replace any illegal unicode characters
+ with '\ufffd' and set this back as the value after encoding again.
+ c.) After we have successfully set the value back to a UTF-8 compliant value, we need to check
+ the value for xml illegal characters. If any of these xml illegal characters are found,
+                              they are replaced with a space.  This behaviour may differ between the xlsx output and
+                              the text/csv output in reference to xml illegal characters, since the xlsx is written
+                              to xml and additional rules apply for certain characters.
+ d.) After all the illegal characters are removed, due to the way openpyxl determines data types
+ of particular cells, if a cell starts with "=", it is determined to be a formula and set as
+ that in the data type field for that cell. This causes issues when opening the file in excel.
+ Microsoft Excel recommends prefacing the string with a single quote character, however,
+ this only seems to be within Excel itself. You can specify the data type of the cell in
+ openpyxl, but not in the write-only mode that is being used here. In order to work around
+ this, we check if the first character of a string or bytearray is a "=" character and preface
+ that string with a space. There may be better ways to handle this such as not using the
+ write-only mode.
+ Note: Additionally to the "=" character, the "-" character has similar issues in excel.
+ However, openpyxl explicitly checks on the "=" character being the first character
+ and setting that cell to a formula and does not handle the use case of a cell starting
+ with the "-" character, so this use case is ignored.
+ 2.) If the value does not fall in one of the above use cases, we leave it as is and write it to the
+ xlsx without any modifications.
+
+ Note: If the value is None, we leave it as None. We used to update the None value with the string "NULL"
+ since issues could be seen when carving cells where the value is None not because it was NULL originally
+ in the database, but because it was unable to be parsed out when it may have actually had a value (when
+ it was truncated). Distinction is needed between these two use cases.
+
+        Note:  It was noticed that blob objects are typically detected as instances of str here and strings as
+               bytearray objects.  Why exactly blob objects are coming out as str objects still needs to be
+               investigated.
+
+        Note:  A comparison should be done of how other applications handle different database text encodings
+               in reference to their output.
+
+ Note: The decoding of the value in the database text encoding should only specify replace on a carved entry.
+
+ :param csv_writer:
+ :param file_type:
+ :param database_text_encoding:
+ :param page_type:
+ :param cells:
+ :param operation:
+
+ :return:
+
+ """
+
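+        # Illustrative example of the normalization described above (hypothetical carved value):
+        #   bytearray(b"=SUM(A1)\x0b") with text affinity
+        #     -> decoded with the database text encoding using "replace"
+        #     -> re-encoded as UTF-8
+        #     -> any xml illegal characters (here the hypothetical "\x0b") replaced with a space
+        #     -> prefaced with a space because it starts with "=", yielding " =SUM(A1) "
+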
+ for cell in cells:
+
+ cell_record_column_values = []
+ for record_column in cell.payload.record_columns:
+ serial_type = record_column.serial_type
+ text_affinity = True if serial_type >= 13 and serial_type % 2 == 1 else False
+ value = record_column.value
+ if value is None:
+ pass
+ elif isinstance(value, (bytearray, str)):
+ value = value.decode(database_text_encoding, "replace") if text_affinity else str(value)
+ try:
+ value = value.encode(UTF_8)
+ except UnicodeDecodeError:
+ value = value.decode(UTF_8, "replace").encode(UTF_8)
+ value = ILLEGAL_XML_CHARACTER_PATTERN.sub(" ", value)
+ if value.startswith("="):
+ value = ' ' + value
+ value = str(value)
+ cell_record_column_values.append(value)
+
+ row = [file_type, cell.version_number, cell.page_version_number, cell.source, cell.page_number,
+ cell.location, operation, cell.file_offset]
+ if page_type == PAGE_TYPE.B_TREE_TABLE_LEAF:
+ row.append(cell.row_id)
+ row.extend(cell_record_column_values)
+ csv_writer.writerow(row)
diff --git a/sqlite_dissect/export/sqlite_export.py b/sqlite_dissect/export/sqlite_export.py
new file mode 100644
index 0000000..3d120f5
--- /dev/null
+++ b/sqlite_dissect/export/sqlite_export.py
@@ -0,0 +1,412 @@
+from logging import getLogger
+from os import rename
+from os.path import exists
+from os.path import sep
+from re import sub
+from sqlite3 import connect
+from sqlite3 import sqlite_version
+from sqlite3 import version
+from uuid import uuid4
+from sqlite_dissect.constants import LOGGER_NAME
+from sqlite_dissect.constants import PAGE_TYPE
+from sqlite_dissect.exception import ExportError
+
+"""
+
+sqlite_export.py
+
+This script holds the objects used for exporting results of the SQLite carving framework to SQLite files.
+
+Note: During development this script was written and tested using SQLite version 3.9.2.  The pysqlite version
+ was 2.6.0. Keep in mind that sqlite3.version gives version information on the pysqlite SQLite interface code,
+ whereas sqlite3.sqlite_version gives the actual version of the SQLite driver that is used.
+
+This script holds the following object(s):
+CommitSqliteExporter(object)
+
+"""
+
+
+class CommitSqliteExporter(object):
+
+ def __init__(self, export_directory, file_name):
+
+ """
+
+ Constructor.
+
+ The master schema entries created tables dictionary will hold the names of the created tables in the SQLite
+ file being written to so consecutive writes to those tables will be able to tell if the table was already
+ created or not. The reason it is a dictionary and not just a list of names is that the value keyed off the
+ master schema name will be the number of columns in that table. This is needed since different rows within
+ the same table may have a different number of columns in the case that the table was altered and columns were
+ added at some point. This way the number of columns can be specified and values that may be missing can be
+ specified as being left NULL.
+
+        Note:  According to documentation, it appears only tables can be altered.  However, we include the same logic
+               with the number of columns for both tables and indexes for consistency and code reduction.
+
+ Note: If the file is detected as already existing, a uuid will be appended to the file name of the old file
+ and a new file by the name specified will be created.
+
+ :param export_directory:
+ :param file_name:
+
+ :return:
+
+ """
+
+ self._sqlite_file_name = export_directory + sep + file_name
+ self._connection = None
+ self._master_schema_entries_created_tables = {}
+
+ def __enter__(self):
+
+ # Check if the file exists and if it does rename it
+ if exists(self._sqlite_file_name):
+
+ # Generate a uuid to append to the file name
+ new_file_name_for_existing_file = self._sqlite_file_name + "-" + str(uuid4())
+
+ # Rename the existing file
+ rename(self._sqlite_file_name, new_file_name_for_existing_file)
+
+ log_message = "File: {} already existing when creating the file for commit sqlite exporting. The " \
+ "file was renamed to: {} and new data will be written to the file name specified."
+ log_message = log_message.format(self._sqlite_file_name, new_file_name_for_existing_file)
+ getLogger(LOGGER_NAME).debug(log_message)
+
+ self._connection = connect(self._sqlite_file_name)
+ log_message = "Opened connection to {} using sqlite version: {} and pysqlite version: {}"
+ log_message = log_message.format(self._sqlite_file_name, sqlite_version, version)
+ getLogger(LOGGER_NAME).debug(log_message)
+ return self
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ self._connection.close()
+ log_message = "Closed connection to {} using sqlite version: {} and pysqlite version: {}"
+ log_message = log_message.format(self._sqlite_file_name, sqlite_version, version)
+ getLogger(LOGGER_NAME).debug(log_message)
+
+ def write_commit(self, master_schema_entry, commit):
+
+ """
+
+        This function will write the specified commit record for the master schema entry to its corresponding table
+        in the SQLite export file.
+
+ Note: This function only writes the commit record if the commit record was updated.
+
+ Note: Any table or index names beginning with sqlite_ are not allowed since "sqlite_" is reserved for
+ internal schema object names. In the case that a table or index is an internal schema object, we
+ will preface that name with an "iso_" representing an (i)nternal (s)chema (o)bject.
+
+ :param master_schema_entry:
+ :param commit:
+
+ :return:
+
+ """
+
+ if not commit.updated:
+ return
+
+ logger = getLogger(LOGGER_NAME)
+
+        # Check if the master schema entry name is an internal schema object and if so preface it with "iso_"
+ internal_schema_object = master_schema_entry.internal_schema_object \
+ if hasattr(master_schema_entry, "internal_schema_object") else False
+ table_name = "iso_" + master_schema_entry.name if internal_schema_object else master_schema_entry.name
+
+ # Check if we have created the table for this master schema entry name yet
+ if master_schema_entry.name not in self._master_schema_entries_created_tables:
+
+ column_headers = ["File Source", "Version", "Page Version", "Cell Source", "Page Number", "Location",
+ "Operation", "File Offset"]
+
+ """
+
+ Below we have to account for how the pages are stored.
+
+ For the table master schema entry row type:
+ 1.) If the table is not a "without rowid" table, it will be stored on a table b-tree page with
+ row ids.
+ 2.) If the table is a "without rowid" table, it will be stored on an index b-tree page with no
+ row ids.
+
+ For the index master schema entry row type:
+ 1.) It will be stored on an index b-tree page with no row ids.
+
+ The commit object handles this by having a page type to make this distinction easier. Therefore, we only
+ need to check on the page type here.
+
+ """
+
+ if commit.page_type == PAGE_TYPE.B_TREE_INDEX_LEAF:
+
+ """
+
+ Note: The index master schema entries are currently not fully parsed and therefore we do not have
+ column definitions in order to derive the column names from.
+
+ Since we need to have column headers defined for each of the fields, here we calculate the
+ number of additional columns that will be needed to output the fields from the index and expand
+ the table by that number using generic column names.
+
+ At least one of the added, updated, deleted, or carved cells fields must be set for the commit
+ to have been considered updated and for us to have gotten here.
+
+ """
+
+ cells = list()
+ cells.extend(commit.added_cells.values())
+ cells.extend(commit.updated_cells.values())
+ cells.extend(commit.deleted_cells.values())
+ cells.extend(commit.carved_cells.values())
+
+ if len(cells) < 1:
+                    log_message = "Found an invalid number of cells: {} in a commit marked as updated " \
+                                  "for sqlite export on master schema entry name: {} page type: {} " \
+                                  "while writing to sqlite file name: {}."
+ log_message = log_message.format(len(cells), commit.name, commit.page_type, self._sqlite_file_name)
+ logger.warn(log_message)
+ raise ExportError(log_message)
+
+ number_of_columns = len(cells[0].payload.record_columns)
+ index_column_headers = []
+ for i in range(number_of_columns):
+ index_column_headers.append("Column {}".format(i))
+
+ column_headers.extend(index_column_headers)
+ column_headers = [sub(" ", "_", column_header).lower() for column_header in column_headers]
+
+ elif commit.page_type == PAGE_TYPE.B_TREE_TABLE_LEAF:
+
+ column_definitions = [column_definition.column_name
+ for column_definition in master_schema_entry.column_definitions]
+ column_headers.append("Row ID")
+
+ """
+
+ In order to make sure there are no pre-existing columns with "sd_" prefacing them, we check for that
+ use case and add another "sd_" to the beginning of the column header name until there are no conflicts.
+
+ """
+
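+                # For example, "File Source" becomes "sd_file_source"; if the table already defines a
+                # "sd_file_source" column, the header becomes "sd_sd_file_source", and so on.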
+ updated_column_headers = []
+ for column_header in column_headers:
+ updated_column_header_name = "sd_" + sub(" ", "_", column_header).lower()
+ while updated_column_header_name in column_definitions:
+ updated_column_header_name = "sd_" + updated_column_header_name
+ updated_column_headers.append(updated_column_header_name)
+
+ updated_column_headers.extend(column_definitions)
+ column_headers = updated_column_headers
+
+ else:
+
+ log_message = "Invalid commit page type: {} found for sqlite export on master " \
+ "schema entry name: {} while writing to sqlite file name: {}."
+ log_message = log_message.format(commit.page_type, commit.name, self._sqlite_file_name)
+ logger.warn(log_message)
+ raise ExportError(log_message)
+
+ create_table_statement = "CREATE TABLE {} ({})"
+ create_table_statement = create_table_statement.format(table_name, " ,".join(column_headers))
+ self._connection.execute(create_table_statement)
+ self._connection.commit()
+
+ self._master_schema_entries_created_tables[master_schema_entry.name] = len(column_headers)
+
+ """
+
+ Now write all of the cells to the SQLite file in their table.
+
+ """
+
+ column_count = self._master_schema_entries_created_tables[master_schema_entry.name]
+
+ if commit.page_type == PAGE_TYPE.B_TREE_INDEX_LEAF:
+
+ CommitSqliteExporter._write_cells(self._connection, table_name, column_count, commit.file_type,
+ commit.database_text_encoding, commit.page_type,
+ commit.added_cells.values(), "Added")
+ CommitSqliteExporter._write_cells(self._connection, table_name, column_count, commit.file_type,
+ commit.database_text_encoding, commit.page_type,
+ commit.updated_cells.values(), "Updated")
+ CommitSqliteExporter._write_cells(self._connection, table_name, column_count, commit.file_type,
+ commit.database_text_encoding, commit.page_type,
+ commit.deleted_cells.values(), "Deleted")
+ CommitSqliteExporter._write_cells(self._connection, table_name, column_count, commit.file_type,
+ commit.database_text_encoding, commit.page_type,
+ commit.carved_cells.values(), "Carved")
+
+ elif commit.page_type == PAGE_TYPE.B_TREE_TABLE_LEAF:
+
+ # Sort the added, updated, and deleted cells by the row id
+ sorted_added_cells = sorted(commit.added_cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id)
+ CommitSqliteExporter._write_cells(self._connection, table_name, column_count, commit.file_type,
+ commit.database_text_encoding, commit.page_type, sorted_added_cells,
+ "Added")
+ sorted_updated_cells = sorted(commit.updated_cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id)
+ CommitSqliteExporter._write_cells(self._connection, table_name, column_count, commit.file_type,
+ commit.database_text_encoding, commit.page_type, sorted_updated_cells,
+ "Updated")
+ sorted_deleted_cells = sorted(commit.deleted_cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id)
+ CommitSqliteExporter._write_cells(self._connection, table_name, column_count, commit.file_type,
+ commit.database_text_encoding, commit.page_type, sorted_deleted_cells,
+ "Deleted")
+
+ # We will not sort the carved cells since row ids are not deterministic even if parsed
+ CommitSqliteExporter._write_cells(self._connection, table_name, column_count, commit.file_type,
+ commit.database_text_encoding, commit.page_type,
+ commit.carved_cells.values(), "Carved")
+
+ else:
+
+ log_message = "Invalid commit page type: {} found for sqlite export on master " \
+ "schema entry name: {} while writing to sqlite file name: {}."
+ log_message = log_message.format(commit.page_type, commit.name, self._sqlite_file_name)
+ logger.warn(log_message)
+ raise ExportError(log_message)
+
+ """
+
+ Commit any entries written to the SQLite file.
+
+ Note: This is done to speed up writing to the SQLite file and was previously in the "_write_cells" function
+ and called after every set of cells written. Now that it has been brought out here, it will execute
+ for every commit record. This will reduce calls to commit and also make sure at least one statement
+ has been executed when calling a commit. In addition the insert statement was changed to insert
+ many at a time instead of individually.
+
+ """
+
+ self._connection.commit()
+
+ @staticmethod
+ def _write_cells(connection, table_name, column_count, file_type,
+ database_text_encoding, page_type, cells, operation):
+
+ """
+
+        This function will write the list of cells sent in to the connection under the table name specified, including
+        the metadata regarding the file type, page type, and operation.
+
+ Note: The types of the data in the values can prove to be an issue here. For the most part we want to write
+ back the value as the type that we read it out of the file as even though the data has the possibility
+ of still being stored differently since we are leaving all data types to be undefined causing the storage
+ algorithm internal to SQLite to slightly change. Despite this, we make the following modifications in
+ order to best ensure data integrity when writing the data back to the SQLite file:
+ 1.) If the value is a bytearray, the value is interpreted as a blob object. In order to write this
+ back correctly, we set it to buffer(value) in order to write it back to the SQLite database as
+ a blob object. Before we write it back, we make sure that the object does not have text affinity,
+ or if it does we decode it in the database text encoding before writing it.
+ 2.) If the value is a string, we encode it using UTF-8. If this fails, that means it had characters
+ not supported by the unicode encoding which caused it to fail. Since we are writing back carved
+ records that may have invalid characters in strings due to parts being overwritten or false
+ positives, this can occur a lot. Therefore, if the unicode encoding fails, we do the same
+ as above for blob objects and create a buffer(value) blob object and write that back to the
+ database in order to maintain the original data. Therefore, in some tables, depending on the
+ data parsed or strings retrieved may be stored in either a string (text) or blob storage class.
+ 3.) If the value does not fall in one of the above use cases, we leave it as is and write it back to the
+ database without any modifications.
+
+ Note: If the value is None, we leave it as None. We used to update the None value with the string "NULL"
+ since issues could be seen when carving cells where the value is None not because it was NULL originally
+ in the database, but because it was unable to be parsed out when it may have actually had a value (when
+ it was truncated). Distinction is needed between these two use cases.
+
+        Note:  Since the number of columns found may be less than the number of columns actually in the SQL/schema
+               due to alter table statements over time that may have added columns, we account for the difference
+               in the number of columns.  This is done by taking the difference between the number of columns in the
+               SQL/schema and the number of columns in the particular row being worked on, and padding the row out
+               with that many None values so the remaining columns are stored with no data (NULL) in the SQLite
+               database.
+
+ :param connection:
+ :param table_name:
+ :param column_count:
+ :param file_type:
+ :param database_text_encoding:
+ :param page_type:
+ :param cells:
+ :param operation:
+
+ :return:
+
+ """
+
+ if cells:
+
+ entries = []
+
+ for cell in cells:
+
+ cell_record_column_values = []
+ for record_column in cell.payload.record_columns:
+ serial_type = record_column.serial_type
+ text_affinity = True if serial_type >= 13 and serial_type % 2 == 1 else False
+ value = record_column.value
+
+ if value is None:
+ pass
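+                    # buffer() wraps the bytes so the Python 2 sqlite3 driver binds the value as a BLOB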
+ elif isinstance(value, bytearray):
+ if text_affinity:
+ value = value.decode(database_text_encoding, "replace")
+ else:
+ value = buffer(value)
+ elif isinstance(value, str):
+ try:
+ if text_affinity:
+ value = value.decode(database_text_encoding, "replace")
+ else:
+ value = buffer(value)
+ except UnicodeDecodeError:
+
+ """
+
+ Note: Here we do not decode or encode the value, since the above failed the value will
+ contain text that cannot be properly decoded and most likely due to random bytes
+ in a carving. In this case, we just print the value without trying to account
+ for the database text encoding which may mean the text may appear differently
+                            (i.e. with spaces between each character), but it is better to do it this way
+                            rather than to risk replacing characters since we don't know if it is indeed text.
+
+ """
+
+ value = buffer(value)
+
+ cell_record_column_values.append(value)
+
+ row = [file_type, cell.version_number, cell.page_version_number, cell.source, cell.page_number,
+ cell.location, operation, cell.file_offset]
+ if page_type == PAGE_TYPE.B_TREE_TABLE_LEAF:
+ row.append(cell.row_id)
+ row.extend(cell_record_column_values)
+
+ # Check the length of the row against the column count and pad it out with NULLs if necessary
+ if len(row) < column_count:
+ row.extend([None] * (column_count - len(row)))
+
+ if len(row) > column_count:
+ log_message = "The number of columns found in the row: {} were more than the expected: {} " \
+ "for sqlite export on master schema entry name: {} with file type: {} " \
+ "and page type: {}."
+ log_message = log_message.format(len(row), column_count, table_name, file_type, page_type)
+ getLogger(LOGGER_NAME).warn(log_message)
+ raise ExportError(log_message)
+
+ entries.append(tuple(row))
+
+ if not entries:
+ log_message = "Did not find any entries to write when cells were specified for sqlite export on " \
+ "master schema entry name: {} with file type: {} and page type: {}."
+ log_message = log_message.format(table_name, file_type, page_type)
+ getLogger(LOGGER_NAME).warn(log_message)
+ raise ExportError(log_message)
+
+        # Build one "?" placeholder per column in the row (the first "?" covers the first column)
+        number_of_remaining_columns = len(entries[0]) - 1
+
+        column_fields = "?" + (", ?" * number_of_remaining_columns)
+ insert_statement = "INSERT INTO {} VALUES ({})".format(table_name, column_fields)
+ connection.executemany(insert_statement, entries)
diff --git a/sqlite_dissect/export/text_export.py b/sqlite_dissect/export/text_export.py
new file mode 100644
index 0000000..ba53927
--- /dev/null
+++ b/sqlite_dissect/export/text_export.py
@@ -0,0 +1,257 @@
+from logging import getLogger
+from os import rename
+from os.path import exists
+from os.path import sep
+from uuid import uuid4
+from sqlite_dissect.constants import LOGGER_NAME
+from sqlite_dissect.constants import PAGE_TYPE
+from sqlite_dissect.exception import ExportError
+from sqlite_dissect.output import stringify_cell_record
+
+"""
+
+text_export.py
+
+This script holds the objects used for exporting results of the SQLite carving framework to text files.
+
+This script holds the following object(s):
+CommitConsoleExporter(object)
+CommitTextExporter(object)
+
+"""
+
+
+class CommitConsoleExporter(object):
+
+ @staticmethod
+ def write_header(master_schema_entry, page_type):
+ header = "\nMaster schema entry: {} row type: {} on page type: {} with sql: {}."
+ header = header.format(master_schema_entry.name, master_schema_entry.row_type,
+ page_type, master_schema_entry.sql)
+ print(header)
+
+ @staticmethod
+ def write_commit(commit):
+
+ """
+
+        This function will print the specified commit record to the console.
+
+ Note: This function only prints the commit record if the commit record was updated.
+
+ :param commit:
+
+ :return:
+
+ """
+
+ if not commit.updated:
+ return
+
+ logger = getLogger(LOGGER_NAME)
+
+ commit_header = "Commit: {} updated in version: {} with root page number: {} on b-tree page numbers: {}."
+ print(commit_header.format(commit.name, commit.version_number,
+ commit.root_page_number, commit.b_tree_page_numbers))
+
+ if commit.page_type == PAGE_TYPE.B_TREE_INDEX_LEAF:
+
+ CommitConsoleExporter._write_cells(commit.file_type, commit.database_text_encoding, commit.page_type,
+ commit.added_cells.values(), "Added")
+ CommitConsoleExporter._write_cells(commit.file_type, commit.database_text_encoding, commit.page_type,
+ commit.updated_cells.values(), "Updated")
+ CommitConsoleExporter._write_cells(commit.file_type, commit.database_text_encoding, commit.page_type,
+ commit.deleted_cells.values(), "Deleted")
+ CommitConsoleExporter._write_cells(commit.file_type, commit.database_text_encoding, commit.page_type,
+ commit.carved_cells.values(), "Carved")
+
+ elif commit.page_type == PAGE_TYPE.B_TREE_TABLE_LEAF:
+
+ # Sort the added, updated, and deleted cells by the row id
+ sorted_added_cells = sorted(commit.added_cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id)
+ CommitConsoleExporter._write_cells(commit.file_type, commit.database_text_encoding, commit.page_type,
+ sorted_added_cells, "Added")
+ sorted_updated_cells = sorted(commit.updated_cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id)
+ CommitConsoleExporter._write_cells(commit.file_type, commit.database_text_encoding, commit.page_type,
+ sorted_updated_cells, "Updated")
+ sorted_deleted_cells = sorted(commit.deleted_cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id)
+ CommitConsoleExporter._write_cells(commit.file_type, commit.database_text_encoding, commit.page_type,
+ sorted_deleted_cells, "Deleted")
+
+ # We will not sort the carved cells since row ids are not deterministic even if parsed
+ CommitConsoleExporter._write_cells(commit.file_type, commit.database_text_encoding, commit.page_type,
+ commit.carved_cells.values(), "Carved")
+
+ else:
+
+            log_message = "Invalid commit page type: {} found for console export on master " \
+                          "schema entry name: {}."
+            log_message = log_message.format(commit.page_type, commit.name)
+ logger.warn(log_message)
+ raise ExportError(log_message)
+
+ @staticmethod
+ def _write_cells(file_type, database_text_encoding, page_type, cells, operation):
+
+ """
+
+        This function will print the list of cells sent in to the console, including the metadata regarding
+        the file type, page type, and operation.
+
+ Note: Since we are writing out to text, all values are written as strings.
+
+ :param file_type:
+ :param database_text_encoding:
+ :param page_type:
+ :param cells:
+ :param operation:
+
+ :return:
+
+ """
+
+ base_string = "File Type: {} Version Number: {} Page Version Number: {} Source: {} " \
+ "Page Number: {} Location: {} Operation: {} File Offset: {}"
+ for cell in cells:
+ preface = base_string.format(file_type, cell.version_number, cell.page_version_number, cell.source,
+ cell.page_number, cell.location, operation, cell.file_offset)
+ row_values = stringify_cell_record(cell, database_text_encoding, page_type)
+ print(preface + " " + row_values + ".")
+
+
+class CommitTextExporter(object):
+
+ def __init__(self, export_directory, file_name):
+
+ """
+
+        Constructor.
+
+ Note: If the file is detected as already existing, a uuid will be appended to the file name of the old file
+ and a new file by the name specified will be created.
+
+ :param export_directory:
+ :param file_name:
+
+ :return:
+
+ """
+
+ self._text_file_name = export_directory + sep + file_name
+ self._file_handle = None
+
+ def __enter__(self):
+
+ # Check if the file exists and if it does rename it
+ if exists(self._text_file_name):
+
+ # Generate a uuid to append to the file name
+ new_file_name_for_existing_file = self._text_file_name + "-" + str(uuid4())
+
+ # Rename the existing file
+ rename(self._text_file_name, new_file_name_for_existing_file)
+
+ log_message = "File: {} already existing when creating the file for commit text exporting. The " \
+ "file was renamed to: {} and new data will be written to the file name specified."
+ log_message = log_message.format(self._text_file_name, new_file_name_for_existing_file)
+ getLogger(LOGGER_NAME).debug(log_message)
+
+ self._file_handle = open(self._text_file_name, "w")
+ return self
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ self._file_handle.close()
+
+ def write_header(self, master_schema_entry, page_type):
+ header = "\nMaster schema entry: {} row type: {} on page type: {} with sql: {}."
+ header = header.format(master_schema_entry.name, master_schema_entry.row_type,
+ page_type, master_schema_entry.sql)
+ self._file_handle.write(header + "\n")
+
+ def write_commit(self, commit):
+
+ """
+
+        This function will write the specified commit record to the text file.
+
+ Note: This function only writes the commit record if the commit record was updated.
+
+ :param commit:
+
+ :return:
+
+ """
+
+ if not commit.updated:
+ return
+
+ logger = getLogger(LOGGER_NAME)
+
+ commit_header = "Commit: {} updated in version: {} with root page number: {} on b-tree page numbers: {}.\n"
+ self._file_handle.write(commit_header.format(commit.name, commit.version_number,
+ commit.root_page_number, commit.b_tree_page_numbers))
+
+ if commit.page_type == PAGE_TYPE.B_TREE_INDEX_LEAF:
+
+ CommitTextExporter._write_cells(self._file_handle, commit.file_type, commit.database_text_encoding,
+ commit.page_type, commit.added_cells.values(), "Added")
+ CommitTextExporter._write_cells(self._file_handle, commit.file_type, commit.database_text_encoding,
+ commit.page_type, commit.updated_cells.values(), "Updated")
+ CommitTextExporter._write_cells(self._file_handle, commit.file_type, commit.database_text_encoding,
+ commit.page_type, commit.deleted_cells.values(), "Deleted")
+ CommitTextExporter._write_cells(self._file_handle, commit.file_type, commit.database_text_encoding,
+ commit.page_type, commit.carved_cells.values(), "Carved")
+
+ elif commit.page_type == PAGE_TYPE.B_TREE_TABLE_LEAF:
+
+ # Sort the added, updated, and deleted cells by the row id
+ sorted_added_cells = sorted(commit.added_cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id)
+ CommitTextExporter._write_cells(self._file_handle, commit.file_type, commit.database_text_encoding,
+ commit.page_type, sorted_added_cells, "Added")
+ sorted_updated_cells = sorted(commit.updated_cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id)
+ CommitTextExporter._write_cells(self._file_handle, commit.file_type, commit.database_text_encoding,
+ commit.page_type, sorted_updated_cells, "Updated")
+ sorted_deleted_cells = sorted(commit.deleted_cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id)
+ CommitTextExporter._write_cells(self._file_handle, commit.file_type, commit.database_text_encoding,
+ commit.page_type, sorted_deleted_cells, "Deleted")
+
+ # We will not sort the carved cells since row ids are not deterministic even if parsed
+ CommitTextExporter._write_cells(self._file_handle, commit.file_type, commit.database_text_encoding,
+ commit.page_type, commit.carved_cells.values(), "Carved")
+
+ else:
+
+ log_message = "Invalid commit page type: {} found for text export on master " \
+ "schema entry name: {}."
+ log_message = log_message.format(commit.page_type, commit.name, self._text_file_name)
+ logger.warn(log_message)
+ raise ExportError(log_message)
+
+ @staticmethod
+ def _write_cells(file_handle, file_type, database_text_encoding, page_type, cells, operation):
+
+ """
+
+ This function will write the list of cells sent in to the file handle specified, including
+ the metadata regarding the file type, page type, and operation.
+
+ Note: Since we are writing out to text, all values are written as strings.
+
+ :param file_handle:
+ :param file_type:
+ :param database_text_encoding:
+ :param page_type:
+ :param cells:
+ :param operation:
+
+ :return:
+
+ """
+
+ base_string = "File Type: {} Version Number: {} Page Version Number: {} Source: {} " \
+ "Page Number: {} Location: {} Operation: {} File Offset: {}"
+ for cell in cells:
+ preface = base_string.format(file_type, cell.version_number, cell.page_version_number, cell.source,
+ cell.page_number, cell.location, operation, cell.file_offset)
+ row_values = stringify_cell_record(cell, database_text_encoding, page_type)
+ file_handle.write(preface + " " + row_values + ".\n")
diff --git a/sqlite_dissect/export/xlsx_export.py b/sqlite_dissect/export/xlsx_export.py
new file mode 100644
index 0000000..d8c9c8f
--- /dev/null
+++ b/sqlite_dissect/export/xlsx_export.py
@@ -0,0 +1,337 @@
+from logging import getLogger
+from openpyxl import Workbook
+from os import rename
+from os.path import exists
+from os.path import sep
+from uuid import uuid4
+from sqlite_dissect.constants import ILLEGAL_XML_CHARACTER_PATTERN
+from sqlite_dissect.constants import LOGGER_NAME
+from sqlite_dissect.constants import PAGE_TYPE
+from sqlite_dissect.constants import UTF_8
+from sqlite_dissect.exception import ExportError
+
+"""
+
+xlsx_export.py
+
+This script holds the objects used for exporting results of the SQLite carving framework to xlsx files.
+
+This script holds the following object(s):
+CommitXlsxExporter(object)
+
+"""
+
+
+class CommitXlsxExporter(object):
+
+ def __init__(self, export_directory, file_name):
+ self._workbook = Workbook(write_only=True)
+ self._xlsx_file_name = export_directory + sep + file_name
+ self._sheets = {}
+ self._long_sheet_name_translation_dictionary = {}
+
+ def __enter__(self):
+
+ # Check if the file exists and if it does rename it
+ if exists(self._xlsx_file_name):
+
+ # Generate a uuid to append to the file name
+ new_file_name_for_existing_file = self._xlsx_file_name + "-" + str(uuid4())
+
+ # Rename the existing file
+ rename(self._xlsx_file_name, new_file_name_for_existing_file)
+
+ log_message = "File: {} already existing when creating the file for commit xlsx exporting. The " \
+ "file was renamed to: {} and new data will be written to the file name specified."
+ log_message = log_message.format(self._xlsx_file_name, new_file_name_for_existing_file)
+ getLogger(LOGGER_NAME).debug(log_message)
+
+ return self
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ self._workbook.save(self._xlsx_file_name)
+ log_message = "Saving file {} after xlsx export."
+ log_message = log_message.format(self._xlsx_file_name)
+ getLogger(LOGGER_NAME).debug(log_message)
+
+ def write_commit(self, master_schema_entry, commit):
+
+ """
+
+ This function writes the commit passed in to a sheet in the xlsx workbook for the given master schema entry.
+
+ Note: This function only writes the commit record if the commit record was updated.
+
+ :param master_schema_entry:
+ :param commit:
+
+ :return:
+
+ """
+
+ if not commit.updated:
+ return
+
+ logger = getLogger(LOGGER_NAME)
+
+ """
+
+ In xlsx files, there is a limit of 31 characters on the length of a sheet name. The openpyxl library also
+ checks for this use case and, if it finds a sheet name longer than 31 characters, raises an exception.
+ Therefore, we check that here and accommodate that use case when it occurs.
+
+ This is done by maintaining a dictionary that maps commit names longer than 31 characters to a sheet name
+ based on the commit name that is within the character limit. If a commit name is longer than 31 characters,
+ all characters past 30 are truncated and an integer in the range of 0 to 9 is appended, depending
+ on the number of collisions that occur for similar commit names.
+
+ Note: There needs to be a better way to distinguish between similar commit names; if there are more than 10
+ names that are similar in the first 30 characters, an exception will be raised. Right now a maximum of 10
+ similar names is supported (0 to 9).
+
+ """
+
+ # Setup the name postfix increment counter
+ name_postfix_increment = 0
+
+ # Set the sheet name to be the commit name
+ sheet_name = commit.name
+
+ # Check if the sheet name is greater than 31 characters
+ if len(sheet_name) > 31:
+
+ # Check if the sheet name is already in the dictionary
+ if sheet_name in self._long_sheet_name_translation_dictionary:
+
+ # Set it to the name already made for it from a previous call
+ sheet_name = self._long_sheet_name_translation_dictionary[sheet_name]
+
+ # The sheet name was not already in the dictionary so we need to make a new name
+ else:
+
+ # Continue while we are between 0 and 9
+ while name_postfix_increment < 10:
+
+ # Create the truncated sheet name from the first 30 characters of the sheet name and name postfix
+ truncated_sheet_name = sheet_name[:30] + str(name_postfix_increment)
+
+ # Check if the name does not already exist in the dictionary
+ if truncated_sheet_name not in self._long_sheet_name_translation_dictionary:
+
+ # Add the sheet name and truncated sheet name into the dictionary
+ self._long_sheet_name_translation_dictionary[sheet_name] = truncated_sheet_name
+
+ # Set the sheet name
+ sheet_name = truncated_sheet_name
+
+ # Log a debug message for the truncation of the commit name as a sheet name
+ log_message = "Commit name: {} was truncated to: {} since it had a length of {} characters " \
+ "which is greater than the 31 allowed characters for a sheet name."
+ log_message = log_message.format(commit.name, sheet_name, len(commit.name))
+ logger.debug(log_message)
+
+ # Break from the while loop
+ break
+
+ # The name already exists
+ else:
+
+ # Increment the name postfix counter
+ name_postfix_increment += 1
+
+ # Raise an exception if the name postfix increment counter reached 10
+ if name_postfix_increment == 10:
+ log_message = "Max number of allowed (10) increments reached for renaming the sheet with " \
+ "original name: {} for page type: {} due to having a length of {} characters " \
+ "which is greater than the 31 allowed characters while writing to xlsx file name: {}."
+ log_message = log_message.format(commit.name, commit.page_type, len(commit.name),
+ self._xlsx_file_name)
+ logger.warn(log_message)
+ raise ExportError(log_message)
+
+ sheet = self._sheets.get(sheet_name)
+ write_headers = False
+
+ if not sheet:
+ sheet = self._workbook.create_sheet(sheet_name)
+ self._sheets[sheet_name] = sheet
+ write_headers = True
+
+ """
+
+ Below we have to account for how the pages are stored.
+
+ For the table master schema entry row type:
+ 1.) If the table is not a "without rowid" table, it will be stored on a table b-tree page with
+ row ids.
+ 2.) If the table is a "without rowid" table, it will be stored on an index b-tree page with no
+ row ids.
+
+ For the index master schema entry row type:
+ 1.) It will be stored on an index b-tree page with no row ids.
+
+ The commit object handles this by having a page type to make this distinction easier. Therefore, we only
+ need to check on the page type here.
+
+ """
+
+ column_headers = []
+ if write_headers:
+ column_headers.extend(["File Source", "Version", "Page Version", "Cell Source", "Page Number",
+ "Location", "Operation", "File Offset"])
+
+ if commit.page_type == PAGE_TYPE.B_TREE_INDEX_LEAF:
+
+ """
+
+ Note: The index master schema entries are currently not fully parsed and therefore we do not have
+ column definitions in order to derive the column names from.
+
+ """
+
+ sheet.append(column_headers)
+
+ CommitXlsxExporter._write_cells(sheet, commit.file_type, commit.database_text_encoding, commit.page_type,
+ commit.added_cells.values(), "Added")
+ CommitXlsxExporter._write_cells(sheet, commit.file_type, commit.database_text_encoding, commit.page_type,
+ commit.updated_cells.values(), "Updated")
+ CommitXlsxExporter._write_cells(sheet, commit.file_type, commit.database_text_encoding, commit.page_type,
+ commit.deleted_cells.values(), "Deleted")
+ CommitXlsxExporter._write_cells(sheet, commit.file_type, commit.database_text_encoding, commit.page_type,
+ commit.carved_cells.values(), "Carved")
+
+ elif commit.page_type == PAGE_TYPE.B_TREE_TABLE_LEAF:
+
+ if write_headers:
+ column_headers.append("Row ID")
+ column_headers.extend([column_definition.column_name
+ for column_definition in master_schema_entry.column_definitions])
+ sheet.append(column_headers)
+
+ # Sort the added, updated, and deleted cells by the row id
+ sorted_added_cells = sorted(commit.added_cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id)
+ CommitXlsxExporter._write_cells(sheet, commit.file_type, commit.database_text_encoding, commit.page_type,
+ sorted_added_cells, "Added")
+ sorted_updated_cells = sorted(commit.updated_cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id)
+ CommitXlsxExporter._write_cells(sheet, commit.file_type, commit.database_text_encoding, commit.page_type,
+ sorted_updated_cells, "Updated")
+ sorted_deleted_cells = sorted(commit.deleted_cells.values(), key=lambda b_tree_cell: b_tree_cell.row_id)
+ CommitXlsxExporter._write_cells(sheet, commit.file_type, commit.database_text_encoding, commit.page_type,
+ sorted_deleted_cells, "Deleted")
+
+ # We will not sort the carved cells since row ids are not deterministic even if parsed
+ CommitXlsxExporter._write_cells(sheet, commit.file_type, commit.database_text_encoding, commit.page_type,
+ commit.carved_cells.values(), "Carved")
+
+ else:
+
+ log_message = "Invalid commit page type: {} found for xlsx export on master " \
+ "schema entry name: {} while writing to xlsx file name: {}."
+ log_message = log_message.format(commit.page_type, commit.name, self._xlsx_file_name)
+ logger.warn(log_message)
+ raise ExportError(log_message)
+
+ @staticmethod
+ def _write_cells(sheet, file_type, database_text_encoding, page_type, cells, operation):
+
+ """
+
+ This function will write the list of cells sent in to the sheet specified including the metadata regarding
+ to the file type, page type, and operation.
+
+ Note: The types of the data in the values can prove to be an issue here. We want to write the value out as
+ a string similarly as the text and csv outputs do for example even though it may contain invalid
+ characters. When data is sent into the openpyxl library to be written to the xml xlsx, if it is a
+ string, it is encoded into the default encoding and then checked for xml illegal characters that may
+ pose an issue when written to the xml. In order to properly check the values and write them accordingly
+ through the openpyxl library we address the following use cases for the value in order:
+ 1.) If the value is a bytearray (most likely originally a blob object) or a string value, we want to
+ write the value as a string. However, in order to do this for blob objects or strings that may
+ have a few bad characters in them from carving, we need to do our due diligence and make sure
+ there are no bad unicode characters and no xml illegal characters that may cause issues with
+ writing to the xlsx. In order to do this we do the following:
+ a.) We first convert the value to string if the affinity was not text, otherwise we decode
+ the value in the database text encoding. When we decode using the database text encoding,
+ we specify to "replace" characters it does not recognize in order to compensate for carved
+ rows.
+ b.) We then test encoding it to UTF-8.
+ i.) If the value successfully encodes as UTF-8 nothing is done further for this step.
+ ii.) If the value throws an exception encoding, we have illegal unicode characters in the
+ string that need to be addressed. In order to escape these, we decode the string
+ as UTF-8 using the "replace" method to replace any illegal unicode characters
+ with '\ufffd' and set this back as the value.
+ c.) After we have successfully set the value back to a UTF-8 compliant value, we need to check
+ the value for xml illegal characters. If any of these xml illegal characters are found,
+ they are replaced with a space. This behaviour may differ from how values are output
+ into text or csv, since this is being written to xml and additional rules apply for certain
+ characters, so differences may be seen between the xlsx output and text/csv output in
+ reference to xml illegal characters.
+ d.) After all the illegal characters are removed, due to the way openpyxl determines data types
+ of particular cells, if a cell starts with "=", it is determined to be a formula and set as
+ that in the data type field for that cell. This causes issues when opening the file in excel.
+ Microsoft Excel recommends prefacing the string with a single quote character, however,
+ this only seems to be within Excel itself. You can specify the data type of the cell in
+ openpyxl, but not in the write-only mode that is being used here. In order to work around
+ this, we check if the first character of a string or bytearray is a "=" character and preface
+ that string with a space. There may be better ways to handle this such as not using the
+ write-only mode.
+ Note: Additionally to the "=" character, the "-" character has similar issues in excel.
+ However, openpyxl explicitly checks on the "=" character being the first character
+ and setting that cell to a formula and does not handle the use case of a cell starting
+ with the "-" character, so this use case is ignored.
+ 2.) If the value does not fall in one of the above use cases, we leave it as is and write it to the
+ xlsx without any modifications.
+
+ Note: If the value is None, we leave it as None. We used to update the None value with the string "NULL"
+ since issues could be seen when carving cells where the value is None not because it was NULL originally
+ in the database, but because it was unable to be parsed out when it may have actually had a value (when
+ it was truncated). Distinction is needed between these two use cases.
+
+ Note: It was noticed that blob objects are typically detected as isinstance of str here and strings as
+ bytearray objects. Why exactly blob objects come out as str objects needs to be
+ investigated.
+
+ Note: Comparisons should be done on how other applications work with different database text encodings in
+ reference to their output.
+
+ Note: The decoding of the value in the database text encoding should only specify replace on a carved entry.
+
+ :param sheet:
+ :param file_type:
+ :param database_text_encoding:
+ :param page_type:
+ :param cells:
+ :param operation:
+
+ :return:
+
+ """
+
+ for cell in cells:
+ cell_record_column_values = []
+ for record_column in cell.payload.record_columns:
+ serial_type = record_column.serial_type
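+ # Serial types >= 13 that are odd denote TEXT values in the SQLite record format (even types >= 12 are BLOBs).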
+ text_affinity = serial_type >= 13 and serial_type % 2 == 1
+ value = record_column.value
+ if isinstance(value, (bytearray, str)):
+ if len(value) == 0 and isinstance(value, bytearray):
+ value = None
+ else:
+ value = value.decode(database_text_encoding, "replace") if text_affinity else str(value)
+ try:
+ value.encode(UTF_8)
+ except UnicodeDecodeError:
+ value = value.decode(UTF_8, "replace")
+ value = ILLEGAL_XML_CHARACTER_PATTERN.sub(" ", value)
+ if value.startswith("="):
+ value = ' ' + value
+ cell_record_column_values.append(value)
+
+ row = [file_type, cell.version_number, cell.page_version_number, cell.source, cell.page_number,
+ cell.location, operation, cell.file_offset]
+ if page_type == PAGE_TYPE.B_TREE_TABLE_LEAF:
+ row.append(cell.row_id)
+ row.extend(cell_record_column_values)
+
+ sheet.append(row)
diff --git a/sqlite_dissect/file/README.md b/sqlite_dissect/file/README.md
new file mode 100644
index 0000000..d7b81df
--- /dev/null
+++ b/sqlite_dissect/file/README.md
@@ -0,0 +1,139 @@
+
+# sqlite_dissect.file
+
+This package will control parsing and access to all (supported) sqlite files including the
+database, rollback journal, and wal.
+
+- file_handle.py
+- header.py
+- utilities.py
+- version.py
+- version_parser.py
+
+TODO items for the "file" package:
+
+- [ ] Finish UML class diagrams.
+
+
+
+### file_handle.py
+
+This script holds the file handle for file objects to be worked with in relation to the database, wal, journal and other
+supported file types specified in the FILE_TYPE file types list.
+
+This script holds the following object(s):
+- FileHandle(object)
+
+
+```mermaid
+%%{init: { "theme": "dark" }}%%
+classDiagram
+ class FileHandle {
+ -_logger
+ -_database_text_encoding
+ +file_type
+ +file_object
+ +file_externally_controlled
+ +file_size
+ +header
+ +__init__(self, file_type, file_identifier, database_text_encoding=None, file_size=None)
+ +__repr__(self)
+ +__str__(self)
+ +stringify(self, padding="", print_header=True)
+ +database_text_encoding(self)
+ +database_text_encoding(self, database_text_encoding)
+ +close(self)
+ +read_data(self, offset, number_of_bytes)
+ }
+```
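+
+A minimal usage sketch based on the constructor and methods shown in the diagram above (the database path is a
+placeholder and FILE_TYPE is assumed to come from sqlite_dissect.constants, as it does elsewhere in this project):
+
+```python
+from sqlite_dissect.constants import FILE_TYPE
+from sqlite_dissect.file.file_handle import FileHandle
+
+# Open a SQLite database file and read its 100-byte header area on demand.
+file_handle = FileHandle(FILE_TYPE.DATABASE, "example.sqlite")
+header_bytes = file_handle.read_data(0, 100)
+print(file_handle.file_size)
+file_handle.close()  # close() must be called explicitly (see the TODO notes below)
+```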
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+ ##### FileHandle Class:
+ - [ ] Handle the encoding differently (in particular the WAL file)?
+ - [ ] Investigate a better way of cleaning up the file object other than having to explicitly call close.
+ - [ ] The lock byte page is not implemented yet and therefore databases >= 1GB will fail to open.
+ - [ ] Investigate if lock byte pages affect other SQLite file types such as WAL, journal, etc. at all.
+ - [ ] Handle exceptions that may be raised from creating headers and reading data better.
+ - [ ] Split the read_data function into separate read and seek functions?
+
+
+
+### header.py
+
+This script holds an abstract class for file header objects to extend and inherit from. File headers such as that
+of the wal, journal, and database file headers will extend this class.
+
+>Note:
+>
+> The database file header is the same as the file header for the sqlite database. However, a file like the wal
+> file has its own file header that is not related to the actual database information and, depending on how
+> many commits include the first page, may contain many database headers.
+
+This script holds the following object(s):
+- SQLiteHeader(object)
+
+
+TODO:
+- [ ] Documentation improvements.
+ ##### SQLiteHeader Class:
+ - [ ] Investigate if there is a correct way to enforce class variables to subclasses.
+
+
+### utilities.py
+This script holds utility functions for dealing with the version classes rather than more general utility methods.
+
+This script holds the following function(s):
+- validate_page_version_history(version_history)
+
+
+TODO:
+- [ ] Documentation improvements.
+
+
+
+### version.py
+This script holds the superclass objects used for parsing the database and write ahead log.
+
+This script holds the following object(s):
+- Version(object)
+
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+- [ ] Investigate if there is a correct way to enforce class variables to subclasses.
+ ##### Version Class:
+ - [ ] Better exception handling when creating objects such as pages, etc.
+ - [ ] Incorporate a get_page function?
+ - [ ] Improve the (freelist/pointer map/master schema) page lists by making dictionaries?
+ - [ ] Have a way to parse and store pages in the object itself?
+ - [ ] get_b_tree_root_page: Check to make sure it is only a root page specified by the master schema or 1.
+ - [ ] Document how the database_size_in_pages field is generated across different files and versions.
+ - [ ] Check that rollback journals update the version valid for number and file change counter >= 3.7.0.
+ - [ ] Have the database/version implement the commit record interface and rename it? Rename version?
+ - [ ] get_b_tree_root_page: Check if stored in memory for the version and if so return it instead of parsing.
+
+
+
+### version_parser.py
+
+This script holds the objects for parsing through the version history for master schema entries. This can be used
+for retrieving cells (records), carving, signature generation, etc..
+
+This script holds the following object(s):
+- VersionParser(object)
+
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+- [ ] Handle exceptions that may be raised from creating and working with objects better.
+ ##### VersionParser Class:
+ - [ ] Support the same master schema entry being removed and then re-added (Keep in mind row id).
+ - [ ] How to handle master schema entries not found in specified versions (warning currently raised)?
+ - [ ] Support for virtual table modules of master schema entry table type (warning currently raised).
+ - [ ] Support for "without rowid" tables (index b-tree pages) (warning currently raised).
+ - [ ] Investigate issues with same rows in index b-tree leaf pages that might get removed.
+ - [ ] Either transition or also put the page_type field in the master schema entry.
diff --git a/sqlite_dissect/file/__init__.py b/sqlite_dissect/file/__init__.py
new file mode 100644
index 0000000..99baad0
--- /dev/null
+++ b/sqlite_dissect/file/__init__.py
@@ -0,0 +1,11 @@
+
+"""
+
+__init__.py
+
+This init script will initialize any needed logic for this package.
+
+This package will control parsing and access to all (supported) sqlite files including the
+database, rollback journal, and wal.
+
+"""
diff --git a/sqlite_dissect/file/database/README.md b/sqlite_dissect/file/database/README.md
new file mode 100644
index 0000000..b8976d8
--- /dev/null
+++ b/sqlite_dissect/file/database/README.md
@@ -0,0 +1,163 @@
+
+# sqlite_dissect.file.database
+
+This package will control parsing and access to the SQLite database files.
+
+- database.py
+- header.py
+- page.py
+- payload.py
+- utilities.py
+
+TODO items for the "database" package:
+
+- [ ] Finish UML class diagrams.
+
+
+
+### database.py
+This script holds the objects used for parsing the database file.
+
+This script holds the following object(s):
+- Database(Version)
+
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+- [ ] Investigate where a database file has empty space beyond the page size (wal checkpoints were set).
+ ##### Database Class:
+ - [ ] Better exception handling when creating objects such as pages, etc.
+ - [ ] Check the use case where the database size in pages is 0 in the header and has to be calculated.
+ - [ ] Handle where the version valid for number != file change counter (warning currently thrown).
+ - [ ] Test out code with an empty database file with no schema (especially the master schema parsing).
+ - [ ] More detailed documentation on pages stored in memory. (Trade offs in speed/memory.)
+ - [ ] Check lists and dictionaries for fields before adding.
+ - [ ] The file_size arg may not be needed since it is in the file handle and may be removed
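+
+A minimal usage sketch based on the Database constructor signature in database.py (the file path is a placeholder):
+
+```python
+from sqlite_dissect.file.database.database import Database
+
+# Parse the database file; pages are only kept in memory if store_in_memory is set to True.
+database = Database("example.sqlite", store_in_memory=False, strict_format_checking=True)
+print(database.database_size_in_pages)
+print(database.updated_page_numbers)
+```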
+
+
+
+### header.py
+This script holds the header objects used for parsing the header of the database file structure from the root page.
+
+This script holds the following object(s):
+- DatabaseHeader(SQLiteHeader)
+
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+- [ ] Finish try/except exception handling for struct.error and ord in classes.
+ ##### DatabaseHeader Class:
+ - [ ] Document the database size in pages is going to be 0 if < version 3.7.0 for calling classes.
+ - [ ] Investigate why the sqlite version number is 0 in some sqlite files.
+ - [ ] Figure a way to determine the number of pages and version number for a suspected empty schema.
+ ##### BTreePageHeader Class:
+ - [ ] The contains_sqlite_database_header attribute should apply to table b-trees, not all b-trees.
+ - [ ] The root_page_only_md5_hex_digest attribute should apply to table b-trees, not all b-trees.
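+
+A minimal sketch of parsing the 100-byte database header directly from a file (the path is a placeholder;
+SQLITE_DATABASE_HEADER_LENGTH is the header length constant used by this project):
+
+```python
+from sqlite_dissect.constants import SQLITE_DATABASE_HEADER_LENGTH
+from sqlite_dissect.file.database.header import DatabaseHeader
+
+# Read the header bytes from the start of the file and parse them.
+sqlite_file = open("example.sqlite", "rb")
+database_header_bytes = sqlite_file.read(SQLITE_DATABASE_HEADER_LENGTH)
+sqlite_file.close()
+
+database_header = DatabaseHeader(database_header_bytes)
+print(database_header.stringify())
+```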
+
+
+
+### page.py
+This script holds the Page and Cell related objects for parsing out the different types of SQLite pages in the
+SQLite database file. This also includes freeblock and fragment related objects.
+
+This script holds the following object(s):
+Page(object)
+OverflowPage(Page)
+FreelistTrunkPage(Page)
+FreelistLeafPage(Page)
+PointerMapPage(Page)
+PointerMapEntry(object)
+BTreePage(Page)
+TableInteriorPage(BTreePage)
+TableLeafPage(BTreePage)
+IndexInteriorPage(BTreePage)
+IndexLeafPage(BTreePage)
+BTreeCell(object)
+TableInteriorCell(BTreeCell)
+TableLeafCell(BTreeCell)
+IndexInteriorCell(BTreeCell)
+IndexLeafCell(BTreeCell)
+Freeblock(BTreeCell)
+Fragment(BTreeCell)
+
+>Note: In some places, like with unallocated data on the page, it was decided not to store this data in memory
+> but to pull it from the file on demand and/or calculate information from it on demand if needed. This was done
+> to prevent the memory used by this program from becoming bloated with unneeded data.
+
+Assumptions:
+1. OverflowPage: All overflow pages are replaced in a chain on modification. This assumes that whenever a cell is
+ modified, even if the content of the overflow portion does not change, the whole cell including
+ overflow needs to be replaced due to the way the cells are stored in SQLite.
+
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+- [ ] Finish try/except exception handling for struct.error and ord in classes.
+- [ ] Replace version_interface with a more appropriately named variable.
+- [ ] Investigate if there is a correct way to enforce class variables to subclasses.
+- [ ] Calculation for overflow across the b-tree pages could be pulled out to condense code or for use with carving.
+- [ ] Retrieval of cells on demand as well as other fields should be analyzed for better memory handling.
+- [ ] Research the documentation on how it says certain things are done with freelists for backwards compatibility.
+- [ ] Figure out a better way to read out overflow content on demand in regards to payloads/records.
+- [ ] Have an iterator for overflow pages in table leaf and index b-tree pages.
+ ##### FreelistTrunkPage Class:
+ - [ ] Make sure a freelist trunk page can be updated without updating following freelist pages.
+ ##### PointerMapPage Class:
+ - [ ] See documentation in class regarding unallocated space in pointer maps that may be carvable.
+ ##### TableInteriorPage Class:
+ - [ ] Verify that the right-most pointer must always exist.
+ ##### IndexInteriorPage Class:
+ - [ ] Verify that the right-most pointer must always exist.
+ ##### BTreeCell Class:
+ - [ ] Cells with payloads do not have overflow calculated in their md5 hash. Should this be changed?
+ - [ ] Rename start_offset to just offset (and in other objects as well)?
+ ##### TableInteriorCell Class:
+ - [ ] Verify that the left child pointer must always exist.
+ ##### IndexInteriorCell Class:
+ - [ ] Verify that the left child pointer must always exist.
+
+
+
+### payload.py
+This script holds the objects used for parsing payloads from the cells in SQLite b-tree pages for
+index leaf, index interior, and table leaf. (Table Interior pages do not have payloads in their cells.)
+
+This script holds the following object(s):
+Payload(object)
+Record(Payload)
+RecordColumn(object)
+
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+ ##### Record Class:
+ - [ ] Incorporate absolute offsets.
+ - [ ] Use \_\_slots\_\_ or some other way to reduce memory since many of these objects will be created.
+
+
+
+### utilities.py
+This script holds utility functions for dealing with database specific objects such as pages rather than more general
+utility methods.
+
+This script holds the following function(s):
+aggregate_leaf_cells(b_tree_page, accounted_for_cell_md5s=None, records_only=False)
+create_pointer_map_pages(version, database_size_in_pages, page_size)
+get_maximum_pointer_map_entries_per_page(page_size)
+get_page_numbers_and_types_from_b_tree_page(b_tree_page)
+get_pages_from_b_tree_page(b_tree_page)
+
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] aggregate_leaf_cells: Investigate ways of making this faster like with intersections of sets.
+- [ ] aggregate_leaf_cells: Check if not using accounted for cell md5s if not specified speeds the function up.
+- [ ] aggregate_leaf_cells: Investigate how index b-tree pages work with fields in interior vs leaf b-tree pages.
+- [ ] aggregate_leaf_cells: Account for "without rowid" tables (where they are stored on index b-tree pages).
+- [ ] create_pointer_map_pages: Handle exceptions that may occur if the page is not a pointer map page.
+- [ ] get_pages_from_b_tree_page: Check for duplicates in dictionary when adding?
+- [ ] get_page_numbers_and_types_from_b_tree_page: Check for duplicates in dictionary when adding?
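+
+As an illustration of the pointer map geometry these helpers work with (a sketch of the arithmetic only, not the
+project's implementation): each pointer map entry in the SQLite format is 5 bytes, a 1-byte type followed by a
+4-byte big-endian parent page number, so a page holds at most page_size // 5 entries.
+
+```python
+POINTER_MAP_ENTRY_LENGTH = 5  # 1 type byte + 4-byte big-endian parent page number
+
+page_size = 4096  # example page size
+maximum_entries = page_size // POINTER_MAP_ENTRY_LENGTH
+print(maximum_entries)  # 819
+```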
diff --git a/sqlite_dissect/file/database/__init__.py b/sqlite_dissect/file/database/__init__.py
new file mode 100644
index 0000000..56764a4
--- /dev/null
+++ b/sqlite_dissect/file/database/__init__.py
@@ -0,0 +1,10 @@
+
+"""
+
+__init__.py
+
+This init script will initialize any needed logic for this package.
+
+This package will control parsing and access to the SQLite database files.
+
+"""
diff --git a/sqlite_dissect/file/database/database.py b/sqlite_dissect/file/database/database.py
new file mode 100644
index 0000000..f440394
--- /dev/null
+++ b/sqlite_dissect/file/database/database.py
@@ -0,0 +1,367 @@
+from copy import copy
+from warnings import warn
+from sqlite_dissect.constants import BASE_VERSION_NUMBER
+from sqlite_dissect.constants import FILE_TYPE
+from sqlite_dissect.constants import FIRST_FREELIST_TRUNK_PAGE_INDEX
+from sqlite_dissect.constants import FIRST_FREELIST_TRUNK_PARENT_PAGE_NUMBER
+from sqlite_dissect.constants import SQLITE_3_7_0_VERSION_NUMBER
+from sqlite_dissect.constants import SQLITE_MASTER_SCHEMA_ROOT_PAGE
+from sqlite_dissect.exception import DatabaseParsingError
+from sqlite_dissect.file.database.page import FreelistTrunkPage
+from sqlite_dissect.file.database.utilities import create_pointer_map_pages
+from sqlite_dissect.file.file_handle import FileHandle
+from sqlite_dissect.file.schema.master import MasterSchema
+from sqlite_dissect.file.version import Version
+
+"""
+
+database.py
+
+This script holds the objects used for parsing the database file.
+
+This script holds the following object(s):
+Database(Version)
+
+"""
+
+
+class Database(Version):
+
+ def __init__(self, file_identifier, store_in_memory=False, file_size=None, strict_format_checking=True):
+
+ """
+
+ Constructor. This constructor initializes this object.
+
+ :param file_identifier: str The full file path to the file to be opened or the file object.
+ :param store_in_memory: boolean Tells this class to store its particular version information in memory or not.
+ :param file_size: int Optional parameter to supply the file size.
+ :param strict_format_checking: boolean Specifies if the application should exit if structural validations fail.
+
+ """
+
+ """
+
+ Note: We pass the file name and file object to the file handle and let that do any needed error checking
+ for us.
+
+ """
+
+ database_file_handle = FileHandle(FILE_TYPE.DATABASE, file_identifier, file_size=file_size)
+ super(Database, self).__init__(database_file_handle, BASE_VERSION_NUMBER,
+ store_in_memory, strict_format_checking)
+
+ """
+
+ Retrieve the database header from the file handle.
+
+ """
+
+ self._database_header = self.file_handle.header
+
+ """
+
+ Make sure the database size in pages is not 0. If this occurs, the version has to be prior to 3.7.0. If the
+ size is 0 and the version is < 3.7.0, we set the database size in pages to the calculated number of pages
+ computed by dividing the file size by the page size. If the version is >= 3.7.0, we raise an exception.
+
+ If the database size in pages is not 0, there is still a use case that could cause the database size in pages
+ to be incorrect. This is when the version valid for number does not match the file change counter. Versions
+ before 3.7.0 did not know to update the database size in pages, but also did not know to update the version
+ valid for number. Only the change counter was updated. Therefore, in the use case where a file could be made
+ with a version >= 3.7.0 where the database size in pages is set as well as the version valid for number but
+ then closed down, opened with a SQLite driver version < 3.7.0 and modified, the version valid for number
+ would not match the change counter, resulting in what could possibly be a bad database size in pages.
+
+ Note: If the file is opened back up in a version >= 3.7.0 after being opened in a previous version, the
+ database size in pages and version valid for number are set correctly again along with the file change
+ counter on the first modification to the database. It is important to note this was only tested using
+ WAL mode and the base database file remained with the incorrect information until the WAL updated it
+ either at a checkpoint or file closure. Rollback journals are assumed to also update this but have not
+ been observed as of yet.
+
+ """
+
+ # The database header size in pages is not set
+ if self.database_header.database_size_in_pages == 0:
+
+ log_message = "Database header for version: {} specifies a database size in pages of 0 for " \
+ "sqlite version: {}."
+ log_message = log_message.format(self.version_number, self.database_header.sqlite_version_number)
+ self._logger.info(log_message)
+
+ if self.database_header.sqlite_version_number >= SQLITE_3_7_0_VERSION_NUMBER:
+ log_message = "The database header database size in pages is 0 when the sqlite version: {} is " \
+ "greater or equal than 3.7.0 in version: {} and should be set."
+ log_message = log_message.format(self.database_header.sqlite_version_number, self.version_number)
+ self._logger.error(log_message)
+ raise DatabaseParsingError(log_message)
+
+ # Calculate the number of pages from the file size and page size
+ self.database_size_in_pages = self.file_handle.file_size / self.page_size
+
+ # The database header size in pages is set and the version valid for number does not equal the change counter
+ elif self.database_header.version_valid_for_number != self.database_header.file_change_counter:
+
+ """
+
+ We now know that the database has been modified by a legacy version and the database size may not
+ be correct. We have to rely on calculating the page size here.
+
+ """
+
+ # Calculate the number of pages from the file size and page size
+ self.database_size_in_pages = self.file_handle.file_size / self.page_size
+
+ log_message = "Database header for version: {} specifies a database size in pages of {} but version " \
+ "valid for number: {} does not equal the file change counter: {} for sqlite " \
+ "version: {}. Setting the database size in pages to the calculated page size of: {}."
+ log_message = log_message.format(self.version_number, self.database_header.database_size_in_pages,
+ self.database_header.version_valid_for_number,
+ self.database_header.file_change_counter,
+ self.database_header.sqlite_version_number,
+ self.database_size_in_pages)
+ self._logger.warn(log_message)
+ warn(log_message, RuntimeWarning)
+
+ # The database header size in pages is set and the version valid for number does equals the change counter
+ else:
+
+ """
+
+ Check to make sure the calculated size in pages matches the database header database size in pages as
+ it should.
+
+ Note: The calculated number of pages can and has been found to be wrong in some cases where the database
+ size in pages is specified and the version valid for number equals the file change counter. It is
+ still unclear why this can occur, but in the use cases where this was seen, the database size in pages was
+ correct and the file was inflated (padded) with empty space at the end, indicating additional pages
+ when calculating the number of pages from the file size. For this reason a warning is thrown instead of an
+ exception (in the case that the version valid for number equals the file change counter and the database
+ size in pages is set).
+
+ This use case has not been seen where the database size in pages is 0 and the database size in pages
+ has to be calculated. More investigation is needed.
+
+ """
+
+ calculated_size_in_pages = self.file_handle.file_size / self.page_size
+
+ if self.database_header.database_size_in_pages != calculated_size_in_pages:
+
+ # Set the database size in pages to the database header size in pages
+ self.database_size_in_pages = self.database_header.database_size_in_pages
+
+ log_message = "Database header for version: {} specifies a database size in pages of {} but the " \
+ "calculated size in pages is {} instead for sqlite version: {}. The database size in " \
+ "pages will remain unchanged but possibly erroneous use cases may occur when parsing."
+ log_message = log_message.format(self.version_number, self.database_header.database_size_in_pages,
+ calculated_size_in_pages, self.database_header.sqlite_version_number)
+ self._logger.warn(log_message)
+ warn(log_message, RuntimeWarning)
+
+ else:
+
+ self.database_size_in_pages = self.database_header.database_size_in_pages
+
+ """
+
+ Since the main database file is the first version (version number 0) all pages are considered "updated"
+ since they are new in terms of the information retrieved from them.
+
+ The page version index will set all page numbers currently in the database pages to the version number of
+ this first version (version number 0).
+
+ """
+
+ self.updated_page_numbers = [page_index + 1 for page_index in range(self.database_size_in_pages)]
+ self.page_version_index = {page_number: self.version_number for page_number in self.updated_page_numbers}
+
+ self._logger.debug("Updated page numbers initialized as: {} in version: {}.".format(self.updated_page_numbers,
+ self.version_number))
+ self._logger.debug("Page version index initialized as: {} in version: {}.".format(self.page_version_index,
+ self.version_number))
+
+ """
+
+ Here we set up the updated b-tree page numbers. Entries will be removed from this array as we parse through
+ the file, leaving just the updated b-tree pages of the commit record at the end.
+
+ """
+
+ self.updated_b_tree_page_numbers = copy(self.updated_page_numbers)
+
+ """
+
+ Create the freelist trunk and leaf pages.
+
+ Note: If there are no freelist pages, the first freelist trunk page will be None and there will be an empty
+ array for the freelist page numbers.
+
+ """
+
+ if self.database_header.first_freelist_trunk_page_number:
+ self.first_freelist_trunk_page = FreelistTrunkPage(self,
+ self.database_header.first_freelist_trunk_page_number,
+ FIRST_FREELIST_TRUNK_PARENT_PAGE_NUMBER,
+ FIRST_FREELIST_TRUNK_PAGE_INDEX)
+
+ self.freelist_page_numbers = []
+ observed_freelist_pages = 0
+ freelist_trunk_page = self.first_freelist_trunk_page
+ while freelist_trunk_page:
+
+ # Remove it from the updated b-tree pages
+ self.updated_b_tree_page_numbers.remove(freelist_trunk_page.number)
+
+ self.freelist_page_numbers.append(freelist_trunk_page.number)
+ observed_freelist_pages += 1
+ for freelist_leaf_page in freelist_trunk_page.freelist_leaf_pages:
+ self.freelist_page_numbers.append(freelist_leaf_page.number)
+ observed_freelist_pages += 1
+ freelist_trunk_page = freelist_trunk_page.next_freelist_trunk_page
+
+ if observed_freelist_pages != self.database_header.number_of_freelist_pages:
+ log_message = "The number of observed freelist pages: {} does not match the number of freelist pages " \
+ "specified in the header: {} for version: {}."
+ log_message = log_message.format(observed_freelist_pages, self.database_header.number_of_freelist_pages,
+ self.version_number)
+ self._logger.error(log_message)
+ raise DatabaseParsingError(log_message)
+
+ """
+
+ Create the pointer map pages.
+
+ Note: If there are no pointer map pages, both the pointer map pages and pointer map page numbers will be an
+ empty array.
+
+ """
+
+ if self.database_header.largest_root_b_tree_page_number:
+ self.pointer_map_pages = create_pointer_map_pages(self, self.database_size_in_pages, self.page_size)
+ else:
+ self.pointer_map_pages = []
+
+ self.pointer_map_page_numbers = []
+ for pointer_map_page in self.pointer_map_pages:
+
+ # Remove it from the updated b-tree pages
+ self.updated_b_tree_page_numbers.remove(pointer_map_page.number)
+
+ self.pointer_map_page_numbers.append(pointer_map_page.number)
+
+ """
+
+ Create the root page of the SQLite database.
+
+ """
+
+ self._root_page = self.get_b_tree_root_page(SQLITE_MASTER_SCHEMA_ROOT_PAGE)
+
+ """
+
+ Create the master schema from the root page of the SQLite database.
+
+ Note: There is the possibility that there is no information in the master schema (i.e. a "blank" root page).
+ To check this we make sure the schema format number and database text encoding are 0 in the header.
+ A warning is already printed in the database header if this use case is detected.
+
+ In this case the master schema will double check that the root page is indeed devoid of information
+ and will have no schema entries but maintain its fields such as the master schema page numbers which
+ will be a list of just the root page such as: [1].
+
+ """
+
+ self._master_schema = MasterSchema(self, self.root_page)
+
+ # Remove the master schema pages from the updated b-tree pages (this will always include the root page number)
+ for master_schema_page_number in self.master_schema.master_schema_page_numbers:
+ self.updated_b_tree_page_numbers.remove(master_schema_page_number)
+
+ """
+
+ Since we do not check the schema format number and database text encoding in the master schema, we do that here.
+ This is due to the fact that the database header is not sent into the master schema (although if needed it could
+ retrieve it through the instance of this class sent in).
+
+ """
+
+ if len(self.master_schema.master_schema_entries) == 0:
+ if self.database_header.schema_format_number != 0 or self.database_header.database_text_encoding != 0:
+ log_message = "No master schema entries found in master schema for version: {} when the database " \
+ "schema format number was: {} and the database text encoding was: {} when both should " \
+ "be 0."
+ log_message = log_message.format(self.version_number, self.database_header.schema_format_number,
+ self.database_header.database_text_encoding)
+ self._logger.error(log_message)
+ raise DatabaseParsingError(log_message)
+
+ """
+
+ Setup the flags to report on modifications.
+
+ See the version superclass for more documentation on the setup of these flags for the Database class.
+
+ """
+
+ self.database_header_modified = True
+ self.root_b_tree_page_modified = True
+ self.master_schema_modified = True
+
+ if self.first_freelist_trunk_page:
+ self.freelist_pages_modified = True
+
+ if self.database_header.largest_root_b_tree_page_number:
+ self.pointer_map_pages_modified = True
+
+ """
+
+ If the version information is being stored in memory, parse out the pages and store them as a private variable.
+
+ """
+
+ self._pages = {}
+ if self.store_in_memory:
+ self._pages = self.pages
+
+ @Version.database_text_encoding.setter
+ def database_text_encoding(self, database_text_encoding):
+ log_message = "Database text encoding {} requested to be set on database. Operation not permitted. " \
+ "Should be set during object construction."
+ log_message = log_message.format(database_text_encoding)
+ self._logger.error(log_message)
+ raise TypeError(log_message)
+
+ def get_page_data(self, page_number, offset=0, number_of_bytes=None):
+
+ # Set the number of bytes to the rest of the page if it was not set
+ number_of_bytes = self.page_size - offset if not number_of_bytes else number_of_bytes
+
+ if offset >= self.page_size:
+ log_message = "Requested offset: {} is >= the page size: {} for page: {}."
+ log_message = log_message.format(offset, self.page_size, page_number)
+ self._logger.error(log_message)
+ raise ValueError(log_message)
+
+ if offset + number_of_bytes > self.page_size:
+ log_message = "Requested length of data: {} at offset {} to {} is greater than the page " \
+ "size: {} for page: {}."
+ log_message = log_message.format(number_of_bytes, offset, number_of_bytes + offset,
+ self.page_size, page_number)
+ self._logger.error(log_message)
+ raise ValueError(log_message)
+
+ page_offset = self.get_page_offset(page_number)
+
+ return self.file_handle.read_data(page_offset + offset, number_of_bytes)
+
+ def get_page_offset(self, page_number):
+
+ if page_number < 1 or page_number > self.database_size_in_pages:
+ log_message = "Invalid page number: {} for version: {} with database size in pages: {}."
+ log_message = log_message.format(page_number, self.version_number, self.database_size_in_pages)
+ self._logger.error(log_message)
+ raise ValueError(log_message)
+
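+ # Pages are numbered starting at 1, so page N begins at byte offset (N - 1) * page_size in the file.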
+ return (page_number - 1) * self.page_size
diff --git a/sqlite_dissect/file/database/header.py b/sqlite_dissect/file/database/header.py
new file mode 100644
index 0000000..33bd680
--- /dev/null
+++ b/sqlite_dissect/file/database/header.py
@@ -0,0 +1,404 @@
+from abc import ABCMeta
+from binascii import hexlify
+from logging import getLogger
+from re import compile
+from re import sub
+from struct import error
+from struct import unpack
+from warnings import warn
+from sqlite_dissect.constants import DATABASE_TEXT_ENCODINGS
+from sqlite_dissect.constants import INTERIOR_PAGE_HEADER_LENGTH
+from sqlite_dissect.constants import LEAF_PAGE_HEADER_LENGTH
+from sqlite_dissect.constants import LEAF_PAYLOAD_FRACTION
+from sqlite_dissect.constants import LOGGER_NAME
+from sqlite_dissect.constants import MAGIC_HEADER_STRING
+from sqlite_dissect.constants import MAGIC_HEADER_STRING_ENCODING
+from sqlite_dissect.constants import MASTER_PAGE_HEX_ID
+from sqlite_dissect.constants import MAXIMUM_EMBEDDED_PAYLOAD_FRACTION
+from sqlite_dissect.constants import MAXIMUM_PAGE_SIZE
+from sqlite_dissect.constants import MAXIMUM_PAGE_SIZE_INDICATOR
+from sqlite_dissect.constants import MAXIMUM_PAGE_SIZE_LIMIT
+from sqlite_dissect.constants import MINIMUM_EMBEDDED_PAYLOAD_FRACTION
+from sqlite_dissect.constants import MINIMUM_PAGE_SIZE_LIMIT
+from sqlite_dissect.constants import RESERVED_FOR_EXPANSION_REGEX
+from sqlite_dissect.constants import RIGHT_MOST_POINTER_LENGTH
+from sqlite_dissect.constants import RIGHT_MOST_POINTER_OFFSET
+from sqlite_dissect.constants import ROLLBACK_JOURNALING_MODE
+from sqlite_dissect.constants import SQLITE_DATABASE_HEADER_LENGTH
+from sqlite_dissect.constants import VALID_SCHEMA_FORMATS
+from sqlite_dissect.constants import WAL_JOURNALING_MODE
+from sqlite_dissect.exception import HeaderParsingError
+from sqlite_dissect.file.header import SQLiteHeader
+from sqlite_dissect.utilities import get_md5_hash
+
+"""
+
+header.py
+
+This script holds the header objects used for parsing the header of the database file structure from the root page.
+
+This script holds the following object(s):
+DatabaseHeader(SQLiteHeader)
+
+"""
+
+
+class DatabaseHeader(SQLiteHeader):
+
+ def __init__(self, database_header_byte_array):
+
+ super(DatabaseHeader, self).__init__()
+
+ logger = getLogger(LOGGER_NAME)
+
+ if len(database_header_byte_array) != SQLITE_DATABASE_HEADER_LENGTH:
+ log_message = "The database header byte array of size: {} is not the expected size of: {}."
+ log_message = log_message.format(len(database_header_byte_array), SQLITE_DATABASE_HEADER_LENGTH)
+ logger.error(log_message)
+ raise ValueError(log_message)
+
+ try:
+
+ self.magic_header_string = database_header_byte_array[0:16]
+
+ except error:
+
+ logger.error("Failed to retrieve the magic header.")
+ raise
+
+ if self.magic_header_string != MAGIC_HEADER_STRING.decode(MAGIC_HEADER_STRING_ENCODING):
+ log_message = "The magic header string is invalid."
+ logger.error(log_message)
+ raise HeaderParsingError(log_message)
+
+ try:
+
+ self.page_size = unpack(b">H", database_header_byte_array[16:18])[0]
+
+ except error:
+
+ logger.error("Failed to retrieve the page size.")
+ raise
+
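+ # A stored page size value of 1 indicates the maximum page size of 65536, which does not fit in the two-byte field.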
+ if self.page_size == MAXIMUM_PAGE_SIZE_INDICATOR:
+ self.page_size = MAXIMUM_PAGE_SIZE
+ elif self.page_size < MINIMUM_PAGE_SIZE_LIMIT:
+ log_message = "The page size: {} is less than the minimum page size limit: {}."
+ log_message = log_message.format(self.page_size, MINIMUM_PAGE_SIZE_LIMIT)
+ logger.error(log_message)
+ raise HeaderParsingError(log_message)
+ elif self.page_size > MAXIMUM_PAGE_SIZE_LIMIT:
+ log_message = "The page size: {} is greater than the maximum page size limit: {}."
+ log_message = log_message.format(self.page_size, MAXIMUM_PAGE_SIZE_LIMIT)
+ logger.error(log_message)
+ raise HeaderParsingError(log_message)
+
+ try:
+
+ self.file_format_write_version = ord(database_header_byte_array[18:19])
+
+ except TypeError:
+
+ logger.error("Failed to retrieve the file format write version.")
+ raise
+
+ if self.file_format_write_version not in [ROLLBACK_JOURNALING_MODE, WAL_JOURNALING_MODE]:
+ log_message = "The file format write version: {} is invalid.".format(self.file_format_write_version)
+ logger.error(log_message)
+ raise HeaderParsingError(log_message)
+
+ try:
+
+ self.file_format_read_version = ord(database_header_byte_array[19:20])
+
+ except TypeError:
+
+ logger.error("Failed to retrieve the file format read version.")
+ raise
+
+ if self.file_format_read_version not in [ROLLBACK_JOURNALING_MODE, WAL_JOURNALING_MODE]:
+ log_message = "The file format read version: {} is invalid.".format(self.file_format_read_version)
+ logger.error(log_message)
+ raise HeaderParsingError(log_message)
+
+ try:
+
+ self.reserved_bytes_per_page = ord(database_header_byte_array[20:21])
+
+ except TypeError:
+
+ logger.error("Failed to retrieve the reserved bytes per page.")
+ raise
+
+ if self.reserved_bytes_per_page != 0:
+ log_message = "Reserved bytes per page is not 0 but {} and is not implemented."
+ log_message = log_message.format(self.reserved_bytes_per_page)
+ logger.error(log_message)
+ raise NotImplementedError(log_message)
+
+ try:
+
+ self.maximum_embedded_payload_fraction = ord(database_header_byte_array[21:22])
+
+ except TypeError:
+
+ logger.error("Failed to retrieve the maximum embedded payload fraction.")
+ raise
+
+ if self.maximum_embedded_payload_fraction != MAXIMUM_EMBEDDED_PAYLOAD_FRACTION:
+ log_message = "Maximum embedded payload fraction: {} is not expected the expected value of: {}."
+ log_message = log_message.format(self.maximum_embedded_payload_fraction, MAXIMUM_EMBEDDED_PAYLOAD_FRACTION)
+ logger.error(log_message)
+ raise HeaderParsingError(log_message)
+
+ try:
+
+ self.minimum_embedded_payload_fraction = ord(database_header_byte_array[22:23])
+
+ except TypeError:
+
+ logger.error("Failed to retrieve the minimum embedded payload fraction.")
+ raise
+
+ if self.minimum_embedded_payload_fraction != MINIMUM_EMBEDDED_PAYLOAD_FRACTION:
+ log_message = "Minimum embedded payload fraction: {} is not expected the expected value of: {}."
+ log_message = log_message.format(self.minimum_embedded_payload_fraction, MINIMUM_EMBEDDED_PAYLOAD_FRACTION)
+ logger.error(log_message)
+ raise HeaderParsingError(log_message)
+
+ try:
+
+ self.leaf_payload_fraction = ord(database_header_byte_array[23:24])
+
+ except TypeError:
+
+ logger.error("Failed to retrieve the leaf payload fraction.")
+ raise
+
+ if self.leaf_payload_fraction != LEAF_PAYLOAD_FRACTION:
+ log_message = "Leaf payload fraction: {} is not expected the expected value of: {}."
+ log_message = log_message.format(self.leaf_payload_fraction, LEAF_PAYLOAD_FRACTION)
+ logger.error(log_message)
+ raise HeaderParsingError(log_message)
+
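+ # File change counter through database text encoding: 4-byte big-endian unsigned integers at header offsets 24 through 60.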
+ self.file_change_counter = unpack(b">I", database_header_byte_array[24:28])[0]
+ self.database_size_in_pages = unpack(b">I", database_header_byte_array[28:32])[0]
+ self.first_freelist_trunk_page_number = unpack(b">I", database_header_byte_array[32:36])[0]
+ self.number_of_freelist_pages = unpack(b">I", database_header_byte_array[36:40])[0]
+ self.schema_cookie = unpack(b">I", database_header_byte_array[40:44])[0]
+ self.schema_format_number = unpack(b">I", database_header_byte_array[44:48])[0]
+ self.default_page_cache_size = unpack(b">I", database_header_byte_array[48:52])[0]
+ self.largest_root_b_tree_page_number = unpack(b">I", database_header_byte_array[52:56])[0]
+ self.database_text_encoding = unpack(b">I", database_header_byte_array[56:60])[0]
+
+ if self.schema_format_number == 0 and self.database_text_encoding == 0:
+
+ """
+
+ Note: If the schema format number and database text encoding are both 0 then no schema or data has been
+ placed into this database file. If a schema or any data was inputted and then all tables dropped,
+ the schema format number and database text encoding would then be set. In this case the database
+ should only be 1 page. However, we have no way to determine what the size of the database page is
+ unless the version is at least 3.7.0. We could check on the SQLite version and make sure the
+ version is at least 3.7.0 and then check the database size in pages to make sure it was 1 but we
+ would have no way to handle the case if the version was not at least 3.7.0. Also, it has been
+ noticed that the SQLite version number is 0 in some database files. Until this is further
+ thought out and possible solutions are determined, we will not worry about checking that
+ the database has 1 page.
+
+ """
+
+ log_message = "Schema format number and database text encoding are 0 indicating no schema or data."
+ logger.warn(log_message)
+ warn(log_message, RuntimeWarning)
+
+ else:
+
+ if self.schema_format_number not in VALID_SCHEMA_FORMATS:
+ log_message = "Schema format number: {} not a valid schema format.".format(self.schema_format_number)
+ logger.error(log_message)
+ raise HeaderParsingError(log_message)
+
+ if self.database_text_encoding not in DATABASE_TEXT_ENCODINGS:
+ log_message = "Database text encoding: {} not a valid encoding.".format(self.database_text_encoding)
+ logger.error(log_message)
+ raise HeaderParsingError(log_message)
+
+ self.user_version = unpack(b">I", database_header_byte_array[60:64])[0]
+ self.incremental_vacuum_mode = unpack(b">I", database_header_byte_array[64:68])[0]
+
+ """
+
+ Originally a check was done that if the largest root b-tree page number existed and the database was less
+ than or equal to 2 pages in size, an exception was thrown. This was found to be wrong in the case where
+ a database file was generated initially with one page with no information in it yet. In this case (where
+ auto-vacuuming was turned on, resulting in a non-zero largest root b-tree page number) the largest root
+ b-tree page number was found to be 1. Therefore no exception is thrown if the database size in pages is 1
+ as well as the largest root b-tree page number. This left only the check of the largest root
+ b-tree page number == 2 together with the database size in pages == 2, which was decided to be an irrelevant
+ use case and removed.
+
+ Now the only thing that is checked is that if the incremental vacuum mode is set then the database header
+ largest root b-tree page number must be set. (The inverse of this is not true.)
+
+ Note: In regards to the above, this check was originally done against the database size in pages calculated
+ from the actual parsing of the SQLite file and did not originally reside in this class. After that
+ specific use case was removed, there was no reason not to move this check to the database header class.
+
+ """
+
+ if not self.largest_root_b_tree_page_number and self.incremental_vacuum_mode:
+ log_message = "The database header largest root b-tree page number was not set when the incremental " \
+ "vacuum mode was: {}."
+ log_message = log_message.format(self.incremental_vacuum_mode)
+ logger.error(log_message)
+ raise HeaderParsingError(log_message)
+
+ self.application_id = unpack(b">I", database_header_byte_array[68:72])[0]
+ self.reserved_for_expansion = database_header_byte_array[72:92]
+
+ pattern = compile(RESERVED_FOR_EXPANSION_REGEX)
+ reserved_for_expansion_hex = hexlify(self.reserved_for_expansion)
+ if not pattern.match(reserved_for_expansion_hex):
+ log_message = "Header space reserved for expansion is not zero: {}.".format(reserved_for_expansion_hex)
+ logger.error(log_message)
+ raise HeaderParsingError(log_message)
+
+ self.version_valid_for_number = unpack(b">I", database_header_byte_array[92:96])[0]
+ self.sqlite_version_number = unpack(b">I", database_header_byte_array[96:100])[0]
+
+ self.md5_hex_digest = get_md5_hash(database_header_byte_array)
+
+ def stringify(self, padding=""):
+ string = padding + "Magic Header String: {}\n" \
+ + padding + "Page Size: {}\n" \
+ + padding + "File Format Write Version: {}\n" \
+ + padding + "File Format Read Version: {}\n" \
+ + padding + "Reserved Bytes per Page: {}\n" \
+ + padding + "Maximum Embedded Payload Fraction: {}\n" \
+ + padding + "Minimum Embedded Payload Fraction: {}\n" \
+ + padding + "Leaf Payload Fraction: {}\n" \
+ + padding + "File Change Counter: {}\n" \
+ + padding + "Database Size in Pages: {}\n" \
+ + padding + "First Freelist Trunk Page Number: {}\n" \
+ + padding + "Number of Freelist Pages: {}\n" \
+ + padding + "Schema Cookie: {}\n" \
+ + padding + "Schema Format Number: {}\n" \
+ + padding + "Default Page Cache Size: {}\n" \
+ + padding + "Largest Root B-Tree Page Number: {}\n" \
+ + padding + "Database Text Encoding: {}\n" \
+ + padding + "User Version: {}\n" \
+ + padding + "Incremental Vacuum Mode: {}\n" \
+ + padding + "Application ID: {}\n" \
+ + padding + "Reserved for Expansion (Hex): {}\n" \
+ + padding + "Version Valid for Number: {}\n" \
+ + padding + "SQLite Version Number: {}\n" \
+ + padding + "MD5 Hex Digest: {}"
+ return string.format(self.magic_header_string,
+ self.page_size,
+ self.file_format_write_version,
+ self.file_format_read_version,
+ self.reserved_bytes_per_page,
+ self.maximum_embedded_payload_fraction,
+ self.minimum_embedded_payload_fraction,
+ self.leaf_payload_fraction,
+ self.file_change_counter,
+ self.database_size_in_pages,
+ self.first_freelist_trunk_page_number,
+ self.number_of_freelist_pages,
+ self.schema_cookie,
+ self.schema_format_number,
+ self.default_page_cache_size,
+ self.largest_root_b_tree_page_number,
+ self.database_text_encoding,
+ self.user_version,
+ self.incremental_vacuum_mode,
+ self.application_id,
+ hexlify(self.reserved_for_expansion),
+ self.version_valid_for_number,
+ self.sqlite_version_number,
+ self.md5_hex_digest)
+
+
+class BTreePageHeader(object):
+
+ __metaclass__ = ABCMeta
+
+ def __init__(self, page, header_length):
+
+ self.offset = 0
+ self.header_length = header_length
+
+ self.contains_sqlite_database_header = False
+
+ """
+
+ The root_page_only_md5_hex_digest is only set when the SQLite database header is detected in the page.
+
+ """
+
+ self.root_page_only_md5_hex_digest = None
+
+ first_page_byte = page[0:1]
+ if first_page_byte == MASTER_PAGE_HEX_ID:
+ self.contains_sqlite_database_header = True
+ self.root_page_only_md5_hex_digest = get_md5_hash(page[SQLITE_DATABASE_HEADER_LENGTH:])
+ self.offset += SQLITE_DATABASE_HEADER_LENGTH
+
+ self.page_type = page[self.offset:self.offset + 1]
+ self.first_freeblock_offset = unpack(b">H", page[self.offset + 1:self.offset + 3])[0]
+ self.number_of_cells_on_page = unpack(b">H", page[self.offset + 3:self.offset + 5])[0]
+ self.cell_content_offset = unpack(b">H", page[self.offset + 5:self.offset + 7])[0]
+ self.number_of_fragmented_free_bytes = ord(page[self.offset + 7:self.offset + 8])
+
+ self.md5_hex_digest = get_md5_hash(page[self.offset:self.header_length])
+
+ def __repr__(self):
+ return self.__str__().encode("hex")
+
+ def __str__(self):
+ return sub("\t", "", sub("\n", " ", self.stringify()))
+
+ def stringify(self, padding=""):
+ string = padding + "Contains SQLite Database Header: {}\n" \
+ + padding + "Root Page Only MD5 Hex Digest: {}\n" \
+ + padding + "Page Type (Hex): {}\n" \
+ + padding + "Offset: {}\n" \
+ + padding + "Length: {}\n" \
+ + padding + "First Freeblock Offset: {}\n" \
+ + padding + "Number of Cells on Page: {}\n" \
+ + padding + "Cell Content Offset: {}\n" \
+ + padding + "Number of Fragmented Free Bytes: {}\n" \
+ + padding + "MD5 Hex Digest: {}"
+ return string.format(self.contains_sqlite_database_header,
+ self.root_page_only_md5_hex_digest,
+ hexlify(self.page_type),
+ self.offset,
+ self.header_length,
+ self.first_freeblock_offset,
+ self.number_of_cells_on_page,
+ self.cell_content_offset,
+ self.number_of_fragmented_free_bytes,
+ self.md5_hex_digest)
+
+
+class LeafPageHeader(BTreePageHeader):
+
+ def __init__(self, page):
+ super(LeafPageHeader, self).__init__(page, LEAF_PAGE_HEADER_LENGTH)
+
+
+class InteriorPageHeader(BTreePageHeader):
+
+ def __init__(self, page):
+ super(InteriorPageHeader, self).__init__(page, INTERIOR_PAGE_HEADER_LENGTH)
+
+ right_most_pointer_start_offset = self.offset + RIGHT_MOST_POINTER_OFFSET
+ right_most_pointer_end_offset = right_most_pointer_start_offset + RIGHT_MOST_POINTER_LENGTH
+ self.right_most_pointer = unpack(b">I", page[right_most_pointer_start_offset:right_most_pointer_end_offset])[0]
+
+ def stringify(self, padding=""):
+ string = "\n" \
+ + padding + "Right Most Pointer: {}"
+ string = string.format(self.right_most_pointer)
+ return super(InteriorPageHeader, self).stringify(padding) + string
diff --git a/sqlite_dissect/file/database/page.py b/sqlite_dissect/file/database/page.py
new file mode 100644
index 0000000..9cafa35
--- /dev/null
+++ b/sqlite_dissect/file/database/page.py
@@ -0,0 +1,1776 @@
+from abc import ABCMeta
+from binascii import hexlify
+from logging import getLogger
+from re import sub
+from struct import unpack
+from warnings import warn
+from sqlite_dissect.constants import CELL_LOCATION
+from sqlite_dissect.constants import CELL_MODULE
+from sqlite_dissect.constants import CELL_POINTER_BYTE_LENGTH
+from sqlite_dissect.constants import CELL_SOURCE
+from sqlite_dissect.constants import FIRST_OVERFLOW_PAGE_INDEX
+from sqlite_dissect.constants import FIRST_OVERFLOW_PAGE_NUMBER_LENGTH
+from sqlite_dissect.constants import FIRST_OVERFLOW_PARENT_PAGE_NUMBER
+from sqlite_dissect.constants import FREEBLOCK_BYTE_LENGTH
+from sqlite_dissect.constants import FREELIST_HEADER_LENGTH
+from sqlite_dissect.constants import FREELIST_LEAF_PAGE_NUMBER_LENGTH
+from sqlite_dissect.constants import FREELIST_NEXT_TRUNK_PAGE_LENGTH
+from sqlite_dissect.constants import INDEX_INTERIOR_CELL_CLASS
+from sqlite_dissect.constants import INDEX_INTERIOR_PAGE_HEX_ID
+from sqlite_dissect.constants import INDEX_LEAF_CELL_CLASS
+from sqlite_dissect.constants import INDEX_LEAF_PAGE_HEX_ID
+from sqlite_dissect.constants import INTERIOR_PAGE_HEADER_CLASS
+from sqlite_dissect.constants import LEAF_PAGE_HEADER_CLASS
+from sqlite_dissect.constants import LEFT_CHILD_POINTER_BYTE_LENGTH
+from sqlite_dissect.constants import LOGGER_NAME
+from sqlite_dissect.constants import MASTER_PAGE_HEX_ID
+from sqlite_dissect.constants import NEXT_FREEBLOCK_OFFSET_LENGTH
+from sqlite_dissect.constants import OVERFLOW_HEADER_LENGTH
+from sqlite_dissect.constants import PAGE_FRAGMENT_LIMIT
+from sqlite_dissect.constants import PAGE_HEADER_MODULE
+from sqlite_dissect.constants import PAGE_TYPE
+from sqlite_dissect.constants import PAGE_TYPE_LENGTH
+from sqlite_dissect.constants import POINTER_MAP_B_TREE_NON_ROOT_PAGE_TYPE
+from sqlite_dissect.constants import POINTER_MAP_B_TREE_ROOT_PAGE_TYPE
+from sqlite_dissect.constants import POINTER_MAP_ENTRY_LENGTH
+from sqlite_dissect.constants import POINTER_MAP_FREELIST_PAGE_TYPE
+from sqlite_dissect.constants import POINTER_MAP_OVERFLOW_FIRST_PAGE_TYPE
+from sqlite_dissect.constants import POINTER_MAP_OVERFLOW_FOLLOWING_PAGE_TYPE
+from sqlite_dissect.constants import POINTER_MAP_PAGE_TYPES
+from sqlite_dissect.constants import SQLITE_DATABASE_HEADER_LENGTH
+from sqlite_dissect.constants import SQLITE_MASTER_SCHEMA_ROOT_PAGE
+from sqlite_dissect.constants import TABLE_INTERIOR_CELL_CLASS
+from sqlite_dissect.constants import TABLE_INTERIOR_PAGE_HEX_ID
+from sqlite_dissect.constants import TABLE_LEAF_CELL_CLASS
+from sqlite_dissect.constants import TABLE_LEAF_PAGE_HEX_ID
+from sqlite_dissect.constants import ZERO_BYTE
+from sqlite_dissect.exception import BTreePageParsingError
+from sqlite_dissect.exception import CellParsingError
+from sqlite_dissect.exception import PageParsingError
+from sqlite_dissect.file.database.payload import decode_varint
+from sqlite_dissect.file.database.payload import Record
+from sqlite_dissect.utilities import calculate_expected_overflow
+from sqlite_dissect.utilities import get_class_instance
+from sqlite_dissect.utilities import get_md5_hash
+
+"""
+
+page.py
+
+This script holds the Page and Cell related objects for parsing out the different types of SQLite pages in the
+SQLite database file. This also includes freeblock and fragment related objects.
+
+This script holds the following object(s):
+Page(object)
+OverflowPage(Page)
+FreelistTrunkPage(Page)
+FreelistLeafPage(Page)
+PointerMapPage(Page)
+PointerMapEntry(object)
+BTreePage(Page)
+TableInteriorPage(BTreePage)
+TableLeafPage(BTreePage)
+IndexInteriorPage(BTreePage)
+IndexLeafPage(BTreePage)
+BTreeCell(object)
+TableInteriorCell(BTreeCell)
+TableLeafCell(BTreeCell)
+IndexInteriorCell(BTreeCell)
+IndexLeafCell(BTreeCell)
+Freeblock(BTreeCell)
+Fragment(BTreeCell)
+
+Note: In some places, such as with unallocated data on the page, it was decided not to store this data in memory
+ but to pull it from the file and/or calculate information from it on demand. This was done to prevent the
+ memory used by this program from becoming bloated with unneeded data.
+
+Assumptions:
+1.) OverflowPage: All overflow pages in a chain are replaced on modification. This assumes that whenever a cell is
+ modified, the whole cell, including its overflow, needs to be replaced even if the content of the
+ overflow portion does not change, due to the way cells are stored in SQLite.
+
+"""
+
+
+class Page(object):
+
+ __metaclass__ = ABCMeta
+
+ def __init__(self, version_interface, number):
+
+ self._logger = getLogger(LOGGER_NAME)
+
+ self._version_interface = version_interface
+ self.version_number = self._version_interface.version_number
+ self.page_version_number = self._version_interface.get_page_version(number)
+ self.number = number
+ self.page_type = None
+ self.offset = self._version_interface.get_page_offset(self.number)
+ self.size = self._version_interface.page_size
+ self.md5_hex_digest = None
+ self.unallocated_space_start_offset = None
+ self.unallocated_space_end_offset = None
+
+ def __repr__(self):
+ return self.__str__().encode("hex")
+
+ def __str__(self):
+ return sub("\t", "", sub("\n", " ", self.stringify()))
+
+ def stringify(self, padding=""):
+ string = padding + "Version Number: {}\n" \
+ + padding + "Page Version Number: {}\n" \
+ + padding + "Number: {}\n" \
+ + padding + "Page Type: {}\n" \
+ + padding + "Offset: {}\n" \
+ + padding + "Size: {}\n" \
+ + padding + "MD5 Hex Digest: {}\n" \
+ + padding + "Unallocated Space Start Offset: {}\n" \
+ + padding + "Unallocated Space End Offset: {}\n" \
+ + padding + "Unallocated Space Size: {}\n" \
+ + padding + "Unallocated Content MD5 Hex Digest: {}\n" \
+ + padding + "Unallocated Content (Hex): {}"
+ return string.format(self.version_number,
+ self.page_version_number,
+ self.number,
+ self.page_type,
+ self.offset,
+ self.size,
+ self.md5_hex_digest,
+ self.unallocated_space_start_offset,
+ self.unallocated_space_end_offset,
+ self.unallocated_space_length,
+ self.unallocated_space_md5_hex_digest,
+ hexlify(self.unallocated_space))
+
+ @property
+ def unallocated_space(self):
+
+ """
+
+ This property returns the unallocated space inside this page.
+
+ :return: bytearray The byte array for unallocated space.
+
+ """
+
+ if self.unallocated_space_length == 0:
+ return bytearray()
+ else:
+ return self._version_interface.get_page_data(self.number, self.unallocated_space_start_offset,
+ self.unallocated_space_length)
+
+ @property
+ def unallocated_space_md5_hex_digest(self):
+
+ """
+
+ This property will compute the md5 hash of the unallocated space of this page and return it. This is
+ calculated when called instead of beforehand since this is a superclass and does not know where the
+ unallocated space starts and ends at the time of creation. Although this could be computed and stored the
+ first time it is called, it was decided to always compute it when called.
+
+ :return: string The hexadecimal md5 hash string.
+
+ """
+
+ return get_md5_hash(self.unallocated_space)
+
+ @property
+ def unallocated_space_length(self):
+
+ """
+
+ This property will compute the unallocated space length of this page and return it. This is calculated
+ when called instead of beforehand since this is a superclass and does not know the unallocated space
+ start and end offsets at the time of creation.
+
+ :return: int The unallocated space length.
+
+ """
+
+ # Return the length of the unallocated space on this page
+ return self.unallocated_space_end_offset - self.unallocated_space_start_offset
+
+
+class OverflowPage(Page):
+
+ def __init__(self, version_interface, number, parent_cell_page_number, parent_overflow_page_number,
+ index, payload_remaining):
+
+ super(OverflowPage, self).__init__(version_interface, number)
+
+ self.page_type = PAGE_TYPE.OVERFLOW
+
+ if payload_remaining <= 0:
+ log_message = "No payload remaining when overflow page initialized for version number: {} page number: {}."
+ log_message = log_message.format(self.version_number, self.number)
+ self._logger.error(log_message)
+ raise PageParsingError(log_message)
+
+ page = self._version_interface.get_page_data(self.number)
+
+ self.parent_cell_page_number = parent_cell_page_number
+ self.parent_overflow_page_number = parent_overflow_page_number
+ self.index = index
+ self.next_overflow_page_number = unpack(b">I", page[:OVERFLOW_HEADER_LENGTH])[0]
+
+ self.unallocated_space_start_offset = self.size
+ self.unallocated_space_end_offset = self.size
+ self.md5_hex_digest = get_md5_hash(page)
+
+ if payload_remaining <= self.size - OVERFLOW_HEADER_LENGTH:
+
+ # This was found to be the last overflow page in the chain. Make sure there are no other overflow pages.
+ if self.next_overflow_page_number:
+ log_message = "Additional overflow page number: {} found for version number: {} " \
+ "page version number: {} page number: {} when no more overflow pages were expected."
+ log_message = log_message.format(self.next_overflow_page_number, self.version_number,
+ self.page_version_number, self.number)
+ self._logger.error(log_message)
+ raise PageParsingError(log_message)
+
+ self.unallocated_space_start_offset = payload_remaining + OVERFLOW_HEADER_LENGTH
+
+ if self.next_overflow_page_number:
+
+ """
+
+ Here we make the assumption that all overflow pages have to be replaced when any overflow page in a chain
+ is updated. In other words, when an overflow chain is changed in a version, all overflow pages in that chain
+ belong to that version. This is due to the fact that all overflow pages in a chain pertain to a cell that
+ was modified, and therefore all overflow pages belonging to that record need to be reinserted even if they
+ are the same as before.
+
+ Here we check the version of the overflow page that this one points to. If the versions of the two pages
+ are different, we throw an exception.
+
+ Since overflow pages form a chain, this check is performed on the creation of each overflow page against the
+ following overflow page, if it exists.
+
+ """
+
+ next_overflow_page_version = self._version_interface.get_page_version(self.next_overflow_page_number)
+ if self.page_version_number != next_overflow_page_version:
+ log_message = "The version of the current overflow page: {} on version: {} on page: {} has points to " \
+ "a next overflow page version: {} for page: {} that has a different version."
+ log_message = log_message.format(self.page_version_number, self.version_number, self.number,
+ next_overflow_page_version, self.next_overflow_page_number)
+ self._logger.error(log_message)
+ raise PageParsingError(log_message)
+
+ def stringify(self, padding=""):
+ string = "\n" \
+ + padding + "Parent Cell Page Number: {}\n" \
+ + padding + "Parent Overflow Page Number: {}\n" \
+ + padding + "Index: {}\n" \
+ + padding + "Next Overflow Page Number: {}\n" \
+ + padding + "Content Length: {}\n" \
+ + padding + "Content (Hex): {}"
+ string = string.format(self.parent_cell_page_number,
+ self.parent_overflow_page_number,
+ self.index,
+ self.next_overflow_page_number,
+ self.content_length,
+ hexlify(self.content))
+ return super(OverflowPage, self).stringify(padding) + string
+
+ @property
+ def content(self):
+ return self._version_interface.get_page_data(self.number, OVERFLOW_HEADER_LENGTH, self.content_length)
+
+ @property
+ def content_length(self):
+ return self.unallocated_space_start_offset - OVERFLOW_HEADER_LENGTH
+
+
+class FreelistTrunkPage(Page):
+
+ def __init__(self, version_interface, number, parent_freelist_trunk_page_number, index):
+
+ super(FreelistTrunkPage, self).__init__(version_interface, number)
+
+ self.page_type = PAGE_TYPE.FREELIST_TRUNK
+
+ self.parent_freelist_trunk_page_number = parent_freelist_trunk_page_number
+ self.index = index
+
+ page = self._version_interface.get_page_data(self.number)
+
+ self.next_freelist_trunk_page_number = unpack(b">I", page[:FREELIST_NEXT_TRUNK_PAGE_LENGTH])[0]
+ self.number_of_leaf_page_pointers = unpack(b">I", page[FREELIST_NEXT_TRUNK_PAGE_LENGTH:
+ FREELIST_HEADER_LENGTH])[0]
+ self.freelist_leaf_page_numbers = []
+ self.freelist_leaf_pages = []
+ for index in range(self.number_of_leaf_page_pointers):
+ start_offset = index * FREELIST_LEAF_PAGE_NUMBER_LENGTH + FREELIST_HEADER_LENGTH
+ end_offset = start_offset + FREELIST_LEAF_PAGE_NUMBER_LENGTH
+ freelist_leaf_page_number = unpack(b">I", page[start_offset:end_offset])[0]
+
+ """
+
+ Note: Freelist leaf pages can be in commit records previous to the commit record this freelist trunk
+ page is in, or in commit records up to the main commit record version, if applicable.
+
+ """
+
+ freelist_leaf_page = FreelistLeafPage(self._version_interface, freelist_leaf_page_number,
+ self.number, index)
+
+ self.freelist_leaf_page_numbers.append(freelist_leaf_page_number)
+ self.freelist_leaf_pages.append(freelist_leaf_page)
+
+ if len(self.freelist_leaf_page_numbers) != self.number_of_leaf_page_pointers:
+ log_message = "In freelist trunk page: {} with page version: {} in version: {} found a different amount " \
+ "of freelist leaf page numbers: {} than freelist leaf page pointers: {} found on the page."
+ log_message = log_message.format(self.number, self.page_version_number, self.version_number,
+ len(self.freelist_leaf_page_numbers), self.number_of_leaf_page_pointers)
+ self._logger.error(log_message)
+ raise PageParsingError(log_message)
+
+ freelist_leaf_page_numbers_size = self.number_of_leaf_page_pointers * FREELIST_LEAF_PAGE_NUMBER_LENGTH
+ self.unallocated_space_start_offset = FREELIST_HEADER_LENGTH + freelist_leaf_page_numbers_size
+ self.unallocated_space_end_offset = self.size
+
+ self.md5_hex_digest = get_md5_hash(page)
+
+ self.next_freelist_trunk_page = None
+ if self.next_freelist_trunk_page_number:
+
+ """
+
+ Here we make the assumption that a freelist trunk page can be updated without updating the following freelist
+ trunk pages in the linked list. Since this is an "allowed" assumption, an info message is logged when this
+ happens, and once we observe it, we can then declare that it is no longer an assumption.
+
+ """
+
+ next_freelist_trunk_page_version_number = self._version_interface.get_page_version(
+ self.next_freelist_trunk_page_number)
+ if self.page_version_number > next_freelist_trunk_page_version_number:
+ log_message = "Found a freelist trunk page: {} that has page version: {} in version: {} that points " \
+ "to an earlier freelist trunk page version: {}."
+ log_message = log_message.format(self.number, self.page_version_number, self.version_number,
+ next_freelist_trunk_page_version_number)
+ self._logger.info(log_message)
+
+ self.next_freelist_trunk_page = FreelistTrunkPage(self._version_interface,
+ self.next_freelist_trunk_page_number,
+ self.number, self.index + 1)
+
+ def stringify(self, padding=""):
+ string = "\n" \
+ + padding + "Parent Freelist Trunk Page Number: {}\n" \
+ + padding + "Index: {}\n" \
+ + padding + "Next Freelist Trunk Page Number: {}\n" \
+ + padding + "Number of Leaf Page Pointers: {}\n" \
+ + padding + "Freelist Leaf Page Numbers: {}\n" \
+ + padding + "Freelist Leaf Pages length: {}"
+ string = string.format(self.parent_freelist_trunk_page_number,
+ self.index,
+ self.next_freelist_trunk_page_number,
+ self.number_of_leaf_page_pointers,
+ self.freelist_leaf_page_numbers,
+ len(self.freelist_leaf_pages))
+ for freelist_leaf_page in self.freelist_leaf_pages:
+ string += "\n" + padding + "Freelist Leaf Page:\n{}".format(freelist_leaf_page.stringify(padding + "\t"))
+ if self.next_freelist_trunk_page:
+ string += "\n" + padding \
+ + "Next Freelist Trunk Page:\n{}".format(self.next_freelist_trunk_page.stringify(padding + "\t"))
+ return super(FreelistTrunkPage, self).stringify(padding) + string
+
+
+class FreelistLeafPage(Page):
+
+ def __init__(self, version_interface, number, parent_freelist_trunk_page_number, index):
+
+ super(FreelistLeafPage, self).__init__(version_interface, number)
+
+ self.page_type = PAGE_TYPE.FREELIST_LEAF
+
+ self.parent_freelist_trunk_page_number = parent_freelist_trunk_page_number
+ self.index = index
+
+ self.unallocated_space_start_offset = 0
+ self.unallocated_space_end_offset = self.size
+
+ page = self._version_interface.get_page_data(self.number)
+ self.md5_hex_digest = get_md5_hash(page)
+
+ def stringify(self, padding=""):
+ string = "\n" \
+ + padding + "Parent Freelist Trunk Page Number: {}\n" \
+ + padding + "Index: {}"
+ string = string.format(self.parent_freelist_trunk_page_number,
+ self.index)
+ return super(FreelistLeafPage, self).stringify(padding) + string
+
+
+class PointerMapPage(Page):
+
+ def __init__(self, version_interface, number, number_of_entries):
+
+ super(PointerMapPage, self).__init__(version_interface, number)
+
+ self.page_type = PAGE_TYPE.POINTER_MAP
+
+ page = self._version_interface.get_page_data(self.number)
+
+ self.number_of_entries = number_of_entries
+
+ self.unallocated_space_start_offset = self.number_of_entries * POINTER_MAP_ENTRY_LENGTH
+ self.unallocated_space_end_offset = self.size
+
+ self.md5_hex_digest = get_md5_hash(page)
+
+ self.pointer_map_entries = []
+ for index in range(self.number_of_entries):
+
+ offset = index * POINTER_MAP_ENTRY_LENGTH
+
+ if offset >= self.size:
+ log_message = "For pointer map page: {} for page version: {} and version: {} the offset: {} " \
+ "was found to greater or equal to the page size: {} on index: {}."
+ log_message = log_message.format(self.number, self.page_version_number, self.version_number,
+ offset, self.size, index)
+ self._logger.error(log_message)
+ raise PageParsingError(log_message)
+
+ page_type = page[offset:offset + PAGE_TYPE_LENGTH]
+ if page_type == ZERO_BYTE:
+ log_message = "The page type was found to be empty for pointer map page: {} for page version: {} " \
+ "and version: {} on index: {} and offset: {}."
+ log_message = log_message.format(self.number, self.page_version_number, self.version_number,
+ index, offset)
+ self._logger.error(log_message)
+ raise PageParsingError(log_message)
+
+ elif offset + POINTER_MAP_ENTRY_LENGTH > self.size:
+ log_message = "The offset {} and pointer map length: {} go beyond the page size: {} for pointer " \
+ "map page: {} for page version: {} and version: {} on index: {}."
+ log_message = log_message.format(offset, POINTER_MAP_ENTRY_LENGTH, self.size, self.number,
+ self.page_version_number, self.version_number, index)
+ self._logger.error(log_message)
+ raise PageParsingError(log_message)
+
+ elif page_type not in POINTER_MAP_PAGE_TYPES:
+ log_message = "The page type was not recognized: {} as a valid pointer map page type for " \
+ "pointer map page: {} for page version: {} and version: {} on index: {} and offset: {}."
+ log_message = log_message.format(hexlify(page_type), self.number, self.page_version_number,
+ self.version_number, index, offset)
+ self._logger.error(log_message)
+ raise PageParsingError(log_message)
+
+ parent_page_number = unpack(b">I", page[offset + PAGE_TYPE_LENGTH:offset + POINTER_MAP_ENTRY_LENGTH])[0]
+
+ if page_type in [POINTER_MAP_B_TREE_ROOT_PAGE_TYPE, POINTER_MAP_FREELIST_PAGE_TYPE] and parent_page_number:
+ log_message = "The page type: {} has a parent page number: {} which is invalid for " \
+ "pointer map page: {} for page version: {} and version: {} on index: {} and offset: {}."
+ log_message = log_message.format(hexlify(page_type), parent_page_number, self.number,
+ self.page_version_number, self.version_number, index, offset)
+ self._logger.error(log_message)
+ raise PageParsingError(log_message)
+
+ elif page_type in [POINTER_MAP_OVERFLOW_FIRST_PAGE_TYPE, POINTER_MAP_OVERFLOW_FOLLOWING_PAGE_TYPE,
+ POINTER_MAP_B_TREE_NON_ROOT_PAGE_TYPE] and not parent_page_number:
+ log_message = "The page type: {} does not have a parent page number which is invalid for " \
+ "pointer map page: {} for page version: {} and version: {} on index: {} and offset: {}."
+ log_message = log_message.format(hexlify(page_type), self.number, self.page_version_number,
+ self.version_number, index, offset)
+ self._logger.error(log_message)
+ raise PageParsingError(log_message)
+
+ pointer_map_entry_md5_hex_digest = get_md5_hash(page[offset:offset + POINTER_MAP_ENTRY_LENGTH])
+
+ page_number = number + index + 1
+ pointer_map_entry = PointerMapEntry(index, offset, page_number, page_type, parent_page_number,
+ pointer_map_entry_md5_hex_digest)
+ self.pointer_map_entries.append(pointer_map_entry)
+
+ if len(self.pointer_map_entries) != self.number_of_entries:
+ log_message = "In pointer map page: {} with page version: {} in version: {} found a different amount " \
+ "of pointer map entries: {} than expected number of entries: {} found on the page."
+ log_message = log_message.format(self.number, self.page_version_number, self.version_number,
+ len(self.pointer_map_entries), self.number_of_entries)
+ self._logger.error(log_message)
+ raise PageParsingError(log_message)
+
+ remaining_space_offset = self.number_of_entries * POINTER_MAP_ENTRY_LENGTH
+ if remaining_space_offset != self.unallocated_space_start_offset:
+ log_message = "The remaining space offset: {} is not equal to the unallocated space start offset: {} " \
+ "for pointer map page: {} for page version: {} and version: {}."
+ log_message = log_message.format(remaining_space_offset, self.unallocated_space_start_offset, self.number,
+ self.page_version_number, self.version_number)
+ self._logger.error(log_message)
+ raise PageParsingError(log_message)
+
+ """
+
+ Originally the remaining space here was checked to see if it was all zeros, and if not, an exception was
+ thrown. This has since been removed since it was realized that this unallocated space can contain
+ information, resulting in non-zero unallocated space.
+
+ It was realized that when a database increases in size and then decreases due to auto-vacuuming, where
+ freelist pages are truncated from the end of the database, the pointer information from those previous
+ pages remains.
+
+ This information may give insight into what pages were removed and how they were previously structured. This
+ data should probably be parsed and investigated during the unallocated carving specific to pointer map pages.
+
+ The patterns still need to match 5 bytes, the first byte being the pointer map page type and the following 4
+ bytes being the parent page number (if one exists). This could give an idea of how big the database was
+ previously, but will only give the maximum size at any point in time, since it does not appear that the
+ pointer map pages are ever zeroed out; they are simply overwritten as needed.
+
+ There may still be non-pointer map data beyond the pointer map entries that does not fit the 5 byte pattern.
+ For example, page 2, where the first pointer map page is placed, would previously have been a b-tree page
+ before vacuuming was turned on. However, auto-vacuuming is only possible if it is turned on before table
+ creation, and the page may also be zeroed out in such a case, so more research will have to be done on
+ exactly how this works.
+
+ """
+
+ def stringify(self, padding=""):
+ string = "\n" \
+ + padding + "Number of Entries: {}\n" \
+ + padding + "Pointer Map Entries Size: {}"
+ string = string.format(self.number_of_entries,
+ len(self.pointer_map_entries))
+ for pointer_map_entry in self.pointer_map_entries:
+ string += "\n" + padding + "Pointer Map Entry:\n{}".format(pointer_map_entry.stringify(padding + "\t"))
+ return super(PointerMapPage, self).stringify(padding) + string
+
+
+class PointerMapEntry(object):
+
+ def __init__(self, index, offset, page_number, page_type, parent_page_number, md5_hex_digest):
+ self.index = index
+ self.offset = offset
+ self.page_number = page_number
+ self.page_type = page_type
+ self.parent_page_number = parent_page_number
+ self.md5_hex_digest = md5_hex_digest
+
+ def __repr__(self):
+ return self.__str__().encode("hex")
+
+ def __str__(self):
+ return sub("\t", "", sub("\n", " ", self.stringify()))
+
+ def stringify(self, padding=""):
+ string = padding + "Index: {}\n" \
+ + padding + "Offset: {}\n" \
+ + padding + "Page Number: {}\n" \
+ + padding + "Page Type: {}\n" \
+ + padding + "Parent Page Number: {}\n" \
+ + padding + "MD5 Hex Digest: {}"
+ return string.format(self.index,
+ self.offset,
+ self.page_number,
+ self.page_type,
+ self.parent_page_number,
+ self.md5_hex_digest)
+
+
+class BTreePage(Page):
+
+ __metaclass__ = ABCMeta
+
+ def __init__(self, version_interface, number, header_class_name, cell_class_name):
+
+ super(BTreePage, self).__init__(version_interface, number)
+
+ page = self._version_interface.get_page_data(self.number)
+
+ self.page_type = None
+ self.hex_type = page[0]
+
+ if self.hex_type == MASTER_PAGE_HEX_ID:
+ master_page_hex_type = page[SQLITE_DATABASE_HEADER_LENGTH]
+ if master_page_hex_type == TABLE_INTERIOR_PAGE_HEX_ID:
+ self.page_type = PAGE_TYPE.B_TREE_TABLE_INTERIOR
+ elif master_page_hex_type == TABLE_LEAF_PAGE_HEX_ID:
+ self.page_type = PAGE_TYPE.B_TREE_TABLE_LEAF
+ else:
+ log_message = "Page hex type for master page is: {} and not a table interior or table leaf page as " \
+ "expected in b-tree page: {} in page version: {} for version: {}."
+ log_message = log_message.format(hexlify(master_page_hex_type), self.number,
+ self.page_version_number, self.version_number)
+ self._logger.error(log_message)
+ raise BTreePageParsingError(log_message)
+
+ elif self.hex_type == TABLE_INTERIOR_PAGE_HEX_ID:
+ self.page_type = PAGE_TYPE.B_TREE_TABLE_INTERIOR
+ elif self.hex_type == TABLE_LEAF_PAGE_HEX_ID:
+ self.page_type = PAGE_TYPE.B_TREE_TABLE_LEAF
+ elif self.hex_type == INDEX_INTERIOR_PAGE_HEX_ID:
+ self.page_type = PAGE_TYPE.B_TREE_INDEX_INTERIOR
+ elif self.hex_type == INDEX_LEAF_PAGE_HEX_ID:
+ self.page_type = PAGE_TYPE.B_TREE_INDEX_LEAF
+ else:
+ log_message = "Page hex type: {} is not a valid b-tree page type for b-tree page: {} in page version: {} " \
+ "for version: {}."
+ log_message = log_message.format(hexlify(self.hex_type), self.number, self.page_version_number,
+ self.version_number)
+ self._logger.error(log_message)
+ raise BTreePageParsingError(log_message)
+
+ header_class = get_class_instance(header_class_name)
+ cell_class = get_class_instance(cell_class_name)
+
+ self.header = header_class(page)
+
+ cell_pointer_array_offset = self.header.header_length
+ if self.header.contains_sqlite_database_header:
+ cell_pointer_array_offset += SQLITE_DATABASE_HEADER_LENGTH
+
+ if self.number != SQLITE_MASTER_SCHEMA_ROOT_PAGE:
+ log_message = "B-tree page found to contain the sqlite database header but is not the root page for " \
+ "b-tree page: {} in page version: {} for version: {}."
+ log_message = log_message.format(self.number, self.page_version_number, self.version_number)
+ self._logger.error(log_message)
+ raise BTreePageParsingError(log_message)
+
+ cell_pointer_array_length = self.header.number_of_cells_on_page * CELL_POINTER_BYTE_LENGTH
+ self.unallocated_space_start_offset = cell_pointer_array_offset + cell_pointer_array_length
+ self.unallocated_space_end_offset = self.header.cell_content_offset
+
+ adjusted_header_length = self.header.header_length
+ if self.header.contains_sqlite_database_header:
+ adjusted_header_length += SQLITE_DATABASE_HEADER_LENGTH
+ preface_size = adjusted_header_length + cell_pointer_array_length
+
+ if preface_size != self.unallocated_space_start_offset:
+ log_message = "The calculated preface size: {} is not equal to the unallocated space start offset: {} " \
+ "for b-tree page: {} in page version: {} for version: {}."
+ log_message = log_message.format(preface_size, self.unallocated_space_start_offset, self.number,
+ self.page_version_number, self.version_number)
+ self._logger.error(log_message)
+ raise BTreePageParsingError(log_message)
+
+ if self.header.cell_content_offset != self.unallocated_space_end_offset:
+ log_message = "The cell content offset in the header: {} is not equal to the unallocated space end " \
+ "offset: {} for b-tree page: {} in page version: {} for version: {}."
+ log_message = log_message.format(self.header.cell_content_offset, self.unallocated_space_end_offset,
+ self.number, self.page_version_number, self.version_number)
+ self._logger.error(log_message)
+ raise BTreePageParsingError(log_message)
+
+ self.cells = []
+ self.calculated_cell_total_byte_size = 0
+ for cell_index in range(self.header.number_of_cells_on_page):
+ cell_start_offset = cell_pointer_array_offset + cell_index * CELL_POINTER_BYTE_LENGTH
+ cell_end_offset = cell_start_offset + CELL_POINTER_BYTE_LENGTH
+ cell_offset = unpack(b">H", page[cell_start_offset:cell_end_offset])[0]
+ file_offset = self.offset + cell_offset
+ cell_instance = cell_class(self._version_interface, self.page_version_number, file_offset, self.number,
+ page, cell_index, cell_offset)
+ self.cells.append(cell_instance)
+ if type(cell_instance) != TableInteriorCell and cell_instance.has_overflow:
+ overflow_adjusted_page_size = cell_instance.end_offset - cell_instance.start_offset
+ self.calculated_cell_total_byte_size += overflow_adjusted_page_size
+ else:
+ self.calculated_cell_total_byte_size += cell_instance.byte_size
+
+ if len(self.cells) != self.header.number_of_cells_on_page:
+ log_message = "The number of cells parsed: {} does not equal the number of cells specified in the " \
+ "header: {} for b-tree page: {} in page version: {} for version: {}."
+ log_message = log_message.format(len(self.cells), self.header.number_of_cells_on_page,
+ self.number, self.page_version_number, self.version_number)
+ self._logger.error(log_message)
+ raise BTreePageParsingError(log_message)
+
+ # Check if there are freeblocks specified in the header (0 if no freeblocks)
+ self.freeblocks = []
+ self.calculated_freeblock_total_byte_size = 0
+ if self.header.first_freeblock_offset != 0:
+ freeblock_index = 0
+ next_freeblock_offset = self.header.first_freeblock_offset
+ file_offset = self.offset + next_freeblock_offset
+ while next_freeblock_offset:
+ freeblock = Freeblock(self._version_interface, self.page_version_number, file_offset, self.number, page,
+ freeblock_index, next_freeblock_offset)
+ self.freeblocks.append(freeblock)
+ next_freeblock_offset = freeblock.next_freeblock_offset
+ self.calculated_freeblock_total_byte_size += freeblock.byte_size
+ freeblock_index += 1
+
+ # Find fragments
+ self.fragments = []
+ self.calculated_fragment_total_byte_size = 0
+ fragment_index = 0
+ aggregated_cells = sorted(self.cells + self.freeblocks, key=lambda b_tree_cell: b_tree_cell.start_offset)
+ last_accounted_for_offset = self.unallocated_space_end_offset
+ for cell in aggregated_cells:
+ if last_accounted_for_offset >= self.size:
+ log_message = "The last accounted for offset: {} while determining fragments is greater than or " \
+ "equal to the page size: {} for b-tree page: {} in page version: {} for version: {}."
+ log_message = log_message.format(last_accounted_for_offset, self.size, self.number,
+ self.page_version_number, self.version_number)
+ self._logger.error(log_message)
+ raise BTreePageParsingError(log_message)
+
+ if cell.start_offset != last_accounted_for_offset:
+ file_offset = self.offset + last_accounted_for_offset
+ fragment = Fragment(self._version_interface, self.page_version_number, file_offset, self.number, page,
+ fragment_index, last_accounted_for_offset, cell.start_offset)
+ self.fragments.append(fragment)
+ self.calculated_fragment_total_byte_size += fragment.byte_size
+ fragment_index += 1
+ last_accounted_for_offset = cell.end_offset
+
+ if self.header.number_of_fragmented_free_bytes > PAGE_FRAGMENT_LIMIT:
+ log_message = "The number of fragmented free bytes: {} is greater than the page fragment limit: {} " \
+ "for b-tree page: {} in page version: {} for version: {}."
+ log_message = log_message.format(self.header.number_of_fragmented_free_bytes, PAGE_FRAGMENT_LIMIT,
+ self.number, self.page_version_number, self.version_number)
+ self._logger.error(log_message)
+ raise BTreePageParsingError(log_message)
+
+ if self.calculated_fragment_total_byte_size != self.header.number_of_fragmented_free_bytes:
+ log_message = "The calculated fragment total byte size: {} does not equal the number of fragmented free " \
+ "bytes specified in the header: {} for b-tree page: {} in page version: {} for version: {}."
+ log_message = log_message.format(self.calculated_fragment_total_byte_size,
+ self.header.number_of_fragmented_free_bytes,
+ self.number, self.page_version_number, self.version_number)
+ self._logger.error(log_message)
+ if version_interface.strict_format_checking:
+ raise BTreePageParsingError(log_message)
+ else:
+ warn(log_message, RuntimeWarning)
+
+ # Account for all space within the page
+ unallocated_space_size = self.unallocated_space_end_offset - self.unallocated_space_start_offset
+ body_size = self.calculated_cell_total_byte_size
+ body_size += self.calculated_freeblock_total_byte_size + self.calculated_fragment_total_byte_size
+
+ accounted_for_space = preface_size + unallocated_space_size + body_size
+ if accounted_for_space != self.size:
+ log_message = "The calculated accounted for space: {} does not equal the page size: {} " \
+ "for b-tree page: {} in page version: {} for version: {}."
+ log_message = log_message.format(accounted_for_space, self.size, self.number,
+ self.page_version_number, self.version_number)
+ self._logger.error(log_message)
+ if version_interface.strict_format_checking:
+ raise BTreePageParsingError(log_message)
+ else:
+ warn(log_message, RuntimeWarning)
+
+ self.md5_hex_digest = get_md5_hash(page)
+
+ def stringify(self, padding=""):
+ string = "\n" \
+ + padding + "Hex Type (Hex): {}\n" \
+ + padding + "Header:\n{}\n"\
+ + padding + "Cells Length: {}\n" \
+ + padding + "Calculated Cell Total Byte Size: {}\n" \
+ + padding + "Freeblocks Length: {}\n" \
+ + padding + "Calculated Freeblock Total Byte Size: {}\n" \
+ + padding + "Fragments Length: {}\n" \
+ + padding + "Calculated Fragment Total Byte Size: {}"
+ string = string.format(hexlify(self.hex_type),
+ self.header.stringify(padding + "\t"),
+ len(self.cells),
+ self.calculated_cell_total_byte_size,
+ len(self.freeblocks),
+ self.calculated_freeblock_total_byte_size,
+ len(self.fragments),
+ self.calculated_fragment_total_byte_size)
+ for cell in self.cells:
+ string += "\n" + padding + "Cell:\n{}".format(cell.stringify(padding + "\t"))
+ for freeblock in self.freeblocks:
+ string += "\n" + padding + "Freeblock:\n{}".format(freeblock.stringify(padding + "\t"))
+ for fragment in self.fragments:
+ string += "\n" + padding + "Fragment:\n{}".format(fragment.stringify(padding + "\t"))
+ return super(BTreePage, self).stringify(padding) + string
+
+
+class TableInteriorPage(BTreePage):
+
+ def __init__(self, version_interface, number):
+ header_class_name = "{}.{}".format(PAGE_HEADER_MODULE, INTERIOR_PAGE_HEADER_CLASS)
+ cell_class_name = "{}.{}".format(CELL_MODULE, TABLE_INTERIOR_CELL_CLASS)
+ super(TableInteriorPage, self).__init__(version_interface, number, header_class_name, cell_class_name)
+
+ """
+
+ Note: A table interior page can be updated without updating the right most pointer page in a version.
+
+ """
+
+ if not self.header.right_most_pointer:
+ log_message = "The right most pointer is not set for b-tree table interior page: {} " \
+ "in page version: {} for version: {}."
+ log_message = log_message.format(self.number, self.page_version_number, self.version_number)
+ self._logger.error(log_message)
+ raise BTreePageParsingError(log_message)
+
+ right_most_pointer_page_hex_type = self._version_interface.get_page_data(self.header.right_most_pointer,
+ 0, PAGE_TYPE_LENGTH)
+
+ if right_most_pointer_page_hex_type == TABLE_INTERIOR_PAGE_HEX_ID:
+ self.right_most_page = TableInteriorPage(self._version_interface, self.header.right_most_pointer)
+ elif right_most_pointer_page_hex_type == TABLE_LEAF_PAGE_HEX_ID:
+ self.right_most_page = TableLeafPage(self._version_interface, self.header.right_most_pointer)
+ else:
+ log_message = "The right most pointer does not point to a table interior or leaf page but instead has " \
+ "a hex type of: {} for b-tree table interior page: {} in page version: {} for version: {}."
+ log_message = log_message.format(hexlify(right_most_pointer_page_hex_type), self.number,
+ self.page_version_number, self.version_number)
+ self._logger.error(log_message)
+ raise BTreePageParsingError(log_message)
+
+ def stringify(self, padding=""):
+ string = "\n" + padding + "Right Most Page:\n{}"
+ string = string.format(self.right_most_page.stringify(padding + "\t") if self.right_most_page else None)
+ return super(TableInteriorPage, self).stringify(padding) + string
+
+
+class TableLeafPage(BTreePage):
+
+ def __init__(self, version, number):
+ header_class_name = "{}.{}".format(PAGE_HEADER_MODULE, LEAF_PAGE_HEADER_CLASS)
+ cell_class_name = "{}.{}".format(CELL_MODULE, TABLE_LEAF_CELL_CLASS)
+ super(TableLeafPage, self).__init__(version, number, header_class_name, cell_class_name)
+
+
+class IndexInteriorPage(BTreePage):
+
+ def __init__(self, version, number):
+
+ header_class_name = "{}.{}".format(PAGE_HEADER_MODULE, INTERIOR_PAGE_HEADER_CLASS)
+ cell_class_name = "{}.{}".format(CELL_MODULE, INDEX_INTERIOR_CELL_CLASS)
+ super(IndexInteriorPage, self).__init__(version, number, header_class_name, cell_class_name)
+
+ """
+
+ Note: An index interior page can be updated without updating the right most pointer page in a version.
+
+ """
+
+ if not self.header.right_most_pointer:
+ log_message = "The right most pointer is not set for b-tree index interior page: {} " \
+ "in page version: {} for version: {}."
+ log_message = log_message.format(self.number, self.page_version_number, self.version_number)
+ self._logger.error(log_message)
+ raise BTreePageParsingError(log_message)
+
+ right_most_pointer_page_hex_type = self._version_interface.get_page_data(self.header.right_most_pointer,
+ 0, PAGE_TYPE_LENGTH)
+
+ if right_most_pointer_page_hex_type == INDEX_INTERIOR_PAGE_HEX_ID:
+ self.right_most_page = IndexInteriorPage(self._version_interface, self.header.right_most_pointer)
+ elif right_most_pointer_page_hex_type == INDEX_LEAF_PAGE_HEX_ID:
+ self.right_most_page = IndexLeafPage(self._version_interface, self.header.right_most_pointer)
+ else:
+ log_message = "The right most pointer does not point to a index interior or leaf page but instead has " \
+ "a hex type of: {} for b-tree index interior page: {} in page version: {} for version: {}."
+ log_message = log_message.format(hexlify(right_most_pointer_page_hex_type), self.number,
+ self.page_version_number, self.version_number)
+ self._logger.error(log_message)
+ raise BTreePageParsingError(log_message)
+
+ def stringify(self, padding=""):
+ string = "\n" + padding + "Right Most Page:\n{}"
+ string = string.format(self.right_most_page.stringify(padding + "\t") if self.right_most_page else None)
+ return super(IndexInteriorPage, self).stringify(padding) + string
+
+
+class IndexLeafPage(BTreePage):
+
+ def __init__(self, version, number):
+ header_class_name = "{}.{}".format(PAGE_HEADER_MODULE, LEAF_PAGE_HEADER_CLASS)
+ cell_class_name = "{}.{}".format(CELL_MODULE, INDEX_LEAF_CELL_CLASS)
+ super(IndexLeafPage, self).__init__(version, number, header_class_name, cell_class_name)
+
+
+class BTreeCell(object):
+
+ __metaclass__ = ABCMeta
+
+ def __init__(self, version_interface, page_version_number, file_offset, page_number, index, offset,
+ source=CELL_SOURCE.B_TREE, location=None):
+
+ self._logger = getLogger(LOGGER_NAME)
+
+ self._version_interface = version_interface
+ self._page_size = self._version_interface.page_size
+ self.version_number = self._version_interface.version_number
+ self.page_version_number = page_version_number
+ self.file_offset = file_offset
+ self.page_number = page_number
+ self.index = index
+ self.start_offset = offset
+ self.location = location if location else CELL_LOCATION.ALLOCATED_SPACE
+ self.source = source
+ self.end_offset = None
+ self.byte_size = None
+ self.md5_hex_digest = None
+
+ def __repr__(self):
+ return self.__str__().encode("hex")
+
+ def __str__(self):
+ return sub("\t", "", sub("\n", " ", self.stringify()))
+
+ def stringify(self, padding=""):
+ string = padding + "Version Number: {}\n" \
+ + padding + "Page Version Number: {}\n" \
+ + padding + "File Offset: {}\n" \
+ + padding + "Page Number: {}\n" \
+ + padding + "Source: {}\n" \
+ + padding + "Location: {}\n" \
+ + padding + "Index: {}\n" \
+ + padding + "Start Offset: {}\n" \
+ + padding + "End Offset: {}\n" \
+ + padding + "Byte Size: {}\n" \
+ + padding + "MD5 Hex Digest: {}"
+ return string.format(self.version_number,
+ self.page_version_number,
+ self.file_offset,
+ self.page_number,
+ self.source,
+ self.location,
+ self.index,
+ self.start_offset,
+ self.end_offset,
+ self.byte_size,
+ self.md5_hex_digest)
+
+
+class TableInteriorCell(BTreeCell):
+
+ """
+
+
+ Note: B-tree table interior cells never contain overflow. Therefore they have no payload (i.e. record). This is
+ the only type of b-tree cell that does not have a payload.
+
+ """
+
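+ # Cell layout for reference (an illustrative example, not taken from any particular database): a 4-byte
+ # big-endian left child page number followed by the row id as a varint. For instance, the 5 byte cell
+ # b"\x00\x00\x00\x05\x2a" would point to child page 5 with a row id of 42.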
+ def __init__(self, version_interface, page_version_number, file_offset, page_number, page, index, offset):
+
+ super(TableInteriorCell, self).__init__(version_interface, page_version_number, file_offset,
+ page_number, index, offset)
+ left_child_pointer_end_offset = self.start_offset + LEFT_CHILD_POINTER_BYTE_LENGTH
+ self.left_child_pointer = unpack(b">I", page[self.start_offset:left_child_pointer_end_offset])[0]
+ self.row_id, self.row_id_varint_length = decode_varint(page, left_child_pointer_end_offset)
+
+ self.byte_size = LEFT_CHILD_POINTER_BYTE_LENGTH + self.row_id_varint_length
+ self.end_offset = self.start_offset + self.byte_size
+
+ self.md5_hex_digest = get_md5_hash(page[self.start_offset:self.end_offset])
+
+ """
+
+ Note: A table interior cell can be updated without updating the left child page in a version.
+
+ """
+
+ if not self.left_child_pointer:
+ log_message = "The left child pointer is not set for b-tree table interior cell index: {} " \
+ "at offset: {} for page: {} in page version: {} for version: {}."
+ log_message = log_message.format(self.index, self.start_offset, self.page_number,
+ self.page_version_number, self.version_number)
+ self._logger.error(log_message)
+ raise CellParsingError(log_message)
+
+ left_child_pointer_page_hex_type = self._version_interface.get_page_data(self.left_child_pointer,
+ 0, PAGE_TYPE_LENGTH)
+
+ if left_child_pointer_page_hex_type == TABLE_INTERIOR_PAGE_HEX_ID:
+ self.left_child_page = TableInteriorPage(self._version_interface, self.left_child_pointer)
+ elif left_child_pointer_page_hex_type == TABLE_LEAF_PAGE_HEX_ID:
+ self.left_child_page = TableLeafPage(self._version_interface, self.left_child_pointer)
+ else:
+ log_message = "The left child pointer: {} does not point to a table interior or leaf page but instead " \
+ "has a hex type of: {} for b-tree table interior cell index: {} at offset: {} for page: {} " \
+ "in page version: {} for version: {}."
+ log_message = log_message.format(self.left_child_pointer, hexlify(left_child_pointer_page_hex_type),
+ self.index, self.start_offset, self.page_number, self.page_version_number,
+ self.version_number)
+ self._logger.error(log_message)
+ raise CellParsingError(log_message)
+
+ def stringify(self, padding=""):
+ string = "\n" \
+ + padding + "Left Child Pointer: {}\n" \
+ + padding + "Row ID: {}\n" \
+ + padding + "Row ID VARINT Length: {}"
+ string = string.format(self.left_child_pointer,
+ self.row_id,
+ self.row_id_varint_length)
+ string += "\n" + padding + "Left Child Page:\n{}"
+ string = string.format(self.left_child_page.stringify(padding + "\t") if self.left_child_page else None)
+ return super(TableInteriorCell, self).stringify(padding) + string
+
+
+class TableLeafCell(BTreeCell):
+
+ def __init__(self, version_interface, page_version_number, file_offset, page_number, page, index, offset):
+
+ super(TableLeafCell, self).__init__(version_interface, page_version_number, file_offset,
+ page_number, index, offset)
+
+ self.payload_byte_size, self.payload_byte_size_varint_length = decode_varint(page, self.start_offset)
+ row_id_offset = self.start_offset + self.payload_byte_size_varint_length
+ self.row_id, self.row_id_varint_length = decode_varint(page, row_id_offset)
+ self.payload_offset = self.start_offset + self.payload_byte_size_varint_length + self.row_id_varint_length
+
+ self.has_overflow = False
+ self.overflow_pages = None
+ self.overflow_page_number_offset = None
+ self.overflow_page_number = None
+ self.overflow_page = None
+ self.last_overflow_page_content_size = 0
+
+ u = self._page_size
+ p = self.payload_byte_size
+
+ """
+
+ Note: According to the SQLite documentation (as of version 3.9.2) table leaf cell overflow is calculated
+ by seeing if the payload size p is less than or equal to u - 35. If it is then there is no overflow.
+ If p is greater than u - 35, then there is overflow. At this point m = (((u - 12) * 32) / 255) - 23.
+ If p is greater than u - 35 then the number of bytes stored on the b-tree leaf page is the smaller of
+ m + ((p - m) % (u - 4)) and u - 35. The remaining bytes are then moved to overflow pages.
+
+ The above was found to be wrong in the SQLite documentation.
+
+ The documentation is incorrect that it is the smaller of m + ((p - m) % (u - 4)) and u - 35. After
+ a lot of testing and review of the actual SQLite C code, it was found that the actual number of
+ bytes stored on the b-tree leaf page is m + ((p - m) % (u - 4)), unless m + ((p - m) % (u - 4)) > u - 35,
+ in which case the number of bytes stored on the b-tree table leaf page is m itself.
+
+ Therefore let b be the bytes on the b-tree table leaf page:
+ u = page size
+ p = payload byte size
+ if p > u - 35
+ m = (((u - 12) * 32) / 255) - 23
+ b = m + ((p - m) % (u - 4))
+ if b > u - 35
+ b = m
+
+ Additionally, the bytes stored on the b-tree table leaf page will always be greater than or equal to m
+ once calculated.
+
+ """
+
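+ # Worked example of the above (illustrative values only): with a page size u = 1024 and a payload of
+ # p = 1200 bytes, p > u - 35 = 989, so the cell overflows. m = (((1024 - 12) * 32) / 255) - 23 = 103 and
+ # b = 103 + ((1200 - 103) % (1024 - 4)) = 180. Since 180 <= 989, 180 payload bytes stay on the leaf page
+ # and the remaining 1020 bytes fill exactly one overflow page (each overflow page holds u - 4 content bytes).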
+ self.bytes_on_first_page = p
+ if p > u - 35:
+ m = (((u - 12) * 32) / 255) - 23
+ self.bytes_on_first_page = m + ((p - m) % (u - 4))
+ if self.bytes_on_first_page > u - 35:
+ self.bytes_on_first_page = m
+ self.has_overflow = True
+ self.overflow_page_number_offset = self.payload_offset + self.bytes_on_first_page
+ overflow_page_number_end_offset = self.overflow_page_number_offset + FIRST_OVERFLOW_PAGE_NUMBER_LENGTH
+ self.overflow_page_number = unpack(b">I", page[self.overflow_page_number_offset:
+ overflow_page_number_end_offset])[0]
+ if self.bytes_on_first_page < m:
+ log_message = "When calculating overflow, the bytes on the first page: {} calculated are less than " \
+ "m: {} for b-tree table leaf cell index: {} at offset: {} for page: {} in " \
+ "page version: {} for version: {}."
+ log_message = log_message.format(self.bytes_on_first_page, m, self.index, self.start_offset,
+ self.page_number, self.page_version_number, self.version_number)
+ self._logger.error(log_message)
+ raise CellParsingError(log_message)
+
+ self.byte_size = self.payload_byte_size_varint_length + self.row_id_varint_length + self.payload_byte_size
+ self.byte_size += FIRST_OVERFLOW_PAGE_NUMBER_LENGTH if self.has_overflow else 0
+ self.end_offset = self.start_offset + self.byte_size - self.payload_byte_size + self.bytes_on_first_page
+
+ self.overflow_byte_size = self.payload_byte_size - self.bytes_on_first_page
+ self.expected_number_of_overflow_pages, \
+ self.expected_last_overflow_page_content_size = calculate_expected_overflow(self.overflow_byte_size, u)
+
+ self.md5_hex_digest = get_md5_hash(page[self.start_offset:self.end_offset])
+
+ if self.has_overflow:
+
+ """
+
+ The overflow pages are in a dictionary keyed off of their page number in the format:
+ overflow_page[OVERFLOW_PAGE_NUMBER] = OVERFLOW_PAGE
+
+ Originally, the overflow pages were nested objects, i.e. each overflow page had the following overflow
+ page within it, and so on. However, this led to recursion depth problems with larger cell content.
+ It was changed to be a dictionary of pages here instead.
+
+ Note: Although overflow pages have to be replaced when any overflow page in a chain is updated, the
+ overflow here may not be updated when a different cell in this page is updated. Therefore,
+ we allow the first overflow page to be in an earlier version. However, the overflow pages still
+ check that all overflow versions from the first overflow page onward in the linked
+ list are equal.
+
+ """
+
+ self.overflow_pages = {}
+ payload_remaining = self.overflow_byte_size
+
+ overflow_page = OverflowPage(self._version_interface, self.overflow_page_number, self.page_number,
+ FIRST_OVERFLOW_PARENT_PAGE_NUMBER, FIRST_OVERFLOW_PAGE_INDEX,
+ payload_remaining)
+
+ self.overflow_pages[overflow_page.number] = overflow_page
+ self.last_overflow_page_content_size = overflow_page.content_length
+
+ while overflow_page.next_overflow_page_number:
+ payload_remaining = payload_remaining - overflow_page.size + OVERFLOW_HEADER_LENGTH
+ overflow_page = OverflowPage(self._version_interface, overflow_page.next_overflow_page_number,
+ self.page_number, overflow_page.number, overflow_page.index + 1,
+ payload_remaining)
+ self.overflow_pages[overflow_page.number] = overflow_page
+ self.last_overflow_page_content_size = overflow_page.content_length
+
+ if self.expected_number_of_overflow_pages != self.number_of_overflow_pages:
+ log_message = "The number of expected overflow pages: {} was not the actual number of overflow pages " \
+ "parsed: {} for b-tree table leaf cell index: {} at offset: {} for page: {} in " \
+ "page version: {} for version: {}."
+ log_message = log_message.format(self.expected_number_of_overflow_pages, self.number_of_overflow_pages,
+ self.index, self.start_offset, self.page_number, self.page_version_number,
+ self.version_number)
+ self._logger.error(log_message)
+ raise CellParsingError(log_message)
+
+ if self.expected_last_overflow_page_content_size != self.last_overflow_page_content_size:
+ log_message = "The expected last overflow page content size: {} was not the actual last overflow page " \
+ "content size parsed: {} for b-tree table leaf cell index: {} at offset: {} for page: {} " \
+ "in page version: {} for version: {}."
+ log_message = log_message.format(self.expected_last_overflow_page_content_size,
+ self.last_overflow_page_content_size, self.index, self.start_offset,
+ self.page_number, self.page_version_number, self.version_number)
+ raise CellParsingError(log_message)
+
+ self.payload = Record(page, self.payload_offset, self.payload_byte_size,
+ self.bytes_on_first_page, self.overflow)
+
+ def stringify(self, padding=""):
+ string = "\n" \
+ + padding + "Payload Byte Size: {}\n" \
+ + padding + "Payload Byte Size VARINT Length: {}\n" \
+ + padding + "Row ID: {}\n" \
+ + padding + "Row ID VARINT Length: {}\n" \
+ + padding + "Payload Offset: {}\n" \
+ + padding + "Bytes on First Page: {}\n" \
+ + padding + "Has Overflow: {}\n" \
+ + padding + "Overflow Byte Size: {}\n" \
+ + padding + "Expected Number of Overflow Pages: {}\n" \
+ + padding + "Expected Last Overflow Page Content Size: {}\n" \
+ + padding + "Number of Overflow Pages: {}\n" \
+ + padding + "Overflow Page Number Offset: {}\n" \
+ + padding + "Overflow Page Number: {}\n" \
+ + padding + "Last Overflow Page Content Size: {}\n" \
+ + padding + "Overflow (Hex): {}"
+ string = string.format(self.payload_byte_size,
+ self.payload_byte_size_varint_length,
+ self.row_id,
+ self.row_id_varint_length,
+ self.payload_offset,
+ self.bytes_on_first_page,
+ self.has_overflow,
+ self.overflow_byte_size,
+ self.expected_number_of_overflow_pages,
+ self.expected_last_overflow_page_content_size,
+ self.number_of_overflow_pages,
+ self.overflow_page_number_offset,
+ self.overflow_page_number,
+ self.last_overflow_page_content_size,
+ hexlify(self.overflow))
+ string += "\n" + padding + "Payload:\n{}".format(self.payload.stringify(padding + "\t"))
+ if self.has_overflow:
+ overflow_page = self.overflow_pages[self.overflow_page_number]
+            string += "\n" + padding + "Overflow Page:\n{}".format(overflow_page.stringify(padding + "\t"))
+ while overflow_page.next_overflow_page_number:
+ overflow_page = self.overflow_pages[overflow_page.next_overflow_page_number]
+                string += "\n" + padding + "Overflow Page:\n{}".format(overflow_page.stringify(padding + "\t"))
+ return super(TableLeafCell, self).stringify(padding) + string
+
+ @property
+ def number_of_overflow_pages(self):
+ return len(self.overflow_pages) if self.overflow_pages else 0
+
+ @property
+ def overflow(self):
+
+ overflow = bytearray()
+
+ if not self.has_overflow:
+
+ return overflow
+
+ else:
+
+ overflow_page = self.overflow_pages[self.overflow_page_number]
+ overflow += overflow_page.content
+ while overflow_page.next_overflow_page_number:
+ overflow_page = self.overflow_pages[overflow_page.next_overflow_page_number]
+ overflow += overflow_page.content
+
+ if len(overflow) != self.overflow_byte_size:
+ log_message = "The expected overflow size: {} did not match the overflow size parsed: {} " \
+ "for b-tree table leaf cell index: {} at offset: {} for page: {} " \
+ "in page version: {} for version: {}."
+ log_message = log_message.format(self.overflow_byte_size, len(overflow), self.index, self.start_offset,
+ self.page_number, self.page_version_number, self.version_number)
+ raise CellParsingError(log_message)
+
+ return overflow
+
+
+class IndexInteriorCell(BTreeCell):
+
+ def __init__(self, version_interface, page_version_number, file_offset, page_number, page, index, offset):
+
+ super(IndexInteriorCell, self).__init__(version_interface, page_version_number, file_offset,
+ page_number, index, offset)
+
+ left_child_pointer_end_offset = self.start_offset + LEFT_CHILD_POINTER_BYTE_LENGTH
+ self.left_child_pointer = unpack(b">I", page[self.start_offset:left_child_pointer_end_offset])[0]
+ self.payload_byte_size, self.payload_byte_size_varint_length = decode_varint(page,
+ left_child_pointer_end_offset)
+ self.payload_offset = left_child_pointer_end_offset + self.payload_byte_size_varint_length
+
+ self.has_overflow = False
+ self.overflow_pages = None
+ self.overflow_page_number_offset = None
+ self.overflow_page_number = None
+ self.overflow_page = None
+ self.last_overflow_page_content_size = 0
+
+ u = self._page_size
+ p = self.payload_byte_size
+ x = (((u - 12) * 64) / 255) - 23
+
+ """
+
+ Note: According to the SQLite documentation (as of version 3.9.2) index interior and leaf cell overflow is
+ calculated by first calculating x as (((u - 12) * 64) / 255) - 23. If the payload size p is less than
+              or equal to x, then there is no overflow. If p is greater than x, then m = (((u - 12) * 32) / 255) - 23.
+ If p is greater than x then the number of bytes stored on the b-tree leaf page is the smaller of
+ m + ((p - m) % (u - 4)) and x. The remaining bytes are then moved to overflow pages.
+
+ The above was found to be wrong in the SQLite documentation.
+
+              The documentation is incorrect in stating that it is the smaller of m + ((p - m) % (u - 4)) and x.
+              After a lot of testing and reviewing of the actual SQLite C code, it was found that the actual number of
+ bytes stored on the b-tree leaf page is m + ((p - m) % (u - 4)) unless m + ((p - m) % (u - 4)) > x
+ in which case the bytes stored on the b-tree index interior or index leaf page is m itself.
+
+ Therefore let b be the bytes on the b-tree index interior or index leaf page:
+ u = page size
+ p = payload byte size
+ x = (((u - 12) * 64) / 255) - 23
+ if p > x
+ m = (((u - 12) * 32) / 255) - 23
+ b = m + ((p - m) % (u - 4))
+ if b > x
+ b = m
+
+              Additionally, the bytes stored on the b-tree index interior or index leaf page will always be greater
+              than or equal to m once calculated.
+
+ """
+
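+        # Illustrative worked example (assumed values, not taken from the source): with a page size u of 1024 and
+        # a payload byte size p of 2000, the integer arithmetic gives x = 230 and m = 103. The candidate
+        # b = 103 + ((2000 - 103) % 1020) = 980 exceeds x, so the bytes stored on the first page fall back to
+        # m = 103 and the remaining 1897 bytes are moved to overflow pages.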
+ self.bytes_on_first_page = p
+ if p > x:
+ m = (((u - 12) * 32) / 255) - 23
+ self.bytes_on_first_page = m + ((p - m) % (u - 4))
+ if self.bytes_on_first_page > x:
+ self.bytes_on_first_page = m
+ self.has_overflow = True
+ self.overflow_page_number_offset = self.payload_offset + self.bytes_on_first_page
+ overflow_page_number_end_offset = self.overflow_page_number_offset + FIRST_OVERFLOW_PAGE_NUMBER_LENGTH
+ self.overflow_page_number = unpack(b">I", page[self.overflow_page_number_offset:
+ overflow_page_number_end_offset])[0]
+ if self.bytes_on_first_page < m:
+ log_message = "When calculating overflow, the bytes on the first page: {} calculated are less than " \
+ "m: {} for b-tree index interior cell index: {} at offset: {} for page: {} in " \
+ "page version: {} for version: {}."
+ log_message = log_message.format(self.bytes_on_first_page, m, self.index, self.start_offset,
+ self.page_number, self.page_version_number, self.version_number)
+ self._logger.error(log_message)
+ raise CellParsingError(log_message)
+
+ self.byte_size = LEFT_CHILD_POINTER_BYTE_LENGTH
+ self.byte_size += self.payload_byte_size_varint_length + self.payload_byte_size
+ self.byte_size += FIRST_OVERFLOW_PAGE_NUMBER_LENGTH if self.has_overflow else 0
+ self.end_offset = self.start_offset + self.byte_size - self.payload_byte_size + self.bytes_on_first_page
+
+ self.overflow_byte_size = self.payload_byte_size - self.bytes_on_first_page
+ self.expected_number_of_overflow_pages, \
+ self.expected_last_overflow_page_content_size = calculate_expected_overflow(self.overflow_byte_size, u)
+
+ self.md5_hex_digest = get_md5_hash(page[self.start_offset:self.end_offset])
+
+ if self.has_overflow:
+
+ """
+
+ The overflow pages are in a dictionary keyed off of their page number in the format:
+ overflow_page[OVERFLOW_PAGE_NUMBER] = OVERFLOW_PAGE
+
+            Originally, the overflow pages were nested objects, i.e. each overflow page had the following overflow
+            page within it, and so on. However, this led to recursion depth problems with larger cell content.
+            It was changed to be a dictionary of pages here instead.
+
+            Note: Although overflow pages have to be replaced when any overflow page in a chain is updated, the
+                  overflow here may not be updated because a different cell on this page may have been the one
+                  updated. Therefore, we allow the first overflow page to be from an earlier version. However, the
+                  overflow pages still check that the versions of all overflow pages from the first overflow page
+                  onward in the linked list are equal.
+
+ """
+
+ self.overflow_pages = {}
+ payload_remaining = self.overflow_byte_size
+
+ overflow_page = OverflowPage(self._version_interface, self.overflow_page_number, self.page_number,
+ FIRST_OVERFLOW_PARENT_PAGE_NUMBER, FIRST_OVERFLOW_PAGE_INDEX,
+ payload_remaining)
+
+ self.overflow_pages[overflow_page.number] = overflow_page
+ self.last_overflow_page_content_size = overflow_page.content_length
+
+ while overflow_page.next_overflow_page_number:
+ payload_remaining = payload_remaining - overflow_page.size + OVERFLOW_HEADER_LENGTH
+ overflow_page = OverflowPage(self._version_interface, overflow_page.next_overflow_page_number,
+ self.page_number, overflow_page.number, overflow_page.index + 1,
+ payload_remaining)
+ self.overflow_pages[overflow_page.number] = overflow_page
+ self.last_overflow_page_content_size = overflow_page.content_length
+
+ if self.expected_number_of_overflow_pages != self.number_of_overflow_pages:
+ log_message = "The number of expected overflow pages: {} was not the actual number of overflow pages " \
+ "parsed: {} for b-tree index interior cell index: {} at offset: {} for page: {} in " \
+ "page version: {} for version: {}."
+ log_message = log_message.format(self.expected_number_of_overflow_pages, self.number_of_overflow_pages,
+ self.index, self.start_offset, self.page_number, self.page_version_number,
+ self.version_number)
+ self._logger.error(log_message)
+ raise CellParsingError(log_message)
+
+ if self.expected_last_overflow_page_content_size != self.last_overflow_page_content_size:
+ log_message = "The expected last overflow page content size: {} was not the actual last overflow page " \
+ "content size parsed: {} for b-tree index interior cell index: {} at offset: {} for " \
+ "page: {} in page version: {} for version: {}."
+ log_message = log_message.format(self.expected_last_overflow_page_content_size,
+ self.last_overflow_page_content_size, self.index, self.start_offset,
+ self.page_number, self.page_version_number, self.version_number)
+ raise CellParsingError(log_message)
+
+ self.payload = Record(page, self.payload_offset, self.payload_byte_size,
+ self.bytes_on_first_page, self.overflow)
+
+ """
+
+ Note: An index interior cell can be updated without updating the left child page in a version.
+
+ """
+
+ if not self.left_child_pointer:
+ log_message = "The left child pointer is not set for b-tree index interior cell index: {} " \
+ "at offset: {} for page: {} in page version: {} for version: {}."
+ log_message = log_message.format(self.index, self.start_offset, self.page_number,
+ self.page_version_number, self.version_number)
+ self._logger.error(log_message)
+ raise CellParsingError(log_message)
+
+ left_child_pointer_page_hex_type = self._version_interface.get_page_data(self.left_child_pointer,
+ 0, PAGE_TYPE_LENGTH)
+
+ if left_child_pointer_page_hex_type == INDEX_INTERIOR_PAGE_HEX_ID:
+ self.left_child_page = IndexInteriorPage(self._version_interface, self.left_child_pointer)
+ elif left_child_pointer_page_hex_type == INDEX_LEAF_PAGE_HEX_ID:
+ self.left_child_page = IndexLeafPage(self._version_interface, self.left_child_pointer)
+ else:
+            log_message = "The left child pointer does not point to an index interior or index leaf page but " \
+                          "instead has a hex type of: {} for b-tree index interior cell index: {} at offset: {} " \
+                          "for page: {} in page version: {} for version: {}."
+ log_message = log_message.format(hexlify(left_child_pointer_page_hex_type), self.index, self.start_offset,
+ self.page_number, self.page_version_number, self.version_number)
+ self._logger.error(log_message)
+ raise CellParsingError(log_message)
+
+ def stringify(self, padding=""):
+ string = "\n" \
+ + padding + "Left Child Pointer: {}\n" \
+ + padding + "Payload Byte Size: {}\n" \
+ + padding + "Payload Byte Size VARINT Length: {}\n" \
+ + padding + "Payload Offset: {}\n" \
+ + padding + "Bytes on First Page: {}\n" \
+ + padding + "Has Overflow: {}\n" \
+ + padding + "Overflow Byte Size: {}\n" \
+ + padding + "Expected Number of Overflow Pages: {}\n" \
+ + padding + "Expected Last Overflow Page Content Size: {}\n" \
+ + padding + "Number of Overflow Pages: {}\n" \
+ + padding + "Overflow Page Number Offset: {}\n" \
+ + padding + "Overflow Page Number: {}\n" \
+ + padding + "Last Overflow Page Content Size: {}\n" \
+ + padding + "Overflow (Hex): {}"
+ string = string.format(self.left_child_pointer,
+ self.payload_byte_size,
+ self.payload_byte_size_varint_length,
+ self.payload_offset,
+ self.bytes_on_first_page,
+ self.has_overflow,
+ self.overflow_byte_size,
+ self.expected_number_of_overflow_pages,
+ self.expected_last_overflow_page_content_size,
+ self.number_of_overflow_pages,
+ self.overflow_page_number_offset,
+ self.overflow_page_number,
+ self.last_overflow_page_content_size,
+ hexlify(self.overflow))
+ string += "\n" + padding + "Payload:\n{}".format(self.payload.stringify(padding + "\t"))
+ if self.has_overflow:
+ overflow_page = self.overflow_pages[self.overflow_page_number]
+            string += "\n" + padding + "Overflow Page:\n{}".format(overflow_page.stringify(padding + "\t"))
+ while overflow_page.next_overflow_page_number:
+ overflow_page = self.overflow_pages[overflow_page.next_overflow_page_number]
+                string += "\n" + padding + "Overflow Page:\n{}".format(overflow_page.stringify(padding + "\t"))
+ string += "\n" + padding + "Left Child Page:\n{}"
+ string = string.format(self.left_child_page.stringify(padding + "\t") if self.left_child_page else None)
+ return super(IndexInteriorCell, self).stringify(padding) + string
+
+ @property
+ def number_of_overflow_pages(self):
+ return len(self.overflow_pages) if self.overflow_pages else 0
+
+ @property
+ def overflow(self):
+ overflow = bytearray()
+
+ if not self.has_overflow:
+
+ return overflow
+
+ else:
+
+ overflow_page = self.overflow_pages[self.overflow_page_number]
+ overflow += overflow_page.content
+ while overflow_page.next_overflow_page_number:
+ overflow_page = self.overflow_pages[overflow_page.next_overflow_page_number]
+ overflow += overflow_page.content
+
+ if len(overflow) != self.overflow_byte_size:
+                log_message = "The expected overflow size: {} did not match the overflow size parsed: {} " \
+                              "for b-tree index interior cell index: {} at offset: {} for page: {} " \
+                              "in page version: {} for version: {}."
+ log_message = log_message.format(self.overflow_byte_size, len(overflow), self.index, self.start_offset,
+ self.page_number, self.page_version_number, self.version_number)
+ raise CellParsingError(log_message)
+
+ return overflow
+
+
+class IndexLeafCell(BTreeCell):
+
+ def __init__(self, version_interface, page_version_number, file_offset, page_number, page, index, offset):
+
+ super(IndexLeafCell, self).__init__(version_interface, page_version_number, file_offset,
+ page_number, index, offset)
+
+ self.payload_byte_size, self.payload_byte_size_varint_length = decode_varint(page, self.start_offset)
+ self.payload_offset = self.start_offset + self.payload_byte_size_varint_length
+
+ self.has_overflow = False
+        self.overflow_pages = None
+ self.overflow_page_number_offset = None
+ self.overflow_page_number = None
+ self.overflow_page = None
+ self.last_overflow_page_content_size = 0
+
+ u = self._page_size
+ p = self.payload_byte_size
+ x = (((u - 12) * 64) / 255) - 23
+
+ """
+
+ Note: According to the SQLite documentation (as of version 3.9.2) index interior and leaf cell overflow is
+ calculated by first calculating x as (((u - 12) * 64) / 255) - 23. If the payload size p is less than
+              or equal to x, then there is no overflow. If p is greater than x, then m = (((u - 12) * 32) / 255) - 23.
+ If p is greater than x then the number of bytes stored on the b-tree leaf page is the smaller of
+ m + ((p - m) % (u - 4)) and x. The remaining bytes are then moved to overflow pages.
+
+ The above was found to be wrong in the SQLite documentation.
+
+              The documentation is incorrect in stating that it is the smaller of m + ((p - m) % (u - 4)) and x.
+              After a lot of testing and reviewing of the actual SQLite C code, it was found that the actual number of
+ bytes stored on the b-tree leaf page is m + ((p - m) % (u - 4)) unless m + ((p - m) % (u - 4)) > x
+ in which case the bytes stored on the b-tree index interior or index leaf page is m itself.
+
+ Therefore let b be the bytes on the b-tree index interior or index leaf page:
+ u = page size
+ p = payload byte size
+ x = (((u - 12) * 64) / 255) - 23
+ if p > x
+ m = (((u - 12) * 32) / 255) - 23
+ b = m + ((p - m) % (u - 4))
+ if b > x
+ b = m
+
+              Additionally, the bytes stored on the b-tree index interior or index leaf page will always be greater
+              than or equal to m once calculated.
+
+ """
+
+ self.bytes_on_first_page = p
+ if p > x:
+ m = (((u - 12) * 32) / 255) - 23
+ self.bytes_on_first_page = m + ((p - m) % (u - 4))
+ if self.bytes_on_first_page > x:
+ self.bytes_on_first_page = m
+ self.has_overflow = True
+ self.overflow_page_number_offset = self.payload_offset + self.bytes_on_first_page
+ overflow_page_number_end_offset = self.overflow_page_number_offset + FIRST_OVERFLOW_PAGE_NUMBER_LENGTH
+ self.overflow_page_number = unpack(b">I", page[self.overflow_page_number_offset:
+ overflow_page_number_end_offset])[0]
+ if self.bytes_on_first_page < m:
+ log_message = "When calculating overflow, the bytes on the first page: {} calculated are less than " \
+                              "m: {} for b-tree index leaf cell index: {} at offset: {} for page: {} in " \
+ "page version: {} for version: {}."
+ log_message = log_message.format(self.bytes_on_first_page, m, self.index, self.start_offset,
+ self.page_number, self.page_version_number, self.version_number)
+ self._logger.error(log_message)
+ raise CellParsingError(log_message)
+
+ self.byte_size = self.payload_byte_size_varint_length + self.payload_byte_size
+ self.byte_size += FIRST_OVERFLOW_PAGE_NUMBER_LENGTH if self.has_overflow else 0
+ self.end_offset = self.start_offset + self.byte_size - self.payload_byte_size + self.bytes_on_first_page
+
+ self.overflow_byte_size = self.payload_byte_size - self.bytes_on_first_page
+ self.expected_number_of_overflow_pages, \
+ self.expected_last_overflow_page_content_size = calculate_expected_overflow(self.overflow_byte_size, u)
+
+ self.md5_hex_digest = get_md5_hash(page[self.start_offset:self.end_offset])
+
+ if self.has_overflow:
+
+ """
+
+ The overflow pages are in a dictionary keyed off of their page number in the format:
+ overflow_page[OVERFLOW_PAGE_NUMBER] = OVERFLOW_PAGE
+
+            Originally, the overflow pages were nested objects, i.e. each overflow page had the following overflow
+            page within it, and so on. However, this led to recursion depth problems with larger cell content.
+            It was changed to be a dictionary of pages here instead.
+
+            Note: Although overflow pages have to be replaced when any overflow page in a chain is updated, the
+                  overflow here may not be updated because a different cell on this page may have been the one
+                  updated. Therefore, we allow the first overflow page to be from an earlier version. However, the
+                  overflow pages still check that the versions of all overflow pages from the first overflow page
+                  onward in the linked list are equal.
+
+ """
+
+ self.overflow_pages = {}
+ payload_remaining = self.overflow_byte_size
+
+ overflow_page = OverflowPage(self._version_interface, self.overflow_page_number, self.page_number,
+ FIRST_OVERFLOW_PARENT_PAGE_NUMBER, FIRST_OVERFLOW_PAGE_INDEX,
+ payload_remaining)
+
+ self.overflow_pages[overflow_page.number] = overflow_page
+ self.last_overflow_page_content_size = overflow_page.content_length
+
+ while overflow_page.next_overflow_page_number:
+ payload_remaining = payload_remaining - overflow_page.size + OVERFLOW_HEADER_LENGTH
+ overflow_page = OverflowPage(self._version_interface, overflow_page.next_overflow_page_number,
+ self.page_number, overflow_page.number, overflow_page.index + 1,
+ payload_remaining)
+ self.overflow_pages[overflow_page.number] = overflow_page
+ self.last_overflow_page_content_size = overflow_page.content_length
+
+ if self.expected_number_of_overflow_pages != self.number_of_overflow_pages:
+ log_message = "The number of expected overflow pages: {} was not the actual number of overflow pages " \
+ "parsed: {} for b-tree index leaf cell index: {} at offset: {} for page: {} in " \
+ "page version: {} for version: {}."
+ log_message = log_message.format(self.expected_number_of_overflow_pages, self.number_of_overflow_pages,
+ self.index, self.start_offset, self.page_number, self.page_version_number,
+ self.version_number)
+ self._logger.error(log_message)
+ raise CellParsingError(log_message)
+
+ if self.expected_last_overflow_page_content_size != self.last_overflow_page_content_size:
+ log_message = "The expected last overflow page content size: {} was not the actual last overflow page " \
+ "content size parsed: {} for b-tree index leaf cell index: {} at offset: {} for " \
+ "page: {} in page version: {} for version: {}."
+ log_message = log_message.format(self.expected_last_overflow_page_content_size,
+ self.last_overflow_page_content_size, self.index, self.start_offset,
+ self.page_number, self.page_version_number, self.version_number)
+ raise CellParsingError(log_message)
+
+ self.payload = Record(page, self.payload_offset, self.payload_byte_size,
+ self.bytes_on_first_page, self.overflow)
+
+ def stringify(self, padding=""):
+ string = "\n" \
+ + padding + "Payload Byte Size: {}\n" \
+ + padding + "Payload Byte Size VARINT Length: {}\n" \
+ + padding + "Payload Offset: {}\n" \
+ + padding + "Bytes on First Page: {}\n" \
+ + padding + "Has Overflow: {}\n" \
+ + padding + "Overflow Byte Size: {}\n" \
+ + padding + "Expected Number of Overflow Pages: {}\n" \
+ + padding + "Expected Last Overflow Page Content Size: {}\n" \
+ + padding + "Number of Overflow Pages: {}\n" \
+ + padding + "Overflow Page Number Offset: {}\n" \
+ + padding + "Overflow Page Number: {}\n" \
+ + padding + "Last Overflow Page Content Size: {}\n" \
+ + padding + "Overflow (Hex): {}"
+ string = string.format(self.payload_byte_size,
+ self.payload_byte_size_varint_length,
+ self.payload_offset,
+ self.bytes_on_first_page,
+ self.has_overflow,
+ self.overflow_byte_size,
+ self.expected_number_of_overflow_pages,
+ self.expected_last_overflow_page_content_size,
+ self.number_of_overflow_pages,
+ self.overflow_page_number_offset,
+ self.overflow_page_number,
+ self.last_overflow_page_content_size,
+ hexlify(self.overflow))
+ string += "\n" + padding + "Payload:\n{}".format(self.payload.stringify(padding + "\t"))
+ if self.has_overflow:
+ overflow_page = self.overflow_pages[self.overflow_page_number]
+            string += "\n" + padding + "Overflow Page:\n{}".format(overflow_page.stringify(padding + "\t"))
+ while overflow_page.next_overflow_page_number:
+ overflow_page = self.overflow_pages[overflow_page.next_overflow_page_number]
+                string += "\n" + padding + "Overflow Page:\n{}".format(overflow_page.stringify(padding + "\t"))
+ return super(IndexLeafCell, self).stringify(padding) + string
+
+ @property
+ def number_of_overflow_pages(self):
+ return len(self.overflow_pages) if self.overflow_pages else 0
+
+ @property
+ def overflow(self):
+ overflow = bytearray()
+
+ if not self.has_overflow:
+
+ return overflow
+
+ else:
+
+ overflow_page = self.overflow_pages[self.overflow_page_number]
+ overflow += overflow_page.content
+ while overflow_page.next_overflow_page_number:
+ overflow_page = self.overflow_pages[overflow_page.next_overflow_page_number]
+ overflow += overflow_page.content
+
+ if len(overflow) != self.overflow_byte_size:
+                log_message = "The expected overflow size: {} did not match the overflow size parsed: {} " \
+                              "for b-tree index leaf cell index: {} at offset: {} for page: {} " \
+                              "in page version: {} for version: {}."
+ log_message = log_message.format(self.overflow_byte_size, len(overflow), self.index, self.start_offset,
+ self.page_number, self.page_version_number, self.version_number)
+ raise CellParsingError(log_message)
+
+ return overflow
+
+
+class Freeblock(BTreeCell):
+
+ def __init__(self, version_interface, page_version_number, file_offset, page_number, page, index, offset):
+
+        super(Freeblock, self).__init__(version_interface, page_version_number, file_offset,
+                                        page_number, index, offset)
+
+ next_freeblock_end_offset = self.start_offset + NEXT_FREEBLOCK_OFFSET_LENGTH
+ self.next_freeblock_offset = unpack(b">H", page[self.start_offset:next_freeblock_end_offset])[0]
+ self.content_start_offset = next_freeblock_end_offset + FREEBLOCK_BYTE_LENGTH
+ self.byte_size = unpack(b">H", page[next_freeblock_end_offset:self.content_start_offset])[0]
+ self.content_end_offset = self.start_offset + self.byte_size
+ self.end_offset = self.content_end_offset
+
+ self.content_length = self.end_offset - self.content_start_offset
+
+ self.md5_hex_digest = get_md5_hash(page[self.start_offset:self.end_offset])
+
+ def stringify(self, padding=""):
+ string = "\n" \
+ + padding + "Next Freeblock Offset: {}\n" \
+ + padding + "Content Start Offset: {}\n" \
+ + padding + "Content End Offset: {}\n" \
+ + padding + "Content Length: {}\n" \
+ + padding + "Content (Hex): {}"
+ string = string.format(self.next_freeblock_offset,
+ self.content_start_offset,
+ self.content_end_offset,
+ self.content_length,
+ hexlify(self.content))
+ return super(Freeblock, self).stringify(padding) + string
+
+ @property
+ def content(self):
+
+ """
+
+        This property returns the content inside this freeblock. This is only the body of the freeblock (the
+        unallocated portion) and does not include the 4 byte freeblock header.
+
+ :return: bytearray The byte array for freeblock content.
+
+ """
+
+ if self.content_length == 0:
+ return bytearray()
+ else:
+ return self._version_interface.get_page_data(self.page_number, self.content_start_offset,
+ self.content_length)
+
+
+class Fragment(BTreeCell):
+
+ """
+
+
+
+    Note: Fragments are three bytes in length or less. If four or more bytes become unallocated within the cell
+          area of the page, then a freeblock is created instead, since a freeblock requires four bytes. (The first
+          two bytes point to the offset of the next freeblock in the freeblock linked list on the page and the
+          second two bytes are the size of the freeblock in bytes, including this 4 byte header.)
+
+          However, fragments can be found with byte sizes greater than three. This occurs because multiple cells
+          could be added and deleted next to each other, creating fragments of size 3 or less next to each other.
+          Since we cannot determine exactly where the breaks between these fragments are, we specify the whole
+          block as a fragment, resulting in fragment sizes greater than the limit of 3 bytes.
+
+          Therefore, if a fragment is greater than 3 bytes, it is composed of multiple fragments. Keep in mind,
+          however, that the inverse is not true. If a fragment is three bytes or less, it could still be an
+          aggregate of multiple fragments, such as a fragment of 1 byte and another fragment of 2 bytes.
+
+    Note: Since the byte size is the size of the actual content, there is no separate content size attribute.
+
+ """
+
+ def __init__(self, version_interface, page_version_number, file_offset, page_number,
+ page, index, start_offset, end_offset):
+
+ super(Fragment, self).__init__(version_interface, page_version_number, file_offset,
+ page_number, index, start_offset)
+
+ self.end_offset = end_offset
+ self.byte_size = self.end_offset - self.start_offset
+
+ self.md5_hex_digest = get_md5_hash(page[self.start_offset:self.end_offset])
+
+ def stringify(self, padding=""):
+ string = "\n" + padding + "Content (Hex): {}"
+ string = string.format(hexlify(self.content))
+ return super(Fragment, self).stringify(padding) + string
+
+ @property
+ def content(self):
+
+ """
+
+ This property returns the content inside this fragment.
+
+ :return: bytearray The byte array for fragment content.
+
+ """
+
+ return self._version_interface.get_page_data(self.page_number, self.start_offset, self.end_offset)
diff --git a/sqlite_dissect/file/database/payload.py b/sqlite_dissect/file/database/payload.py
new file mode 100644
index 0000000..4a71e65
--- /dev/null
+++ b/sqlite_dissect/file/database/payload.py
@@ -0,0 +1,221 @@
+from abc import ABCMeta
+from binascii import hexlify
+from logging import getLogger
+from re import sub
+from sqlite_dissect.constants import LOGGER_NAME
+from sqlite_dissect.exception import RecordParsingError
+from sqlite_dissect.utilities import decode_varint
+from sqlite_dissect.utilities import get_md5_hash
+from sqlite_dissect.utilities import get_record_content
+from sqlite_dissect.utilities import get_serial_type_signature
+
+"""
+
+payload.py
+
+This script holds the objects used for parsing payloads from the cells in SQLite b-tree pages for
+index leaf, index interior, and table leaf. (Table Interior pages do not have payloads in their cells.)
+
+This script holds the following object(s):
+Payload(object)
+Record(Payload)
+RecordColumn(object)
+
+"""
+
+
+class Payload(object):
+
+ __metaclass__ = ABCMeta
+
+ def __init__(self):
+
+ self.start_offset = None
+ self.byte_size = None
+ self.end_offset = None
+
+ self.has_overflow = False
+ self.bytes_on_first_page = None
+ self.overflow_byte_size = None
+
+ self.header_byte_size = None
+ self.header_byte_size_varint_length = None
+ self.header_start_offset = None
+ self.header_end_offset = None
+ self.body_start_offset = None
+ self.body_end_offset = None
+
+ self.md5_hex_digest = None
+
+ self.record_columns = []
+ self.serial_type_signature = ""
+
+ def __repr__(self):
+ return self.__str__().encode("hex")
+
+ def __str__(self):
+ return sub("\t", "", sub("\n", " ", self.stringify()))
+
+ def stringify(self, padding="", print_record_columns=True):
+ string = padding + "Start Offset: {}\n" \
+ + padding + "End Offset: {}\n" \
+ + padding + "Byte Size: {}\n" \
+ + padding + "MD5 Hex Digest: {}\n" \
+ + padding + "Header Byte Size: {}\n" \
+ + padding + "Header Byte Size VARINT Length: {}\n" \
+ + padding + "Header Start Offset: {}\n" \
+ + padding + "Header End Offset: {}\n" \
+ + padding + "Body Start Offset: {}\n" \
+ + padding + "Body End Offset: {}\n" \
+ + padding + "Has Overflow: {}\n" \
+ + padding + "Bytes on First Page: {}\n" \
+ + padding + "Overflow Byte Size: {}\n" \
+ + padding + "Serial Type Signature: {}"
+ string = string.format(self.start_offset,
+ self.end_offset,
+ self.byte_size,
+ self.md5_hex_digest,
+ self.header_byte_size,
+ self.header_byte_size_varint_length,
+ self.header_start_offset,
+ self.header_end_offset,
+ self.body_start_offset,
+ self.body_end_offset,
+ self.has_overflow,
+ self.bytes_on_first_page,
+ self.overflow_byte_size,
+ self.serial_type_signature)
+ if print_record_columns:
+ for record_column in self.record_columns:
+ string += "\n" + padding + "Record Column:\n{}".format(record_column.stringify(padding + "\t"))
+ return string
+
+
+class Record(Payload):
+
+ def __init__(self, page, payload_offset, payload_byte_size, bytes_on_first_page=None, overflow=bytearray()):
+
+ super(Record, self).__init__()
+
+ logger = getLogger(LOGGER_NAME)
+
+ if bytes_on_first_page is None:
+
+ bytes_on_first_page = payload_byte_size
+
+ if overflow:
+                log_message = "Bytes on first page was not specified for the record when overflow was set (hex): {}."
+ log_message = log_message.format(hexlify(overflow))
+ logger.error(log_message)
+ raise RecordParsingError(log_message)
+
+ if bytes_on_first_page < payload_byte_size and not overflow:
+            log_message = "Bytes on first page: {} is less than the payload byte size: {} and overflow is not set."
+ log_message = log_message.format(bytes_on_first_page, payload_byte_size)
+ logger.error(log_message)
+ raise RecordParsingError(log_message)
+
+ if bytes_on_first_page > payload_byte_size:
+            log_message = "Bytes on first page: {} is greater than the payload byte size: {}."
+ log_message = log_message.format(bytes_on_first_page, payload_byte_size)
+ logger.error(log_message)
+ raise RecordParsingError(log_message)
+
+ self.start_offset = payload_offset
+ self.byte_size = payload_byte_size
+ self.end_offset = self.start_offset + bytes_on_first_page
+
+ self.has_overflow = False if not overflow else True
+ self.bytes_on_first_page = bytes_on_first_page
+ self.overflow_byte_size = self.byte_size - self.bytes_on_first_page
+
+ if self.overflow_byte_size == 0 and overflow:
+            log_message = "Overflow byte size was calculated as: {} but overflow content was set (hex): {}."
+ log_message = log_message.format(self.overflow_byte_size, hexlify(overflow))
+ logger.error(log_message)
+ raise RecordParsingError(log_message)
+
+ self.header_byte_size, self.header_byte_size_varint_length = decode_varint(page, self.start_offset)
+ self.header_start_offset = self.start_offset
+ self.header_end_offset = self.start_offset + self.header_byte_size
+ self.body_start_offset = self.header_end_offset
+ self.body_end_offset = self.end_offset
+
+ current_page_record_content = page[self.start_offset:self.end_offset]
+
+ total_record_content = current_page_record_content + overflow
+
+ if len(total_record_content) != self.byte_size:
+            log_message = "The record content length: {} does not match the specified payload byte " \
+                          "size: {} on page."
+ log_message = log_message.format(len(total_record_content), self.byte_size)
+ logger.error(log_message)
+ raise RecordParsingError(log_message)
+
+ self.md5_hex_digest = get_md5_hash(total_record_content)
+
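+        # Descriptive note: a SQLite record is a header followed by a body. The header begins with a varint giving
+        # the total header size (including itself) and is followed by one serial type varint per column; the body
+        # then holds each column's content in the same order. The loop below walks the header varints and reads
+        # the corresponding content out of the body for each column.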
+ current_header_offset = self.header_byte_size_varint_length
+ current_body_offset = 0
+ column_index = 0
+ while current_header_offset < self.header_byte_size:
+
+ serial_type, serial_type_varint_length = decode_varint(total_record_content, current_header_offset)
+
+ self.serial_type_signature += str(get_serial_type_signature(serial_type))
+
+ record_column_md5_hash_string = total_record_content[current_header_offset:
+ current_header_offset + serial_type_varint_length]
+
+ body_content = total_record_content[self.header_byte_size:self.byte_size]
+
+ content_size, value = get_record_content(serial_type, body_content, current_body_offset)
+
+ """
+
+ Note: If content_size == 0 then this will read out no data
+
+ """
+
+ record_column_md5_hash_string += body_content[current_body_offset:current_body_offset + content_size]
+
+ record_column_md5_hex_digest = get_md5_hash(record_column_md5_hash_string)
+
+ record_column = RecordColumn(column_index, serial_type, serial_type_varint_length,
+ content_size, value, record_column_md5_hex_digest)
+
+ self.record_columns.append(record_column)
+
+ current_header_offset += serial_type_varint_length
+ current_body_offset += content_size
+ column_index += 1
+
+
+class RecordColumn(object):
+
+ def __init__(self, index, serial_type, serial_type_varint_length, content_size, value, md5_hex_digest):
+ self.index = index
+ self.serial_type = serial_type
+ self.serial_type_varint_length = serial_type_varint_length
+ self.content_size = content_size
+ self.value = value
+ self.md5_hex_digest = md5_hex_digest
+
+ def __repr__(self):
+ return self.__str__().encode("hex")
+
+ def __str__(self):
+ return sub("\t", "", sub("\n", " ", self.stringify()))
+
+ def stringify(self, padding=""):
+ string = padding + "Index: {}\n" \
+ + padding + "Serial Type: {}\n" \
+ + padding + "Serial Type VARINT Length: {}\n" \
+ + padding + "Content Size: {}\n" \
+ + padding + "Value: {}\n" \
+ + padding + "MD5 Hex Digest: {}"
+ return string.format(self.index,
+ self.serial_type,
+ self.serial_type_varint_length,
+ self.content_size,
+ self.value,
+ self.md5_hex_digest)
diff --git a/sqlite_dissect/file/database/utilities.py b/sqlite_dissect/file/database/utilities.py
new file mode 100644
index 0000000..bae73e6
--- /dev/null
+++ b/sqlite_dissect/file/database/utilities.py
@@ -0,0 +1,268 @@
+from logging import getLogger
+from math import floor
+from sqlite_dissect.constants import LOGGER_NAME
+from sqlite_dissect.constants import PAGE_TYPE
+from sqlite_dissect.constants import POINTER_MAP_ENTRY_LENGTH
+from sqlite_dissect.exception import ParsingError
+from sqlite_dissect.file.database.page import IndexInteriorPage
+from sqlite_dissect.file.database.page import IndexLeafPage
+from sqlite_dissect.file.database.page import PointerMapPage
+from sqlite_dissect.file.database.page import TableInteriorPage
+from sqlite_dissect.file.database.page import TableLeafPage
+
+"""
+
+utilities.py
+
+This script holds utility functions for dealing with database specific objects such as pages rather than more general
+utility methods.
+
+This script holds the following function(s):
+aggregate_leaf_cells(b_tree_page, accounted_for_cell_md5s=None, records_only=False)
+create_pointer_map_pages(version, database_size_in_pages, page_size)
+get_maximum_pointer_map_entries_per_page(page_size)
+get_page_numbers_and_types_from_b_tree_page(b_tree_page)
+get_pages_from_b_tree_page(b_tree_page)
+
+"""
+
+
+def aggregate_leaf_cells(b_tree_page, accounted_for_cell_md5s=None, payloads_only=False):
+
+ """
+
+    This function will recursively parse through all cells across all leaf pages in a b-tree and return the total
+    number of cells found along with a dictionary of cells where the dictionary is in the form of:
+    cells[CELL_MD5_HEX_DIGEST] = cell. Therefore, without the accounted for cell md5s specified,
+    the number of cells will match the length of the cells dictionary.
+
+    If the accounted for cell md5s field is set with entries, then those entries will be excluded from the dictionary
+    but will still be counted in the number of cells. Therefore, with the accounted for cell md5s specified, the
+    number of cells will match the length of the cells dictionary plus the number of accounted for cell md5s found.
+
+    If the payloads only flag is specified, the dictionary will only contain payloads (i.e. records) and not the
+    cells: cells[CELL_MD5_HEX_DIGEST] = cell.payload.
+
+    Note: As this function name implies, this only parses through the leaf pages of table and index b-tree pages.
+          Cells of interior pages will not be handled by this function.
+
+ :param b_tree_page:
+ :param accounted_for_cell_md5s:
+ :param payloads_only:
+
+    :return: tuple(number_of_cells, cells)
+
+ :raise:
+
+ """
+
+ accounted_for_cell_md5s = set() if accounted_for_cell_md5s is None else accounted_for_cell_md5s
+
+ number_of_cells = 0
+ cells = {}
+
+ if isinstance(b_tree_page, TableLeafPage) or isinstance(b_tree_page, IndexLeafPage):
+
+ number_of_cells += len(b_tree_page.cells)
+
+ if payloads_only:
+ for cell in b_tree_page.cells:
+ if cell.md5_hex_digest not in accounted_for_cell_md5s:
+ accounted_for_cell_md5s.add(cell.md5_hex_digest)
+ cells[cell.md5_hex_digest] = cell.payload
+ else:
+ for cell in b_tree_page.cells:
+ if cell.md5_hex_digest not in accounted_for_cell_md5s:
+ accounted_for_cell_md5s.add(cell.md5_hex_digest)
+ cells[cell.md5_hex_digest] = cell
+
+ elif isinstance(b_tree_page, TableInteriorPage) or isinstance(b_tree_page, IndexInteriorPage):
+
+ right_most_page_number_of_records, right_most_page_records = aggregate_leaf_cells(b_tree_page.right_most_page,
+ accounted_for_cell_md5s,
+ payloads_only)
+ number_of_cells += right_most_page_number_of_records
+ cells.update(right_most_page_records)
+
+ for cell in b_tree_page.cells:
+
+ left_child_page_number_of_records, left_child_page_records = aggregate_leaf_cells(cell.left_child_page,
+ accounted_for_cell_md5s,
+ payloads_only)
+ number_of_cells += left_child_page_number_of_records
+ cells.update(left_child_page_records)
+
+ else:
+
+ log_message = "Invalid page type found: {} to aggregate cells on.".format(type(b_tree_page))
+ getLogger(LOGGER_NAME).error(log_message)
+ raise ValueError(log_message)
+
+ return number_of_cells, cells
+
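+# Illustrative usage sketch (hypothetical; assumes a parsed table b-tree root page object named root_page):
+#
+#     number_of_cells, cells = aggregate_leaf_cells(root_page, payloads_only=True)
+#     for cell_md5_hex_digest, record in cells.iteritems():
+#         print [record_column.value for record_column in record.record_columns]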
+
+def create_pointer_map_pages(version, database_size_in_pages, page_size):
+
+ """
+
+
+
+    Note: When calling this function, the caller should have already determined whether pointer map pages exist in
+          the file they are parsing. This can be done by checking whether the largest root b-tree page number is set
+          in the database header. If it is not set, then pointer map pages are not enabled. This function has no
+          way, nor any need, to check that field and solely computes what the pointer map pages would be based on
+          the database size in pages and the page size.
+
+ :param version:
+ :param database_size_in_pages:
+ :param page_size:
+
+ :return:
+
+ """
+
+ logger = getLogger(LOGGER_NAME)
+
+ maximum_entries_per_page = get_maximum_pointer_map_entries_per_page(page_size)
+
+ number_of_pointer_map_pages = 1
+ if database_size_in_pages - 2 > maximum_entries_per_page:
+ database_pages_left = database_size_in_pages - 2 - maximum_entries_per_page
+ while database_pages_left > 0:
+ database_pages_left -= maximum_entries_per_page - 1
+ number_of_pointer_map_pages += 1
+
+ pointer_map_pages = []
+ pointer_map_page_number = 2
+ number_of_pointer_map_pages = 0
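+    # Descriptive note: the first pointer map page sits at page 2 and describes the pages that follow it; each
+    # subsequent pointer map page appears immediately after the block of pages covered by the previous one. The
+    # loop below walks those positions and trims the entry count of the final pointer map page so the described
+    # pages do not run past the end of the database.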
+ while pointer_map_page_number < database_size_in_pages:
+
+ number_of_pointer_map_pages += 1
+ entries = number_of_pointer_map_pages * maximum_entries_per_page
+ next_pointer_map_page_number = entries + 2 + number_of_pointer_map_pages
+
+ number_of_entries = maximum_entries_per_page
+ if next_pointer_map_page_number > database_size_in_pages:
+ previous_entries = ((number_of_pointer_map_pages - 1) * maximum_entries_per_page)
+ number_of_entries = database_size_in_pages - previous_entries - number_of_pointer_map_pages - 1
+
+ pointer_map_pages.append(PointerMapPage(version, pointer_map_page_number, number_of_entries))
+ pointer_map_page_number = next_pointer_map_page_number
+
+ if pointer_map_page_number == database_size_in_pages:
+ log_message = "The next pointer map page number: {} is equal to the database size in pages: {} " \
+ "for version: {} resulting in erroneous pointer map pages."
+ log_message = log_message.format(pointer_map_page_number, database_size_in_pages, version.version_number)
+ logger.error(log_message)
+ raise ParsingError(log_message)
+
+ """
+
+ Iterate through the pointer map pages that were created and tally up all the pointer map pages along with their
+ pointer map entries. This total should match the total number of pages in the database.
+
+    Note: The first pointer map page in the database is page 2 and therefore the root page always appears before the
+          first pointer map page. Below, the calculated database page count starts at one to account for the root
+          database page.
+
+ """
+
+ calculated_database_pages = 1
+ for pointer_map_page in pointer_map_pages:
+ calculated_database_pages += 1
+ calculated_database_pages += pointer_map_page.number_of_entries
+
+ if calculated_database_pages != database_size_in_pages:
+ log_message = "The calculated number of database pages from the pointer map pages: {} does not equal the " \
+ "database size in pages: {} for version: {}."
+ log_message = log_message.format(calculated_database_pages, database_size_in_pages, version.version_number)
+ logger.error(log_message)
+ raise ParsingError(log_message)
+
+ return pointer_map_pages
+
+
+def get_maximum_pointer_map_entries_per_page(page_size):
+ return int(floor(float(page_size)/POINTER_MAP_ENTRY_LENGTH))
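+# Illustrative sketch (assumed page size): each pointer map entry is POINTER_MAP_ENTRY_LENGTH bytes, defined by the
+# SQLite file format as 5 (a 1 byte page type followed by a 4 byte big-endian parent page number), so a 4096 byte
+# page yields floor(4096 / 5) = 819 entries per pointer map page.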
+
+
+def get_page_numbers_and_types_from_b_tree_page(b_tree_page):
+
+ logger = getLogger(LOGGER_NAME)
+
+ b_tree_page_numbers = {}
+
+ if isinstance(b_tree_page, TableLeafPage):
+ b_tree_page_numbers[b_tree_page.number] = PAGE_TYPE.B_TREE_TABLE_LEAF
+ elif isinstance(b_tree_page, IndexLeafPage):
+ b_tree_page_numbers[b_tree_page.number] = PAGE_TYPE.B_TREE_INDEX_LEAF
+ elif isinstance(b_tree_page, TableInteriorPage):
+ b_tree_page_numbers[b_tree_page.number] = PAGE_TYPE.B_TREE_TABLE_INTERIOR
+ b_tree_page_numbers.update(get_page_numbers_and_types_from_b_tree_page(b_tree_page.right_most_page))
+ for b_tree_cell in b_tree_page.cells:
+ b_tree_page_numbers.update(get_page_numbers_and_types_from_b_tree_page(b_tree_cell.left_child_page))
+ elif isinstance(b_tree_page, IndexInteriorPage):
+ b_tree_page_numbers[b_tree_page.number] = PAGE_TYPE.B_TREE_INDEX_INTERIOR
+ b_tree_page_numbers.update(get_page_numbers_and_types_from_b_tree_page(b_tree_page.right_most_page))
+ for b_tree_cell in b_tree_page.cells:
+ b_tree_page_numbers.update(get_page_numbers_and_types_from_b_tree_page(b_tree_cell.left_child_page))
+ else:
+ log_message = "The b-tree page is not a BTreePage object but has a type of: {}."
+ log_message = log_message.format(type(b_tree_page))
+ logger.error(log_message)
+ raise ValueError(log_message)
+
+ if not isinstance(b_tree_page, TableInteriorPage):
+ for cell in b_tree_page.cells:
+ if cell.has_overflow:
+ overflow_page = cell.overflow_pages[cell.overflow_page_number]
+ b_tree_page_numbers[overflow_page.number] = PAGE_TYPE.OVERFLOW
+ while overflow_page.next_overflow_page_number:
+ overflow_page = cell.overflow_pages[overflow_page.next_overflow_page_number]
+ b_tree_page_numbers[overflow_page.number] = PAGE_TYPE.OVERFLOW
+
+ return b_tree_page_numbers
+
+
+def get_pages_from_b_tree_page(b_tree_page):
+
+ """
+
+
+
+ Note: The b-tree page sent in is included in the return result.
+
+ :param b_tree_page:
+
+ :return:
+
+ """
+
+ logger = getLogger(LOGGER_NAME)
+
+ b_tree_pages = []
+
+ if isinstance(b_tree_page, TableLeafPage) or isinstance(b_tree_page, IndexLeafPage):
+ b_tree_pages.append(b_tree_page)
+ elif isinstance(b_tree_page, TableInteriorPage) or isinstance(b_tree_page, IndexInteriorPage):
+ b_tree_pages.append(b_tree_page)
+ b_tree_pages.extend(get_pages_from_b_tree_page(b_tree_page.right_most_page))
+ for b_tree_cell in b_tree_page.cells:
+ b_tree_pages.extend(get_pages_from_b_tree_page(b_tree_cell.left_child_page))
+ else:
+ log_message = "The b-tree page is not a BTreePage object but has a type of: {}."
+ log_message = log_message.format(type(b_tree_page))
+ logger.error(log_message)
+ raise ValueError(log_message)
+
+ if not isinstance(b_tree_page, TableInteriorPage):
+ for cell in b_tree_page.cells:
+ if cell.has_overflow:
+ overflow_page = cell.overflow_pages[cell.overflow_page_number]
+ b_tree_pages.append(overflow_page)
+ while overflow_page.next_overflow_page_number:
+ overflow_page = cell.overflow_pages[overflow_page.next_overflow_page_number]
+ b_tree_pages.append(overflow_page)
+
+ return b_tree_pages
diff --git a/sqlite_dissect/file/file_handle.py b/sqlite_dissect/file/file_handle.py
new file mode 100644
index 0000000..7791ceb
--- /dev/null
+++ b/sqlite_dissect/file/file_handle.py
@@ -0,0 +1,262 @@
+import os
+from logging import getLogger
+from re import sub
+from warnings import warn
+from sqlite_dissect.constants import FILE_TYPE
+from sqlite_dissect.constants import LOCK_BYTE_PAGE_START_OFFSET
+from sqlite_dissect.constants import LOGGER_NAME
+from sqlite_dissect.constants import ROLLBACK_JOURNAL_HEADER_LENGTH
+from sqlite_dissect.constants import SQLITE_DATABASE_HEADER_LENGTH
+from sqlite_dissect.constants import UTF_8
+from sqlite_dissect.constants import UTF_8_DATABASE_TEXT_ENCODING
+from sqlite_dissect.constants import UTF_16BE
+from sqlite_dissect.constants import UTF_16BE_DATABASE_TEXT_ENCODING
+from sqlite_dissect.constants import UTF_16LE
+from sqlite_dissect.constants import UTF_16LE_DATABASE_TEXT_ENCODING
+from sqlite_dissect.constants import WAL_HEADER_LENGTH
+from sqlite_dissect.constants import WAL_INDEX_HEADER_LENGTH
+from sqlite_dissect.file.database.header import DatabaseHeader
+from sqlite_dissect.file.journal.header import RollbackJournalHeader
+from sqlite_dissect.file.wal.header import WriteAheadLogHeader
+from sqlite_dissect.file.wal_index.header import WriteAheadLogIndexHeader
+
+"""
+
+file_handle.py
+
+This script holds the file handle for file objects to be worked with in relation to the database, wal, journal, and
+other supported file types specified in the FILE_TYPE list.
+
+This script holds the following object(s):
+FileHandle(object)
+
+"""
+
+
+class FileHandle(object):
+
+ def __init__(self, file_type, file_identifier, database_text_encoding=None, file_size=None):
+
+ """
+
+ Constructor. This constructor initializes this object.
+
+        Note: Either the file name (full path) or a file object needs to be specified as the file_identifier. If a
+              file object is supplied, it is treated as externally controlled and will not be closed by this class.
+              The file size is taken from the file_size parameter if supplied; otherwise it is determined from the
+              underlying file object.
+
+ :param file_type: str The type of the file. Must be one of the file types in the FILE_TYPE list.
+ :param file_identifier: str or file The full file path to the file to be opened or the file object.
+ :param database_text_encoding: str The encoding of the text strings in the sqlite database file.
+ :param file_size: int Optional parameter to supply the file size.
+
+ :raise: IOError If the file_name is specified and upon opening the file:
+ 1.) the file name specifies a file that does not exist, or
+                        2.) the file name specifies a file that is not a file, or
+ 3.) the file name is unable to be opened in "rb" mode.
+ :raise: ValueError If:
+ 1.) both the file name and file are set, or
+                        2.) neither the file name nor the file is set, or
+ 3.) the file type is not a valid file type.
+
+ """
+
+ self._logger = getLogger(LOGGER_NAME)
+
+ self.file_type = file_type
+ self.file_object = None
+ self.file_externally_controlled = False
+ self._database_text_encoding = database_text_encoding
+
+ if isinstance(file_identifier, basestring):
+
+ """
+
+ Note: The file identifier is the name (full path) of the file if it is an instance of basestring. We check
+ to make sure the file exists and it is actually a file.
+
+ """
+
+ if not os.path.exists(file_identifier):
+ log_message = "The file name specified does not exist: {}".format(file_identifier)
+ self._logger.error(log_message)
+ raise IOError(log_message)
+
+ if not os.path.isfile(file_identifier):
+ log_message = "The file name specified is not a file: {}".format(file_identifier)
+ self._logger.error(log_message)
+ raise IOError(log_message)
+
+ try:
+ self.file_object = open(file_identifier, "rb")
+ except IOError:
+ log_message = "Unable to open the file in \"rb\" mode with file name: {}.".format(file_identifier)
+ self._logger.error(log_message)
+ raise
+
+ else:
+ self.file_object = file_identifier
+ self.file_externally_controlled = True
+
+ if file_size:
+ self.file_size = file_size
+ else:
+ try:
+ self.file_size = os.fstat(self.file_object.fileno()).st_size
+ except AttributeError:
+ # If all else fails, use the seek to the end of the file trick.
+ self.file_object.seek(0, os.SEEK_END)
+ self.file_size = self.file_object.tell()
+ self.file_object.seek(0)
+
+ if self.file_type == FILE_TYPE.DATABASE:
+
+ if self.file_size > LOCK_BYTE_PAGE_START_OFFSET:
+                log_message = "The file size: {} is greater than the lock byte page offset: {} and the lock " \
+                              "byte page is not supported."
+                log_message = log_message.format(self.file_size, LOCK_BYTE_PAGE_START_OFFSET)
+                self._logger.error(log_message)
+                raise NotImplementedError(log_message)
+
+ try:
+
+ database_header = DatabaseHeader(self.file_object.read(SQLITE_DATABASE_HEADER_LENGTH))
+
+ if self._database_text_encoding:
+                    log_message = "Database text encoding was specified as: {} but should not be set for a " \
+                                  "database file since it is read from the database header."
+                    log_message = log_message.format(self._database_text_encoding)
+                    self._logger.error(log_message)
+                    raise ValueError(log_message)
+
+ if database_header.database_text_encoding == UTF_8_DATABASE_TEXT_ENCODING:
+ self._database_text_encoding = UTF_8
+ elif database_header.database_text_encoding == UTF_16LE_DATABASE_TEXT_ENCODING:
+ self._database_text_encoding = UTF_16LE
+ elif database_header.database_text_encoding == UTF_16BE_DATABASE_TEXT_ENCODING:
+ self._database_text_encoding = UTF_16BE
+ elif database_header.database_text_encoding:
+ log_message = "The database text encoding: {} is not recognized as a valid database text encoding."
+ log_message = log_message.format(database_header.database_text_encoding)
+ self._logger.error(log_message)
+ raise RuntimeError(log_message)
+
+ self.header = database_header
+
+ except:
+ log_message = "Failed to initialize the database header."
+ self._logger.error(log_message)
+ raise
+
+ elif self.file_type == FILE_TYPE.WAL:
+
+ try:
+ self.header = WriteAheadLogHeader(self.file_object.read(WAL_HEADER_LENGTH))
+ except:
+ log_message = "Failed to initialize the write ahead log header."
+ self._logger.error(log_message)
+ raise
+
+ elif self.file_type == FILE_TYPE.WAL_INDEX:
+
+ try:
+ self.header = WriteAheadLogIndexHeader(self.file_object.read(WAL_INDEX_HEADER_LENGTH))
+ except:
+ log_message = "Failed to initialize the write ahead log index header."
+ self._logger.error(log_message)
+ raise
+
+ elif self.file_type == FILE_TYPE.ROLLBACK_JOURNAL:
+
+ try:
+ self.header = RollbackJournalHeader(self.file_object.read(ROLLBACK_JOURNAL_HEADER_LENGTH))
+ except:
+ log_message = "Failed to initialize the rollback journal header."
+ self._logger.error(log_message)
+ raise
+
+ else:
+
+ log_message = "Invalid file type specified: {}.".format(self.file_type)
+ self._logger.error(log_message)
+ raise ValueError(log_message)
+
+ def __repr__(self):
+ return self.__str__().encode("hex")
+
+ def __str__(self):
+ return sub("\t", "", sub("\n", " ", self.stringify()))
+
+ def stringify(self, padding="", print_header=True):
+ string = padding + "File Type: {}\n" \
+ + padding + "File Size: {}\n" \
+ + padding + "Database Text Encoding: {}"
+ string = string.format(self.file_type,
+ self.file_size,
+ self.database_text_encoding)
+ if print_header:
+ string += "\n" + padding + "Header:\n{}".format(self.header.stringify(padding + "\t"))
+ return string
+
+ @property
+ def database_text_encoding(self):
+ return self._database_text_encoding
+
+ @database_text_encoding.setter
+ def database_text_encoding(self, database_text_encoding):
+
+ if self._database_text_encoding and self._database_text_encoding != database_text_encoding:
+ log_message = "Database text encoding is set to: {} and cannot be set differently to: {}. " \
+ "Operation not permitted."
+ log_message = log_message.format(self._database_text_encoding, database_text_encoding)
+ self._logger.error(log_message)
+ raise TypeError(log_message)
+
+ if database_text_encoding not in [UTF_8, UTF_16LE, UTF_16BE]:
+ log_message = "The database text encoding: {} is not recognized as a valid database text encoding."
+ log_message = log_message.format(database_text_encoding)
+ self._logger.error(log_message)
+ raise ValueError(log_message)
+
+ self._database_text_encoding = database_text_encoding
+
+ def close(self):
+
+ if self.file_externally_controlled:
+
+ log_message = "Ignored request to close externally controlled file."
+ self._logger.warn(log_message)
+ warn(log_message, RuntimeWarning)
+
+ else:
+
+ try:
+
+ self.file_object.close()
+
+ except IOError:
+
+ log_message = "Unable to close the file object."
+ self._logger.exception(log_message)
+ raise
+
+ def read_data(self, offset, number_of_bytes):
+
+ if offset >= self.file_size:
+ log_message = "Requested offset: {} is >= the file size: {}."
+ log_message = log_message.format(offset, self.file_size)
+ self._logger.error(log_message)
+ raise EOFError(log_message)
+
+ if offset + number_of_bytes > self.file_size:
+            log_message = "Requested length of data: {} at offset {} to {} is greater than the file size: {}."
+ log_message = log_message.format(number_of_bytes, offset, number_of_bytes + offset, self.file_size)
+ self._logger.error(log_message)
+ raise EOFError(log_message)
+
+ try:
+
+ self.file_object.seek(offset)
+ return self.file_object.read(number_of_bytes)
+
+ except ValueError:
+ log_message = "An error occurred while reading from the file at offset: {} for {} number of bytes."
+ log_message = log_message.format(offset, number_of_bytes)
+ self._logger.error(log_message)
+ raise
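+# Illustrative usage sketch (hypothetical file path; assumes a well formed SQLite database file):
+#
+#     file_handle = FileHandle(FILE_TYPE.DATABASE, "example.db")
+#     page_size = file_handle.header.page_size
+#     first_page_data = file_handle.read_data(0, page_size)
+#     file_handle.close()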
diff --git a/sqlite_dissect/file/header.py b/sqlite_dissect/file/header.py
new file mode 100644
index 0000000..e68471e
--- /dev/null
+++ b/sqlite_dissect/file/header.py
@@ -0,0 +1,42 @@
+from abc import ABCMeta
+from abc import abstractmethod
+from logging import getLogger
+from re import sub
+from sqlite_dissect.constants import LOGGER_NAME
+
+"""
+
+header.py
+
+This script holds an abstract class for file header objects to extend and inherit from. File headers such as that
+of the wal, journal, and database file headers will extend this class.
+
+Note: The database file header is the same as the file header for the sqlite database. However, a file such as the
+      wal file has its own file header that is not related to the actual database information and, depending on how
+      many commits include the first database page, it may contain many database headers.
+
+This script holds the following object(s):
+SQLiteHeader(object)
+
+"""
+
+
+class SQLiteHeader(object):
+
+ __metaclass__ = ABCMeta
+
+ def __init__(self):
+ self.page_size = None
+ self.md5_hex_digest = None
+
+ def __repr__(self):
+ return self.__str__().encode("hex")
+
+ def __str__(self):
+ return sub("\t", "", sub("\n", " ", self.stringify()))
+
+ @abstractmethod
+ def stringify(self, padding=""):
+ log_message = "The abstract method stringify was called directly and is not implemented."
+ getLogger(LOGGER_NAME).error(log_message)
+ raise NotImplementedError(log_message)
diff --git a/sqlite_dissect/file/journal/README.md b/sqlite_dissect/file/journal/README.md
new file mode 100644
index 0000000..64d8d91
--- /dev/null
+++ b/sqlite_dissect/file/journal/README.md
@@ -0,0 +1,56 @@
+
+# sqlite_dissect.file.journal
+
+This package will control parsing and access to the sqlite journal files.
+
+- header.py
+- journal.py
+
+TODO items for the "journal" package:
+
+- [ ] Finish UML class diagrams.
+
+
+
+### header.py
+This script holds the header objects for the rollback journal file and page record.
+
+This script holds the following object(s):
+- RollbackJournalHeader(SQLiteHeader)
+- RollbackJournalPageRecordHeader(object)
+
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+- [ ] Finish try/except exception handling for struct.error and ord in classes.
+ ##### RollbackJournalHeader Class:
+ - [ ] Investigate invalid rollback journal header strings (warning currently raised).
+    - [ ] Decide how to handle "zeroed out" headers.
+ - [ ] Calling classes should check the auto-vacuum mode in the database header for validity.
+    - [ ] Investigate why most headers observed aren't zero padded as the sqlite documentation states.
+ - [ ] Check if there are use cases of different endianness for journals in sqlite documentation.
+ ##### RollbackJournalPageRecordHeader Class:
+ - [ ] Needs to be implemented.
+
+
+
+### journal.py
+This script holds the class to parse the rollback journal file.
+
+This script holds the following object(s):
+- RollbackJournal(object)
+
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+- [ ] Finish try/except exception handling for struct.error in classes.
+- [ ] Investigate if rollback journals can store data from multiple transactions.
+ ##### RollbackJournal Class:
+ - [ ] Account for the database text encoding in the file handle.
+ - [ ] This class needs to be fully implemented.
+ - [ ] Should this be incorporated with the version/version history somehow?
+    - [ ] The file_size arg may not be needed since it is in the file handle and may be removed.
+ - [ ] Implement the stringify method correctly.
+
\ No newline at end of file
diff --git a/sqlite_dissect/file/journal/__init__.py b/sqlite_dissect/file/journal/__init__.py
new file mode 100644
index 0000000..26a0037
--- /dev/null
+++ b/sqlite_dissect/file/journal/__init__.py
@@ -0,0 +1,10 @@
+
+"""
+
+__init__.py
+
+This init script will initialize any needed logic for this package.
+
+This package will control parsing and access to the sqlite journal files.
+
+"""
diff --git a/sqlite_dissect/file/journal/header.py b/sqlite_dissect/file/journal/header.py
new file mode 100644
index 0000000..e093c8b
--- /dev/null
+++ b/sqlite_dissect/file/journal/header.py
@@ -0,0 +1,98 @@
+from binascii import hexlify
+from logging import getLogger
+from struct import unpack
+from re import sub
+from warnings import warn
+from sqlite_dissect.constants import LOGGER_NAME
+from sqlite_dissect.constants import ROLLBACK_JOURNAL_ALL_CONTENT_UNTIL_END_OF_FILE
+from sqlite_dissect.constants import ROLLBACK_JOURNAL_HEADER_ALL_CONTENT
+from sqlite_dissect.constants import ROLLBACK_JOURNAL_HEADER_HEX_STRING
+from sqlite_dissect.constants import ROLLBACK_JOURNAL_HEADER_LENGTH
+from sqlite_dissect.utilities import get_md5_hash
+from sqlite_dissect.file.header import SQLiteHeader
+
+"""
+
+header.py
+
+This script holds the header objects for the rollback journal file and page record.
+
+This script holds the following object(s):
+RollbackJournalHeader(SQLiteHeader)
+RollbackJournalPageRecordHeader(object)
+
+"""
+
+
+class RollbackJournalHeader(SQLiteHeader):
+
+ def __init__(self, rollback_journal_header_byte_array):
+
+ super(RollbackJournalHeader, self).__init__()
+
+ logger = getLogger(LOGGER_NAME)
+
+ if len(rollback_journal_header_byte_array) != ROLLBACK_JOURNAL_HEADER_LENGTH:
+ log_message = "The rollback journal header byte array of size: {} is not the expected size of: {}."
+ log_message = log_message.format(len(rollback_journal_header_byte_array), ROLLBACK_JOURNAL_HEADER_LENGTH)
+ logger.error(log_message)
+ raise ValueError(log_message)
+
+ self.header_string = rollback_journal_header_byte_array[0:8]
+
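+        # Note: Per the SQLite file format documentation, a well-formed rollback journal begins with the 8-byte
+        #       magic sequence 0xd9 0xd5 0x05 0xf9 0x20 0xa1 0x63 0xd7. ROLLBACK_JOURNAL_HEADER_HEX_STRING is
+        #       assumed to hold the hex string representation of that sequence.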
+ if self.header_string != ROLLBACK_JOURNAL_HEADER_HEX_STRING.decode("hex"):
+
+ """
+
+            Instead of raising an error here, a warning is raised instead. This is because a few files were observed
+            whose header string did not match the expected rollback journal header string. Additional research needs
+            to be done into what use cases this could lead to and whether these are valid use cases or not.
+
+ """
+
+ log_message = "The header string is invalid."
+ logger.warn(log_message)
+ warn(log_message, RuntimeWarning)
+
+ self.page_count = unpack(b">I", rollback_journal_header_byte_array[8:12])[0]
+
+ if rollback_journal_header_byte_array[8:12] == ROLLBACK_JOURNAL_HEADER_ALL_CONTENT.decode("hex"):
+ self.page_count = ROLLBACK_JOURNAL_ALL_CONTENT_UNTIL_END_OF_FILE
+
+ self.random_nonce_for_checksum = unpack(b">I", rollback_journal_header_byte_array[12:16])[0]
+ self.initial_size_of_database_in_pages = unpack(b">I", rollback_journal_header_byte_array[16:20])[0]
+ self.disk_sector_size = unpack(b">I", rollback_journal_header_byte_array[20:24])[0]
+ self.size_of_pages_in_journal = unpack(b">I", rollback_journal_header_byte_array[24:28])[0]
+
+ # The page size will be the same size as the "size of pages in journal" attribute of the header.
+ self.page_size = self.size_of_pages_in_journal
+
+ self.md5_hex_digest = get_md5_hash(rollback_journal_header_byte_array)
+
+ def stringify(self, padding=""):
+ string = padding + "Header String (Hex): {}\n" \
+ + padding + "Page Count: {}\n" \
+ + padding + "Random Nonce for Checksum: {}\n" \
+ + padding + "Initial Size of Database in Pages: {}\n" \
+ + padding + "Disk Sector Size: {}\n" \
+ + padding + "Size of Pages in Journal: {}\n" \
+ + padding + "MD5 Hex Digest: {}"
+ return string.format(hexlify(self.header_string), self.page_count, self.random_nonce_for_checksum,
+ self.initial_size_of_database_in_pages, self.disk_sector_size,
+ self.size_of_pages_in_journal, self.md5_hex_digest)
+
+
+class RollbackJournalPageRecordHeader(object):
+
+ def __init__(self):
+ pass
+
+ def __repr__(self):
+ return self.__str__().encode("hex")
+
+ def __str__(self):
+ return sub("\t", "", sub("\n", " ", self.stringify()))
+
+ def stringify(self, padding=""):
+ pass
diff --git a/sqlite_dissect/file/journal/jounal.py b/sqlite_dissect/file/journal/jounal.py
new file mode 100644
index 0000000..97aa12a
--- /dev/null
+++ b/sqlite_dissect/file/journal/jounal.py
@@ -0,0 +1,32 @@
+from re import sub
+from sqlite_dissect.constants import FILE_TYPE
+from sqlite_dissect.file.file_handle import FileHandle
+
+"""
+
+journal.py
+
+This script holds the class to parse the rollback journal file.
+
+This script holds the following object(s):
+RollbackJournal(object)
+
+"""
+
+
+class RollbackJournal(object):
+
+ def __init__(self, file_identifier, file_size=None):
+
+ self.file_handle = FileHandle(FILE_TYPE.ROLLBACK_JOURNAL, file_identifier, file_size=file_size)
+
+ def __repr__(self):
+ return self.__str__().encode("hex")
+
+ def __str__(self):
+ return sub("\t", "", sub("\n", " ", self.stringify()))
+
+ def stringify(self, padding=""):
+ string = padding + "File Handle:\n{}"
+ string = string.format(self.file_handle.stringify(padding + "\t"))
+ return string
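+
+
+# A minimal usage sketch (assumption: the file identifier may be given as the path to a rollback journal file
+# such as "example.db-journal" on disk):
+#
+#   rollback_journal = RollbackJournal("example.db-journal")
+#   print(rollback_journal.stringify())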
diff --git a/sqlite_dissect/file/schema/README.md b/sqlite_dissect/file/schema/README.md
new file mode 100644
index 0000000..688ef3c
--- /dev/null
+++ b/sqlite_dissect/file/schema/README.md
@@ -0,0 +1,138 @@
+
+# sqlite_dissect.file.schema
+
+This package will control parsing and access to the sqlite master schema.
+
+- column.py
+- master.py
+- table.py
+- utilities.py
+
+TODO items for the "schema" package:
+
+- [ ] Finish UML class diagrams.
+
+
+
+### column.py
+This script holds the objects needed for parsing column related objects to the master schema.
+
+This script holds the following object(s):
+- ColumnDefinition(object)
+
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+- [ ] Create variables/constants for regular expressions used?
+ ##### ColumnDefinition Class:
+ - [ ] Improve the handling of finding and skipping comments.
+ - [ ] Handle column constraints correctly.
+ - [ ] Address the "(one/\*comment\*/two)" comment use case where sqlite allows this but ignores "two".
+ - [ ] Decide if static methods should be moved to a utility class (ie. do they have a reuse need).
+ - [ ] When getting the next segment index FOREIGN KEY constraints will cause issues when implemented.
+ - [ ] Test where the trim replaced all whitespace removed for segment in else for data types.
+ - [ ] Add additional documentation on the "NOT SPECIFIED" being a data type in addition to "INVALID".
+ - [ ] Address additional token use cases possibly.
+ - [ ] _get_next_segment_ending_index: The specific data type checking is not needed.
+ - [ ] _get_next_segment_ending_index: Document that the string should be trimmed.
+ - [ ] _get_next_segment_ending_index: Check on constraint strings such as "DEFAULT 0".
+ - [ ] _get_column_affinity: Check if this has duplicate functionality to other utility methods.
+ ##### ColumnConstraint Class:
+ - [ ] Implement comments.
+ - [ ] Needs to be implemented.
+
+
+
+### master.py
+This script holds the main objects used for parsing the master schema and master schema entries (ie. rows).
+
+This script holds the following object(s):
+- MasterSchema(object)
+- MasterSchemaRow(object)
+- TableRow(MasterSchemaRow)
+- OrdinaryTableRow(TableRow)
+- VirtualTableRow(TableRow)
+- IndexRow(MasterSchemaRow)
+- ViewRow(MasterSchemaRow)
+- TriggerRow(MasterSchemaRow)
+
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+- [ ] Investigate use cases quotes may be used in sql outside index, table, and column names.
+- [ ] Investigate if trigger statements can be used in regards to modifying the master schema.
+- [ ] Does it make more sense to have a WithoutRowIdTableRow instead of the OrdinaryTableRow with a flag?
+- [ ] All table and index rows should have column definitions of some sort.
+- [ ] Create variables/constants for regular expressions used?
+ ##### MasterSchema Class:
+ - [ ] Rename master_schema_entries just entries?
+ - [ ] Check if indexes are created on virtual tables for validation.
+ - [ ] Check to make sure every index has an associated table.
+ - [ ] Check to make sure every view has associated tables.
+ - [ ] Check to make sure trigger has associated tables and/or views.
+ - [ ] Validation on the master schema entries such as if indexes exist without any tables defined.
+ - [ ] When adding entries to the master schema entries, check if they already exist or not.
+ - [ ] Change the master schema entries to be better defined (for example a type keyed dictionary).
+ - [ ] Additional validation for the 0 root page use case in master_schema_b_tree_root_page_numbers.
+ - [ ] Remove the "master schema" in front of class attributes?
+ ##### MasterSchemaRow Class:
+ - [ ] Validate use cases of 0 or None for root page in rows (see root_page property).
+ - [ ] Implement comments in virtual tables, index, trigger and view rows once implemented.
+    - [ ] Address the "(one/\*comment\*/two)" comment use case where sqlite allows this but ignores "two".
+ - [ ] Investigate removal of the sql_has_comments flag.
+ - [ ] The row id is incorporated in the identifier and may be able to change upon alter statements.
+ - [ ] The root page nomenclature can be confusing since there is a master schema root and b-tree root.
+    - [ ] master_schema_b_tree_root_page_numbers: Test with an empty schema.
+ ##### TableRow Class:
+ - [ ] If a virtual table is found, the database version must be >= 3.8.2.
+ ##### OrdinaryTableRow Class:
+ - [ ] Provide better "sqlite_" internal schema object support (may not be needed).
+ - [ ] Implement parsing of the "AS" use case in the create table statement.
+    - [ ] The sql parsing is a bit complicated. This should be able to be done more easily.
+ - [ ] During sql parsing use the size of the constraints array to check against instead of a boolean.
+ ##### VirtualTableRow Class:
+ - [ ] Provide better support for modules and a ModuleArgument class. Currently a warning is given.
+ - [ ] Virtual tables are assumed to always have a root page of 0. Investigate and enforce this.
+ ##### IndexRow Class:
+ - [ ] Handle the use case of indexes on table rows that have "without rowid" specified on them.
+ - [ ] Implement "sqlite_autoindex_TABLE_N" index internal schema objects. Currently a warning is given.
+ - [ ] Implement parsing of index columns.
+ - [ ] Implement partial indexes.
+ ##### ViewRow Class:
+ - [ ] Implement.
+ - [ ] Check tables exist for view information and validation.
+ ##### TriggerRow Class:
+ - [ ] Implement.
+ - [ ] Check tables and views exist for trigger information and validation.
+
+
+
+### table.py
+This script holds the objects needed for parsing table related objects to the master schema.
+
+This script holds the following object(s):
+- TableConstraint(object)
+
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+ ##### TableConstraint Class:
+ - [ ] Needs to be implemented.
+
+
+
+### utilities.py
+This script holds utility functions for dealing with schema-specific objects, such as parsing comments from sql,
+rather than more general utility methods.
+
+This script holds the following function(s):
+- get_index_of_closing_parenthesis(string, opening_parenthesis_offset=0)
+- parse_comment_from_sql_segment(sql_segment)
+
+
+TODO:
+- [ ] Documentation improvements.
+
\ No newline at end of file
diff --git a/sqlite_dissect/file/schema/__init__.py b/sqlite_dissect/file/schema/__init__.py
new file mode 100644
index 0000000..4fef9ce
--- /dev/null
+++ b/sqlite_dissect/file/schema/__init__.py
@@ -0,0 +1,10 @@
+
+"""
+
+__init__.py
+
+This init script will initialize any needed logic for this package.
+
+This package will control parsing and access to the sqlite master schema.
+
+"""
diff --git a/sqlite_dissect/file/schema/column.py b/sqlite_dissect/file/schema/column.py
new file mode 100644
index 0000000..d38b3e0
--- /dev/null
+++ b/sqlite_dissect/file/schema/column.py
@@ -0,0 +1,604 @@
+from logging import getLogger
+from re import match
+from re import sub
+from sqlite_dissect.constants import COLUMN_CONSTRAINT_PREFACES
+from sqlite_dissect.constants import DATA_TYPE
+from sqlite_dissect.constants import LOGGER_NAME
+from sqlite_dissect.constants import TYPE_AFFINITY
+from sqlite_dissect.exception import MasterSchemaRowParsingError
+from sqlite_dissect.file.schema.utilities import get_index_of_closing_parenthesis
+
+"""
+
+column.py
+
+This script holds the objects needed for parsing column related objects to the master schema.
+
+This script holds the following object(s):
+ColumnDefinition(object)
+
+"""
+
+
+class ColumnDefinition(object):
+
+ def __init__(self, index, column_text, comments=None):
+
+ logger = getLogger(LOGGER_NAME)
+
+ self.index = index
+ self.column_text = sub("\s\s+", " ", column_text.strip())
+
+ """
+
+ When the column text is sent in, the column text starts from the first column name until the "," in the
+ following form:
+ "COLUMN_NAME ... ,"
+
+ Any comments that may appear before the COLUMN_NAME or after the "," should already be parsed and sent in
+ through the constructor as the comments field. However, there may still be comments in the column text
+        itself, where the "..." appears above.  These are parsed out here and removed from the column text.
+        After all of the comments have been removed from the column text, every run of multiple whitespace characters,
+        including newlines, is replaced by a single whitespace character and then the column text is stripped.
+        Comments are only stripped since the "-- ... \n" comment form cannot have more than the terminating "\n"
+        character in it, while the "/* ... */" form may have "\n" characters in it for a reason, such as the length
+        of the comment.
+
+        The comments are parsed out here character by character, skipping ahead as needed, rather than by pattern
+        matching, since technically a comment may have another comment form within it.
+
+ Any comment pulled out from any place in the column definition is considered on the column definition level,
+ and not tied to specific constraints, data types, etc.
+
+ Note: The self.column_text field will be set to the column text sent into this class with only whitespace
+ modifications to strip the text and replace multiple whitespace characters with a single space, " ".
+
+ """
+
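+        # Illustrative example (hypothetical input): for a column text of
+        #   "name TEXT /* display name */ NOT NULL"
+        # the "/* display name */" portion is collected into the parsed comments and the remaining
+        # text collapses to "name TEXT NOT NULL".
+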
+ # Setup the field to parse the column text and comments
+ parsed_column_text = ""
+ parsed_comments = []
+ parsed_comments_total_length = 0
+
+        # Define an index for parsing the column text
+ character_index = 0
+
+ # Iterate through all of the characters in the column text
+ while character_index < len(column_text):
+
+ # Get the current indexed character
+ character = column_text[character_index]
+
+ # Check for the "/* ... */" comment form
+            if character == "/":
+ last_comment_character_index = column_text.index("*/", character_index) + 1
+ parsed_comment = column_text[character_index:last_comment_character_index + 1]
+ parsed_comments_total_length += len(parsed_comment)
+ parsed_comments.append(parsed_comment)
+ character_index = last_comment_character_index
+
+ # Check for the "-- ... \n" comment form
+            elif character == "-" and column_text[character_index + 1] == "-":
+
+ """
+
+ Above, we check to make sure we are encountering a comment by checking the next character as well
+ for the "-- ... \n" comment.
+
+ Note: A single "-" is allowed since it can be before a negative default value for example in the
+ create statement.
+
+ """
+
+ last_comment_character_index = column_text.index("\n", character_index)
+ parsed_comment = column_text[character_index:last_comment_character_index + 1]
+ parsed_comments_total_length += len(parsed_comment)
+ parsed_comments.append(parsed_comment)
+ character_index = last_comment_character_index
+
+ else:
+ parsed_column_text += character
+
+ # Increment the character index
+ character_index += 1
+
+ # Make sure the parsed lengths add up correctly to the original length
+ if parsed_comments_total_length + len(parsed_column_text) != len(column_text):
+ log_message = "Column index: {} with column text: {} of length: {} was not parsed correctly. The length " \
+ "of the parsed comments total length was: {} with the following comments: {} and the " \
+ "length of the parsed column text was: {} as: {}."
+ log_message = log_message.format(self.index, column_text, len(column_text), parsed_comments_total_length,
+ parsed_comments, len(parsed_column_text), parsed_column_text)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ # Update the parsed column text replacing any whitespace with a single " " character and stripping it
+ parsed_column_text = sub("\s\s+", " ", parsed_column_text.strip())
+
+ # Check the comments sent in for validity
+ if comments:
+ for comment in comments:
+ if not comment.startswith("--") and not comment.startswith("/*"):
+ log_message = "Comment specified does not start with the schema comment prefix: {}.".format(comment)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ # Below we strip the comments but if a "\n" happens to be in a "/* ... */", we leave it alone.
+ self.comments = [comment.strip() for comment in comments] if comments else []
+ self.comments += [comment.strip() for comment in parsed_comments]
+
+ # Retrieve the column name and remaining column text after the column name is removed
+ self.column_name, \
+ remaining_column_text = ColumnDefinition._get_column_name_and_remaining_sql(index, parsed_column_text)
+
+ # Setup default values for the column definition fields
+ self.derived_data_type_name = None
+ self.data_type = DATA_TYPE.NOT_SPECIFIED
+ self.column_constraints = []
+
+ """
+
+ If there is a remaining column text then we parse through it since there is either at least one data type
+ or column constraint defined.
+
+ There are 0..1 data types and 0...* column constraints if there is remaining column text.
+
+ Note: The following statements are valid column definitions:
+ 1.) field previous_field TEXT
+ 2.) field TEXT INTEGER BLOB
+
+ This was noticed in a database that had a create table statement that had multiple field names but did
+ not throw an error in SQLite. This is because SQLite pulls the first field as the column name and then
+ takes the string until it hits a column constraint as the whole data type field. In the above examples,
+ the derived data types would be:
+ 1.) previous_field TEXT
+ 2.) TEXT INTEGER BLOB
+
+        SQLite checks for the data type by seeing if certain patterns are in this string in a certain order (see
+ the _get_column_affinity function for more information). Therefore, the affinities of the two examples
+ above would be:
+ 1.) TEXT
+ 2.) INTEGER
+
+        Due to this, we parse out the data type the same way as SQLite. We move through the text until we find
+ a column constraint or the end of the column definition and then take that as the data type segment to
+ check on. Keep in mind there are more use cases that are tokenized during this process in SQLite. For
+ instance, if the column definition "field previous_field TEXT as BLOB" was specified, it would fail in
+ SQLite since "as" is a word that is identified as a particular use case in addition to column
+        constraints. This is not worried about here since this implementation addresses all of the use cases allowed
+        by SQLite and accepts a superset of them for better compatibility, rather than trying to handle all of the
+        same token use cases as the SQLite library.
+
+ """
+
+ while len(remaining_column_text):
+
+ # Get the next column definition segment
+ segment_index = ColumnDefinition._get_next_segment_ending_index(self.index, self.column_name,
+ remaining_column_text)
+
+ # Make sure an error did not occur retrieving the segment index
+ if segment_index <= 0 or segment_index > len(remaining_column_text):
+ log_message = "Column name: {} with index: {} has a segment out of bounds with index: {} when the " \
+ "remaining column text is: {} with length: {} from full column text: {}."
+ log_message = log_message.format(self.column_name, self.index, segment_index, remaining_column_text,
+ len(remaining_column_text), self.column_text)
+ logger.error(log_message)
+ raise IndexError(log_message)
+
+ # Get the next segment
+ segment = remaining_column_text[:segment_index + 1]
+
+ if (len(segment) == len(remaining_column_text) or match("\w", remaining_column_text[segment_index + 1])) \
+ and ColumnDefinition._is_column_constraint_preface(segment):
+
+ """
+
+ Here we set the column constraints to the rest of the remaining text.
+
+ """
+
+ # Set the column constraints
+ self.column_constraints = [remaining_column_text]
+
+ # Set the remaining column text (This will be an empty string but needed to exit from while.)
+
+ """
+
+                The next step here is to parse the column constraints:
+ remaining_column_text = remaining_column_text[len(self.column_constraints):]
+ ...
+
+ """
+
+ break
+
+ else:
+
+ """
+
+ The data type may have "(" and ")" characters in it to specify size (size of which is ignored by SQLite
+ as a side note) and needs to be correctly accounted for. Here we get rid of any whitespace around the
+ parenthesis and then any leading or trailing whitespace.
+
+ """
+
+ segment = sub("\s*\(\s*", "(", segment)
+ segment = sub("\s*\)\s*", ")", segment)
+ segment = segment.strip()
+
+ # Convert it to all uppercase for the derived data type name
+ self.derived_data_type_name = segment.upper()
+
+ # Obtain the data type (if possible, otherwise it will be INVALID) from the derived data type name
+ self.data_type = self._get_data_type(self.derived_data_type_name)
+
+ # Set the remaining column text accounting for the white space character after
+ remaining_column_text = remaining_column_text[segment_index + 1:]
+
+ self.type_affinity = self._get_column_affinity(self.data_type, self.derived_data_type_name)
+
+ @staticmethod
+ def _get_column_affinity(data_type, derived_data_type):
+
+ column_type = data_type
+
+ """
+
+ Below we check if the data type was invalid. If the data type is invalid, it means the original
+ type statement was not a predefined type. However, SQLite does not check against predefined types.
+        The SQLite code does string matching on what was defined to determine affinity. For instance, when
+        defining a table: "CREATE TABLE example (a CHAR, b CHARACTER)", both a and b will be determined to have
+        TEXT affinity according to the rules below. Due to this, we set the type to check on back to the
+        derived data type since that has the original text in it with only some negligible spacing modifications.
+        Since the patterns are matched case sensitively, we call upper() on the derived data type.
+
+ """
+
+ if column_type == DATA_TYPE.INVALID:
+ column_type = derived_data_type.upper()
+
+ """
+
+ In order to determine the column affinity from the declared column data type we have to follow the
+ set of rules from the SQLite Data Type Documentation below in order:
+
+ 1.) If the declared type contains the string "INT" then it is assigned INTEGER affinity.
+ 2.) If the declared type of the column contains any of the strings "CHAR", "CLOB", or "TEXT"
+ then that column has TEXT affinity. Notice that the type VARCHAR contains the string "CHAR" and is
+ thus assigned TEXT affinity.
+ 3.) If the declared type for a column contains the string "BLOB" or if no type is specified then the column
+ has affinity BLOB.
+ 4.) If the declared type for a column contains any of the strings "REAL", "FLOA", or "DOUB" then the column
+ has REAL affinity.
+ 5.) Otherwise, the affinity is NUMERIC.
+
+ """
+
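+        # Illustrative mappings under the rules above (example declared types):
+        #   "VARCHAR(70)"      -> TEXT    (rule 2, contains "CHAR")
+        #   "FLOATING POINT"   -> INTEGER (rule 1, "POINT" contains "INT")
+        #   "BLOB"             -> BLOB    (rule 3)
+        #   "DOUBLE PRECISION" -> REAL    (rule 4)
+        #   "DECIMAL(10,5)"    -> NUMERIC (rule 5)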
+ if "INT" in column_type:
+ return TYPE_AFFINITY.INTEGER
+ elif "CHAR" in column_type or "CLOB" in column_type or "TEXT" in column_type:
+ return TYPE_AFFINITY.TEXT
+ elif "BLOB" in column_type or column_type == DATA_TYPE.NOT_SPECIFIED:
+ return TYPE_AFFINITY.BLOB
+ elif "REAL" in column_type or "FLOA" in column_type or "DOUB" in column_type:
+ return TYPE_AFFINITY.REAL
+ else:
+ return TYPE_AFFINITY.NUMERIC
+
+ @staticmethod
+ def _get_column_name_and_remaining_sql(index, column_text):
+
+ # Initialize the logger
+ logger = getLogger(LOGGER_NAME)
+
+ """
+
+        Since the column name can be in brackets, backticks, single quotes, or double quotes, we first check whether
+        the column name is surrounded by any of these. If it is, our job is fairly simple; otherwise, we parse it
+        normally.
+
+ Note: SQLite allows backticks for compatibility with MySQL and allows brackets for compatibility with
+ Microsoft databases.
+
+ """
+
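+        # Illustrative examples (hypothetical column text): "[user id] INTEGER" yields the column name "user id"
+        # with remaining text "INTEGER", while "`name` TEXT NOT NULL" yields "name" and "TEXT NOT NULL".
+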
+ if column_text[0] == "`":
+
+ # The column name is surrounded by backticks
+ match_object = match("^`(.*?)`", column_text)
+
+ if not match_object:
+ log_message = "No backtick match found for sql column definition: {} with text: {}."
+ log_message = log_message.format(index, column_text)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ # Set the column name and strip the backticks
+ column_name = column_text[match_object.start():match_object.end()].strip("`")
+
+ # Set the remaining column text
+ remaining_column_text = column_text[match_object.end():]
+
+ # Return the column name and remaining column text stripped of whitespace
+ return column_name, remaining_column_text.strip()
+
+ elif column_text[0] == "[":
+
+ # The column name is surrounded by brackets
+ match_object = match("^\[(.*?)\]", column_text)
+
+ if not match_object:
+ log_message = "No bracket match found for sql column definition: {} with text: {}."
+ log_message = log_message.format(index, column_text)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ # Set the column name and strip the brackets
+ column_name = column_text[match_object.start():match_object.end()].strip("[]")
+
+ # Set the remaining column text
+ remaining_column_text = column_text[match_object.end():]
+
+ # Return the column name and remaining column text stripped of whitespace
+ return column_name, remaining_column_text.strip()
+
+ elif column_text[0] == "\'":
+
+ # The column name is surrounded by single quotes
+ match_object = match("^\'(.*?)\'", column_text)
+
+ if not match_object:
+ log_message = "No single quote match found for sql column definition: {} with text: {}."
+ log_message = log_message.format(index, column_text)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ # Set the column name and strip the single quotes
+ column_name = column_text[match_object.start():match_object.end()].strip("\'")
+
+ # Set the remaining column text
+ remaining_column_text = column_text[match_object.end():]
+
+ # Return the column name and remaining column text stripped of whitespace
+ return column_name, remaining_column_text.strip()
+
+ elif column_text[0] == "\"":
+
+ # The column name is surrounded by double quotes
+ match_object = match("^\"(.*?)\"", column_text)
+
+ if not match_object:
+ log_message = "No double quote match found for sql column definition: {} with text: {}."
+ log_message = log_message.format(index, column_text)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ # Set the column name and strip the double quotes
+ column_name = column_text[match_object.start():match_object.end()].strip("\"")
+
+ # Set the remaining column text
+ remaining_column_text = column_text[match_object.end():]
+
+ # Return the column name and remaining column text stripped of whitespace
+ return column_name, remaining_column_text.strip()
+
+ else:
+
+ """
+
+ We know now that either the space character is used to separate the column name or the column name
+ makes up the entirety of the column text if there is no space.
+
+ """
+
+ if column_text.find(" ") != -1:
+
+ # There is whitespace delimiting the column name
+ column_name = column_text[:column_text.index(" ")]
+
+ # Parse the remaining column text
+ remaining_column_text = column_text[column_text.index(" ") + 1:]
+
+ # Return the column name and remaining column text stripped of whitespace
+ return column_name, remaining_column_text.strip()
+
+ else:
+
+ # The whole column text is just the column name
+ column_name = column_text
+
+ # The remaining column text should be an empty string but we return it for better interoperability
+ remaining_column_text = column_text[len(column_text):]
+
+ if remaining_column_text:
+ log_message = "Column text remaining when none expected for column name: {} with text: {} " \
+ "and remaining: {} for index: {}."
+ log_message = log_message.format(column_name, column_text, remaining_column_text, index)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ # Return the column name and remaining column text stripped of whitespace
+ return column_name, remaining_column_text.strip()
+
+ @staticmethod
+ def _get_data_type(derived_data_type):
+
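+        # Illustrative trace (hypothetical input): "Unsigned Big Int(8)" is upper-cased to "UNSIGNED BIG INT(8)",
+        # the trailing "(8)" is removed, spaces become underscores ("UNSIGNED_BIG_INT"), and the result is then
+        # compared against the DATA_TYPE constants.
+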
+ # Convert the derived data type to uppercase
+ derived_data_type = derived_data_type.upper()
+
+ # Remove any parenthesis along with numerical values
+ derived_data_type = sub("\(.*\)$", "", derived_data_type)
+
+ # Replace spaces with underscores
+ derived_data_type = sub(" ", "_", derived_data_type)
+
+ for data_type in DATA_TYPE:
+
+ # We remove any numerical values from the end since sqlite does not recognize them in the data types
+ if sub("_\d+.*$", "", data_type) == derived_data_type:
+ return data_type
+
+ # If no data type was found we return an invalid data type
+ return DATA_TYPE.INVALID
+
+ @staticmethod
+ def _get_next_segment_ending_index(index, column_name, remaining_column_text):
+
+ # Initialize the logger
+ logger = getLogger(LOGGER_NAME)
+
+ if len(remaining_column_text) == 0:
+ log_message = "Invalid remaining column text of 0 length found for column index: {} with name: {}: {}."
+ log_message = log_message.format(index, column_name, remaining_column_text)
+ logger.error(log_message)
+ raise ValueError(log_message)
+
+ """
+
+ Note: We do not want to trim the string ourselves here since we are parsing text and do not know what the
+ calling logic is doing outside this function.
+
+ """
+
+ # Make sure all space is trimmed from the front of the remaining column text as it should be
+ if remaining_column_text[0].isspace():
+ log_message = "Invalid remaining column text beginning with a space found for column " \
+ "index: {} with name: {}: {}."
+ log_message = log_message.format(index, column_name, remaining_column_text)
+ logger.error(log_message)
+ raise ValueError(log_message)
+
+ # Iterate through the remaining column text to find the next segment
+ next_segment_ending_index = 0
+ while next_segment_ending_index < len(remaining_column_text):
+
+ """
+
+            Note: Since column constraints are not properly implemented at the moment, the following will work for
+                  column data types. In the future, when this is expanded for column constraints, the constraints
+                  will all work the same way according to the documentation, except for the FOREIGN KEY constraint,
+                  which has content following the closing parenthesis.
+
+ """
+
+ if remaining_column_text[next_segment_ending_index] == "(":
+
+ # If we find a "(", we return the index of the closing ")" accounting for the following whitespace
+ return get_index_of_closing_parenthesis(remaining_column_text, next_segment_ending_index) + 1
+
+ elif remaining_column_text[next_segment_ending_index].isspace():
+
+ if remaining_column_text[next_segment_ending_index + 1] == "(":
+
+ # If we find a "(", return the index of the closing one accounting for the following whitespace
+ return get_index_of_closing_parenthesis(remaining_column_text, next_segment_ending_index + 1) + 1
+
+ """
+
+ We do not have to worry about checking the length of the remaining column text since that is already
+ done above. However, this function does not properly check for constraint segments such as "DEFAULT 0"
+ where there still may be content following the initial constraint. However, constraints are not fully
+ implemented at this time, and when this is returned it will be detected within this class, and the rest
+ of the string will be used. A TODO has been put at the top of this script in regards to this.
+
+                Note: We know that if there is a space, then there must be characters following that space since
+ all whitespace was replaced with single whitespaces and the string was trimmed.
+
+ """
+
+ if ColumnDefinition._is_column_constraint_preface(
+ remaining_column_text[next_segment_ending_index + 1:]):
+
+ return next_segment_ending_index
+
+ else:
+ next_segment_ending_index += 1
+
+ else:
+
+ # Check if this segment index equals the end of the remaining column text and if so, return it
+
+ if next_segment_ending_index + 1 == len(remaining_column_text):
+ return next_segment_ending_index
+
+ next_segment_ending_index += 1
+
+ """
+
+ The next segment was unable to be found
+
+ """
+
+ log_message = "Was unable to find the next segment for column index: {} with name: {} on {}."
+ log_message = log_message.format(index, column_name, remaining_column_text)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ @staticmethod
+ def _is_column_constraint_preface(segment):
+
+ for column_constraint_preface in COLUMN_CONSTRAINT_PREFACES:
+
+ """
+
+ Note: When the check is done on the segment, we check the next character is not one of the allowed
+ characters in a column name, data type, etc. to make sure the constraint preface is not the
+ beginning of a longer name where it is not actually a constraint preface (example: primaryEmail).
+ The "\w" regular expression when no LOCALE and UNICODE flags are set will be equivalent to the set:
+ [a-zA-Z0-9_].
+
+ """
+
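+            # Illustrative behavior (assuming "PRIMARY" is among the COLUMN_CONSTRAINT_PREFACES): a segment
+            # starting with "PRIMARY KEY" returns True, while a segment of "primaryEmail TEXT" returns False
+            # since the character following "PRIMARY" ("E") matches "\w".
+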
+ # Check to see if the segment starts with the column constraint preface
+ if segment.upper().startswith(column_constraint_preface):
+ if not (len(column_constraint_preface) + 1 <= len(segment)
+ and match("\w", segment[len(column_constraint_preface)])):
+ return True
+
+ return False
+
+ def __repr__(self):
+ return self.__str__().encode("hex")
+
+ def __str__(self):
+ return sub("\t", "", sub("\n", " ", self.stringify()))
+
+ def stringify(self, padding="", print_column_constraints=True):
+ string = padding + "Column Text: {}\n" \
+ + padding + "Index: {}\n" \
+ + padding + "Column Name: {}\n" \
+ + padding + "Derived Data Type Name: {}\n" \
+ + padding + "Data Type: {}\n" \
+ + padding + "Type Affinity: {}\n" \
+ + padding + "Number of Comments: {}"
+ string = string.format(self.column_text,
+ self.index,
+ self.column_name,
+ self.derived_data_type_name,
+ self.data_type,
+ self.type_affinity,
+ len(self.comments))
+ for comment in self.comments:
+ string += "\n" + padding + "Comment: {}".format(comment)
+ if print_column_constraints:
+ string += "\n" + padding + "Column Constraints: {}".format(self.column_constraints)
+ return string
+
+
+class ColumnConstraint(object):
+
+ def __init__(self, index, constraint):
+
+ self.index = index
+ self.constraint = constraint
+
+ def __repr__(self):
+ return self.__str__().encode("hex")
+
+ def __str__(self):
+ return sub("\t", "", sub("\n", " ", self.stringify()))
+
+ def stringify(self, padding=""):
+ string = padding + "Index: {}\n" \
+ + padding + "Constraint: {}"
+ return string.format(self.index, self.constraint)
diff --git a/sqlite_dissect/file/schema/master.py b/sqlite_dissect/file/schema/master.py
new file mode 100644
index 0000000..f830478
--- /dev/null
+++ b/sqlite_dissect/file/schema/master.py
@@ -0,0 +1,2327 @@
+from abc import ABCMeta
+from abc import abstractmethod
+from binascii import hexlify
+from collections import namedtuple
+from logging import getLogger
+from re import match
+from re import sub
+from warnings import warn
+from sqlite_dissect.constants import CREATE_TABLE_CLAUSE
+from sqlite_dissect.constants import CREATE_VIRTUAL_TABLE_CLAUSE
+from sqlite_dissect.constants import CREATE_INDEX_CLAUSE
+from sqlite_dissect.constants import CREATE_UNIQUE_INDEX_CLAUSE
+from sqlite_dissect.constants import INDEX_ON_COMMAND
+from sqlite_dissect.constants import INDEX_WHERE_CLAUSE
+from sqlite_dissect.constants import INTERNAL_SCHEMA_OBJECT_INDEX_PREFIX
+from sqlite_dissect.constants import INTERNAL_SCHEMA_OBJECT_PREFIX
+from sqlite_dissect.constants import LOGGER_NAME
+from sqlite_dissect.constants import MASTER_PAGE_HEX_ID
+from sqlite_dissect.constants import MASTER_SCHEMA_COLUMN
+from sqlite_dissect.constants import MASTER_SCHEMA_NUMBER_OF_COLUMNS
+from sqlite_dissect.constants import MASTER_SCHEMA_ROW_TYPE
+from sqlite_dissect.constants import ORDINARY_TABLE_AS_CLAUSE
+from sqlite_dissect.constants import SQLITE_MASTER_SCHEMA_ROOT_PAGE
+from sqlite_dissect.constants import TABLE_CONSTRAINT_PREFACES
+from sqlite_dissect.constants import VIRTUAL_TABLE_USING_CLAUSE
+from sqlite_dissect.exception import MasterSchemaParsingError
+from sqlite_dissect.exception import MasterSchemaRowParsingError
+from sqlite_dissect.file.database.header import InteriorPageHeader
+from sqlite_dissect.file.database.page import TableInteriorPage
+from sqlite_dissect.file.database.page import TableLeafCell
+from sqlite_dissect.file.database.page import TableLeafPage
+from sqlite_dissect.file.database.utilities import get_pages_from_b_tree_page
+from sqlite_dissect.file.schema.column import ColumnDefinition
+from sqlite_dissect.file.schema.utilities import parse_comment_from_sql_segment
+from sqlite_dissect.file.schema.table import TableConstraint
+from sqlite_dissect.file.schema.utilities import get_index_of_closing_parenthesis
+from sqlite_dissect.utilities import get_md5_hash
+
+"""
+
+master.py
+
+This script holds the main objects used for parsing the master schema and master schema entries (ie. rows).
+
+This script holds the following object(s):
+MasterSchema(object)
+MasterSchemaRow(object)
+TableRow(MasterSchemaRow)
+OrdinaryTableRow(TableRow)
+VirtualTableRow(TableRow)
+IndexRow(MasterSchemaRow)
+ViewRow(MasterSchemaRow)
+TriggerRow(MasterSchemaRow)
+
+"""
+
+
+class MasterSchema(object):
+
+ MasterSchemaEntryData = namedtuple("MasterSchemaEntryData",
+ "record_columns row_type sql b_tree_table_leaf_page_number cell")
+
+ def __init__(self, version_interface, root_page):
+
+ logger = getLogger(LOGGER_NAME)
+
+ if root_page.number != SQLITE_MASTER_SCHEMA_ROOT_PAGE:
+ log_message = "The root page number: {} is not the expected sqlite master schema root page number: {}."
+ log_message = log_message.format(root_page.number, SQLITE_MASTER_SCHEMA_ROOT_PAGE)
+ logger.error(log_message)
+ raise ValueError(log_message)
+
+ if root_page.hex_type != MASTER_PAGE_HEX_ID:
+ log_message = "The root page hex type: {} is not the expected master page hex: {}."
+ log_message = log_message.format(hexlify(root_page.hex_type), hexlify(MASTER_PAGE_HEX_ID))
+ logger.error(log_message)
+ raise ValueError(log_message)
+
+ self._version_interface = version_interface
+
+ self.version_number = self._version_interface.version_number
+ self.page_version_number = self._version_interface.get_page_version(root_page.number)
+ self.root_page = root_page
+ self.master_schema_entries = []
+
+ """
+
+ The master schema entry data attribute below is a dictionary with up to four keys in it representing each of
+ the four types of master schema entries: index, table, trigger, and view pointing to an array of row data
+ where each entry is a MasterSchemaEntryData object describing an entry of that type.
+
+ """
+
+ database_text_encoding = self._version_interface.database_text_encoding
+
+ if isinstance(self.root_page, TableInteriorPage):
+
+ master_schema_entry_data = MasterSchema._parse_table_interior(self.root_page, database_text_encoding)
+
+ elif isinstance(self.root_page, TableLeafPage):
+
+ master_schema_entry_data = MasterSchema._parse_table_leaf(self.root_page, database_text_encoding)
+
+ else:
+
+ """
+
+ Note: This case should never occur since we checked above that the root page needs to start with the
+ master page hex id and a ValueError would have already been thrown if this was not true. This
+ check is still done just in case.
+
+ """
+
+ log_message = "The root page is not a table page but is a: {}.".format(type(self.root_page))
+ logger.error(log_message)
+ raise ValueError(log_message)
+
+ if not master_schema_entry_data:
+
+ """
+
+ There is the use case that no master schema entry data was found (ie. empty/no defined schema).
+
+ Double check this use case by making sure the root page b-tree header has:
+ 1.) The number of cells on the page set to zero.
+ 2.) The cell content offset is equal to the page size (meaning there is no page content).
+ 3.) The b-tree table page is not an interior page (referring that it would have subpages with information).
+
+ """
+
+ b_tree_root_page_header = self.root_page.header
+
+ if b_tree_root_page_header.number_of_cells_on_page != 0:
+ log_message = "The b-tree root page header has a cell count of: {} where the master schema entry " \
+ "data was not set in version: {}."
+ log_message = log_message.format(b_tree_root_page_header.number_of_cells_on_page, self.version_number)
+ logger.error(log_message)
+ raise MasterSchemaParsingError(log_message)
+
+ if b_tree_root_page_header.cell_content_offset != self._version_interface.page_size:
+ log_message = "The b-tree root page cell content offset is: {} when it should match the page " \
+ "size: {} where the master schema entry data was not set in version: {}."
+ log_message = log_message.format(b_tree_root_page_header.cell_content_offset,
+ self._version_interface.page_size, self.version_number)
+ logger.error(log_message)
+ raise MasterSchemaParsingError(log_message)
+
+ if isinstance(b_tree_root_page_header, InteriorPageHeader):
+ log_message = "The b-tree root page is an interior table page where the master schema entry data " \
+ "was not set in version: {}."
+ log_message = log_message.format(self.version_number)
+ logger.error(log_message)
+ raise MasterSchemaParsingError(log_message)
+
+ """
+
+ Next we create dictionaries for both tables and views.
+
+ Table names are unique across both tables and views, however:
+ 1.) indexes can only be created on tables (not virtual tables or views)
+ 2.) views are built off of the tables
+ 3.) triggers can be built off of either tables and/or views but it is helpful to know which
+
+ Therefore, we work with two dictionaries instead of one general table dictionary in the form of:
+ dictionary[TABLE_NAME] = [MasterSchemaRow] where MasterSchemaRow will be either a TableRow or IndexRow
+ depending on the respective dictionary.
+
+ Note: Virtual tables will be checked in a different manner to ensure no indexes have been created from it
+ for validation purposes.
+
+ """
+
+ master_schema_tables = {}
+ master_schema_views = {}
+
+ if master_schema_entry_data:
+
+ # Make sure the database text encoding is set.
+ if not self._version_interface.database_text_encoding:
+                log_message = "Master schema entries were found, however no database text encoding has been set yet " \
+ "as expected in version: {}."
+ log_message = log_message.format(self.version_number)
+ logger.error(log_message)
+ raise MasterSchemaParsingError(log_message)
+
+ """
+
+ Due to the way each type is built off of each other, we create the entries in the following order:
+ 1.) Tables
+ 2.) Indexes
+ 3.) Views
+ 4.) Triggers
+
+            Since information from tables is helpful in creating indexes (especially in generating signatures), tables
+            are created first and then sent into the IndexRow class. The specific table that belongs to the index being
+            created is then pulled out and checked in the IndexRow constructor. This table is not pulled out ahead of
+ time and sent in by itself since we don't have a good way to get to the index table name until the IndexRow
+ is created itself.
+
+ Next, all tables are sent into the ViewRow since a view can be made of multiple tables.
+
+ Last, all tables and views are sent into the TriggerRow since a trigger can be across multiple tables
+            and views. Triggers can be defined on views. Although INSERT, UPDATE, and DELETE operations will not work
+            directly on views, executing them against a view will cause its associated triggers to fire.
+
+ """
+
+ # Account for table master schema rows
+ if MASTER_SCHEMA_ROW_TYPE.TABLE in master_schema_entry_data:
+ for row_type_data in master_schema_entry_data[MASTER_SCHEMA_ROW_TYPE.TABLE]:
+
+ """
+
+ For tables, we have the choice of two types of tables. The ordinary table and a virtual table.
+ There are two classes for these: OrdinaryTableRow and VirtualTableRow. Both of these classes
+ extend the TableRow class but need to be specified differently since they both are parsed
+ differently. We figure out what type of table we have by checking the beginning of the command.
+                    If the command starts with CREATE_TABLE_CLAUSE then the table is a create [ordinary] table
+                    command and if it starts with CREATE_VIRTUAL_TABLE_CLAUSE then the table is a virtual table.
+
+ Note: Due to the way the rules work (documented in the table row classes themselves), the
+ create command at the beginning is always a set static command. All capitals with single
+ spaces until the table name. Therefore, we can be assured that these checks will work.
+
+ """
+
+ if row_type_data.sql.startswith(CREATE_TABLE_CLAUSE):
+ table_row = OrdinaryTableRow(self._version_interface,
+ row_type_data.b_tree_table_leaf_page_number,
+ row_type_data.cell, row_type_data.record_columns)
+ elif row_type_data.sql.startswith(CREATE_VIRTUAL_TABLE_CLAUSE):
+ table_row = VirtualTableRow(self._version_interface,
+ row_type_data.b_tree_table_leaf_page_number,
+ row_type_data.cell, row_type_data.record_columns)
+ else:
+                        log_message = "Master schema table row has invalid sql: {}."
+                        log_message = log_message.format(row_type_data.sql)
+ logger.error(log_message)
+ raise MasterSchemaParsingError(log_message)
+
+ if not table_row:
+ log_message = "Master schema table row was not set."
+ logger.error(log_message)
+ raise MasterSchemaParsingError(log_message)
+
+ self.master_schema_entries.append(table_row)
+ if table_row.table_name in master_schema_tables:
+ log_message = "Master schema table row with table name: {} was already specified in table rows."
+ log_message = log_message.format(table_row.table_name)
+ logger.error(log_message)
+ raise MasterSchemaParsingError(log_message)
+ master_schema_tables[table_row.table_name] = table_row
+
+ # Account for index master schema rows
+ if MASTER_SCHEMA_ROW_TYPE.INDEX in master_schema_entry_data:
+ for row_type_data in master_schema_entry_data[MASTER_SCHEMA_ROW_TYPE.INDEX]:
+ index_row = IndexRow(self._version_interface, row_type_data.b_tree_table_leaf_page_number,
+ row_type_data.cell, row_type_data.record_columns, master_schema_tables)
+ self.master_schema_entries.append(index_row)
+
+ # Account for view master schema rows
+ if MASTER_SCHEMA_ROW_TYPE.VIEW in master_schema_entry_data:
+ for row_type_data in master_schema_entry_data[MASTER_SCHEMA_ROW_TYPE.VIEW]:
+ view_row = ViewRow(self._version_interface,
+ row_type_data.b_tree_table_leaf_page_number,
+ row_type_data.cell,
+ row_type_data.record_columns,
+ master_schema_tables)
+ self.master_schema_entries.append(view_row)
+ if view_row.table_name in master_schema_tables:
+ log_message = "Master schema view row with table name: {} was already specified in table rows."
+ log_message = log_message.format(view_row.table_name)
+ logger.error(log_message)
+ raise MasterSchemaParsingError(log_message)
+ if view_row.table_name in master_schema_views:
+ log_message = "Master schema view row with table name: {} was already specified in view rows."
+ log_message = log_message.format(view_row.table_name)
+ logger.error(log_message)
+ raise MasterSchemaParsingError(log_message)
+ master_schema_views[view_row.table_name] = view_row
+
+ # Account for trigger master schema rows
+ if MASTER_SCHEMA_ROW_TYPE.TRIGGER in master_schema_entry_data:
+ for row_type_data in master_schema_entry_data[MASTER_SCHEMA_ROW_TYPE.TRIGGER]:
+ trigger_row = TriggerRow(self._version_interface, row_type_data.b_tree_table_leaf_page_number,
+ row_type_data.cell, row_type_data.record_columns, master_schema_tables,
+ master_schema_views)
+ self.master_schema_entries.append(trigger_row)
+
+ self.master_schema_pages = get_pages_from_b_tree_page(self.root_page)
+ self.master_schema_page_numbers = [master_schema_page.number for master_schema_page in self.master_schema_pages]
+
+ def __repr__(self):
+ return self.__str__().encode("hex")
+
+ def __str__(self):
+ return sub("\t", "", sub("\n", " ", self.stringify()))
+
+ def stringify(self, padding="", print_master_schema_root_page=True,
+ print_master_schema_entries=True, print_b_tree_root_pages=True):
+ string = padding + "Version Number: {}\n" \
+ + padding + "Page Version Number: {}\n" \
+ + padding + "Master Schema Page Numbers: {}\n" \
+ + padding + "Master Schema Entries Length: {}\n" \
+ + padding + "Master Schema B-Tree Root Page Numbers: {}"
+ string = string.format(self.version_number,
+ self.page_version_number,
+ self.master_schema_page_numbers,
+ len(self.master_schema_entries),
+ self.master_schema_b_tree_root_page_numbers)
+ if print_master_schema_root_page:
+ string += "\n" + padding + "Master Schema Root Page:\n{}"
+ string = string.format(self.root_page.stringify(padding + "\t"))
+ if print_master_schema_entries:
+ for master_schema_entry in self.master_schema_entries:
+ string += "\n" + padding + "Master Schema Entry:\n{}"
+                string = string.format(master_schema_entry.stringify(padding + "\t", print_b_tree_root_pages))
+ return string
+
+ @property
+ def master_schema_b_tree_root_page_numbers(self):
+
+ """
+
+ This property will return a list of all of the root page numbers obtained from all master schema entries but
+ only in the following cases:
+ 1.) The entry has a root page number and is not None.
+ 2.) The root page number is not 0.
+
+ Therefore, if the entries were called manually and inspected, master schema entries that are not in
+        this returned page number list may have a root page number that is either 0 or None.
+
+ Note: Originally there was a method to retrieve root b-tree pages directly from the master schema. This was
+ changed by just having the master schema report the root page numbers and then have the client retrieve
+ them as needed from the version interface itself. In regards to pulling out the root pages the following
+ note was made that still applies:
+
+ Additional investigation needs to be done here to see and confirm exactly where the root page can be
+ 0 or None. Right now we know of that according to the documentation "rows that define views,
+              0 or None. Right now we know that, according to the documentation, "rows that define views,
+ 1.) None seems to be used for triggers and views.
+ 2.) The root page number 0 seems to be used for virtual tables.
+
+ Again additional investigation needs to be done here but these should be documented and checked. It
+ may be better to check this in the subclasses themselves instead of here (or both).
+
+ :return: list A list of int data types representing the root page numbers from the master schema entries.
+
+ """
+
+ return [entry.root_page_number for entry in self.master_schema_entries if entry.root_page_number]
+
+ @staticmethod
+ def _create_master_schema_entry_data_named_tuple(b_tree_table_leaf_page_number, cell, database_text_encoding):
+
+ logger = getLogger(LOGGER_NAME)
+
+ record_columns = dict(map(lambda x: [x.index, x], cell.payload.record_columns))
+
+ if MASTER_SCHEMA_COLUMN.TYPE not in record_columns:
+ log_message = "No type column found in record columns for cell index: {}.".format(cell.index)
+ logger.error(log_message)
+ raise MasterSchemaParsingError(log_message)
+
+ if not record_columns[MASTER_SCHEMA_COLUMN.TYPE].value:
+ log_message = "No type value set in type record column index: {} for cell index: {}."
+ log_message = log_message.format(record_columns[MASTER_SCHEMA_COLUMN.TYPE].index, cell.index)
+ logger.error(log_message)
+ raise MasterSchemaParsingError(log_message)
+
+ row_type = record_columns[MASTER_SCHEMA_COLUMN.TYPE].value.decode(database_text_encoding)
+
+ if MASTER_SCHEMA_COLUMN.SQL not in record_columns:
+ log_message = "No sql column found in record columns for cell index: {}.".format(cell.index)
+ logger.error(log_message)
+ raise MasterSchemaParsingError(log_message)
+
+ """
+
+        Note: The value in the SQL record column may be None if there is an index internal schema object found.
+
+ """
+
+ sql_value = record_columns[MASTER_SCHEMA_COLUMN.SQL].value
+ sql = sql_value.decode(database_text_encoding) if sql_value else None
+
+ return MasterSchema.MasterSchemaEntryData(record_columns, row_type, sql, b_tree_table_leaf_page_number, cell)
+
+ @staticmethod
+ def _parse_table_interior(b_tree_table_interior_page, database_text_encoding):
+
+ logger = getLogger(LOGGER_NAME)
+
+ pages = [b_tree_table_interior_page.right_most_page]
+ for b_tree_table_interior_cell in b_tree_table_interior_page.cells:
+ pages.append(b_tree_table_interior_cell.left_child_page)
+
+ """
+
+ The master schema entry data attribute below is a dictionary with up to four keys in it representing each of
+ the four types of master schema entries: index, table, trigger, and view pointing to an array of row data
+ where each entry is a MasterSchemaEntryData object describing an entry of that type.
+
+ """
+
+ master_schema_entry_data = {}
+
+ for page in pages:
+
+ if isinstance(page, TableInteriorPage):
+ returned_master_schema_entry_data = MasterSchema._parse_table_interior(page, database_text_encoding)
+ elif isinstance(page, TableLeafPage):
+ returned_master_schema_entry_data = MasterSchema._parse_table_leaf(page, database_text_encoding)
+ else:
+ log_message = "Invalid page type found: {} when expecting TableInteriorPage or TableLeafPage."
+ log_message = log_message.format(type(page))
+ logger.error(log_message)
+ raise MasterSchemaParsingError(log_message)
+
+ if not returned_master_schema_entry_data:
+ log_message = "Returned master schema entry data was not set."
+ logger.error(log_message)
+ raise MasterSchemaParsingError(log_message)
+
+ for row_type, row_type_data in returned_master_schema_entry_data.iteritems():
+ if row_type in master_schema_entry_data:
+ master_schema_entry_data[row_type].extend(row_type_data)
+ else:
+ master_schema_entry_data[row_type] = row_type_data
+
+ return master_schema_entry_data
+
+ @staticmethod
+ def _parse_table_leaf(b_tree_table_leaf_page, database_text_encoding):
+
+ logger = getLogger(LOGGER_NAME)
+
+ """
+
+ All leaf pages should have at least one cell entry in them unless they are the root page. If the leaf page
+ is the root page, it can have 0 cells indicating no schema.
+
+ """
+
+ if len(b_tree_table_leaf_page.cells) == 0 and b_tree_table_leaf_page.number != SQLITE_MASTER_SCHEMA_ROOT_PAGE:
+ log_message = "Length of cells on leaf page is 0 and page number is: {}."
+ log_message = log_message.format(b_tree_table_leaf_page.number)
+ logger.error(log_message)
+ raise MasterSchemaParsingError(log_message)
+
+ """
+
+ The master schema entry data attribute below is a dictionary with up to four keys in it representing each of
+ the four types of master schema entries: index, table, trigger, and view pointing to an array of row data
+ where each entry is a MasterSchemaEntryData object describing an entry of that type.
+
+ """
+
+ master_schema_entry_data = {}
+
+ for cell in b_tree_table_leaf_page.cells:
+ entry_data = MasterSchema._create_master_schema_entry_data_named_tuple(b_tree_table_leaf_page.number, cell,
+ database_text_encoding)
+ if entry_data.row_type not in master_schema_entry_data:
+ master_schema_entry_data[entry_data.row_type] = [entry_data]
+ else:
+ master_schema_entry_data[entry_data.row_type].append(entry_data)
+
+ return master_schema_entry_data
+
+
+class MasterSchemaRow(object):
+
+ __metaclass__ = ABCMeta
+
+ @abstractmethod
+ def __init__(self, version_interface, b_tree_table_leaf_page_number, b_tree_table_leaf_cell, record_columns):
+
+ logger = getLogger(LOGGER_NAME)
+
+ if not isinstance(b_tree_table_leaf_cell, TableLeafCell):
+ log_message = "Invalid cell type found: {} when expecting TableLeafCell."
+ log_message = log_message.format(type(b_tree_table_leaf_cell))
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ self._version_interface = version_interface
+
+ self.b_tree_table_leaf_page_number = b_tree_table_leaf_page_number
+ self.version_number = self._version_interface.version_number
+ self.page_version_number = self._version_interface.get_page_version(self.b_tree_table_leaf_page_number)
+
+ self.row_id = b_tree_table_leaf_cell.row_id
+ self.row_md5_hex_digest = b_tree_table_leaf_cell.md5_hex_digest
+ self.record_md5_hex_digest = b_tree_table_leaf_cell.payload.md5_hex_digest
+ self.record_columns = record_columns
+
+ if len(self.record_columns) != MASTER_SCHEMA_NUMBER_OF_COLUMNS:
+ log_message = "Invalid number of columns: {} when expected {} for row id: {} on page: {}."
+ log_message = log_message.format(len(self.record_columns), MASTER_SCHEMA_NUMBER_OF_COLUMNS,
+ self.row_id, self.b_tree_table_leaf_page_number)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ if not self.record_columns[MASTER_SCHEMA_COLUMN.TYPE].value:
+ log_message = "No master schema column row type value found for row id: {} on page: {}."
+ log_message = log_message.format(self.row_id, self.b_tree_table_leaf_page_number)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ if not self.record_columns[MASTER_SCHEMA_COLUMN.NAME].value:
+ log_message = "No master schema column name value found for row id: {} on page: {}."
+ log_message = log_message.format(self.row_id, self.b_tree_table_leaf_page_number)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ if not self.record_columns[MASTER_SCHEMA_COLUMN.TABLE_NAME].value:
+ log_message = "No master schema column table name value found for row id: {} on page: {}."
+ log_message = log_message.format(self.row_id, self.b_tree_table_leaf_page_number)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ # Get the database text encoding
+ database_text_encoding = version_interface.database_text_encoding
+
+ # The fields are read out as strings for better incorporation with calling classes when hashing since,
+ # if this is not done, they are bytearray types which are unhashable and may throw an exception.
+ self.row_type = self.record_columns[MASTER_SCHEMA_COLUMN.TYPE].value.decode(database_text_encoding)
+ self.name = self.record_columns[MASTER_SCHEMA_COLUMN.NAME].value.decode(database_text_encoding)
+ self.table_name = self.record_columns[MASTER_SCHEMA_COLUMN.TABLE_NAME].value.decode(database_text_encoding)
+ self.root_page_number = self.record_columns[MASTER_SCHEMA_COLUMN.ROOT_PAGE].value
+
+ sql_value = self.record_columns[MASTER_SCHEMA_COLUMN.SQL].value
+ self.sql = sql_value.decode(database_text_encoding) if sql_value else None
+
+ self.sql_has_comments = False
+
+ self.comments = []
+
+ if self.sql:
+
+ """
+
+ Below describes the documentation and assumptions that have been made while parsing the schema.
+ It is important to keep in mind that these may change in the future or might be different for
+ older SQLite files. Most of the files being tested with are in the range of SQLite versions 3.6 to 3.9.
+
+ For the SQLITE_MASTER_TABLE_TYPE the table type could be a normal table or virtual table.
+ The two SQL commands this would account for would be CREATE TABLE and CREATE VIRTUAL TABLE.
+
+ According to the SQLite File Format Documentation, the following modifications are done to the
+ SQL commands before storing them into the SQLite master table SQL column:
+ 1.) The CREATE, TABLE, VIEW, TRIGGER, and INDEX keywords at the beginning of the statement are
+ converted to all upper case letters.
+ 2.) The TEMP or TEMPORARY keyword is removed if it occurs after the initial CREATE keyword.
+ 3.) Any database name qualifier that occurs prior to the name of the object being created is removed.
+ 4.) Leading spaces are removed.
+ 5.) All spaces following the first two keywords are converted into a single space.
+
+ To note, number 5 above does not work exactly as worded. The spaces are removed throughout all of
+ the main keywords up to the table name. After the table name, all spaces and capitalization are kept as
+ entered.
+
+ Due to this we don't have to check for the TEMP, TEMPORARY, or database name qualifier such as
+ main.[DB NAME], temp.[DB NAME], etc. These qualifiers only place the table into the corresponding
+ opened database (schema name), and this portion of the statement is then removed. As a side note,
+ temporary database files are stored in the temp directory of the user along with any additional files
+ such as a rollback journal or WAL file.
+
+ Also, virtual tables were not incorporated until SQLite version 3.8.2 and therefore will not appear
+ in earlier versions of SQLite.
+
+ The statement "IF NOT EXISTS" is also removed but not documented in the above for table creation.
+ Therefore, we do not need to check for this use case.
+
+ """
+
+ # Check if comments exist
+ if self.sql.find("--") != -1 or self.sql.find("/*") != -1:
+ self.sql_has_comments = True
+
+ """
+
+ Below we make a unique identifier for this master schema entry. This is built from all of the fields in the
+ master schema entry except for the root page.
+
+ Note: All fields will have a value except for the SQL. This could be None but "None" will just be used in
+ the creation of the identifier.
+
+ """
+
+ master_schema_entry_identifier_string = "{}{}{}{}{}".format(self.row_id, self.row_type, self.name,
+ self.table_name, self.sql)
+ self.md5_hash_identifier = get_md5_hash(master_schema_entry_identifier_string)
+
+ def __repr__(self):
+ return self.__str__().encode("hex")
+
+ def __str__(self):
+ return sub("\t", "", sub("\n", " ", self.stringify()))
+
+ def stringify(self, padding="", print_record_columns=True):
+ string = padding + "Version Number: {}\n" \
+ + padding + "Page Version Number: {}\n" \
+ + padding + "B-Tree Table Leaf Page Number: {}\n" \
+ + padding + "Row ID: {}\n" \
+ + padding + "Row MD5 Hex Digest: {}\n" \
+ + padding + "Record MD5 Hex Digest: {}\n" \
+ + padding + "Row Type: {}\n" \
+ + padding + "Name: {}\n" \
+ + padding + "Table Name: {}\n" \
+ + padding + "Root Page Number: {}\n" \
+ + padding + "SQL: {}\n" \
+ + padding + "SQL Has Comments: {}\n" \
+ + padding + "MD5 Hash Identifier: {}"
+ string = string.format(self.version_number,
+ self.page_version_number,
+ self.b_tree_table_leaf_page_number,
+ self.row_id,
+ self.row_md5_hex_digest,
+ self.record_md5_hex_digest,
+ self.row_type,
+ self.name,
+ self.table_name,
+ self.root_page_number,
+ self.sql,
+ self.sql_has_comments,
+ self.md5_hash_identifier)
+ for comment in self.comments:
+ string += "\n" + padding + "Comment: {}".format(comment)
+ if print_record_columns:
+ for index, record_column in self.record_columns.iteritems():
+ string += "\n" \
+ + padding + "Record Column {}:\n{}:".format(index, record_column.stringify(padding + "\t"))
+ return string
+
+ @staticmethod
+ def _get_master_schema_row_name_and_remaining_sql(row_type, name, sql, remaining_sql_command):
+
+ # Initialize the logger
+ logger = getLogger(LOGGER_NAME)
+
+ """
+
+ This method can only be called on table or index types.
+
+ """
+
+ if row_type not in [MASTER_SCHEMA_ROW_TYPE.TABLE, MASTER_SCHEMA_ROW_TYPE.INDEX]:
+ log_message = "Invalid row type: {} defined when parsing master schema row name: {} from sql: {} when " \
+ "type {} or {} was expected."
+ log_message = log_message.format(row_type, name, sql,
+ MASTER_SCHEMA_ROW_TYPE.TABLE, MASTER_SCHEMA_ROW_TYPE.INDEX)
+ logger.error(log_message)
+ raise ValueError(log_message)
+
+ """
+
+ Since the table or index name can be in brackets, backticks, single quotes, or double quotes, we check to
+ make sure the table or index name is not in single or double quotes. If it is, our job is fairly simple,
+ otherwise we parse it normally.
+
+ Note: Characters like the '.' character are not allowed since they imply a schema name. However, if the name is in
+ brackets, backticks, or quotes (single or double), it is allowed.
+
+ Note: There may be comments following the table name preceding the column definitions, i.e. the "(...)" portion
+ of the SQL. If the table name has brackets, backticks, or quotes (single or double) around it,
+ then this use case is handled in the way the table name is pulled out. However, if there are not
+ brackets, backticks, or quotes around the table name, the table name and remaining SQL have to be
+ accounted for differently in the case that there are comments.
+
+ Note: SQLite allows backticks for compatibility with MySQL and allows brackets for compatibility with
+ Microsoft databases.
+
+ """
+
+ if remaining_sql_command[0] == "[":
+
+ # The table name or index name is surrounded by brackets
+ match_object = match("^\[(.*?)\]", remaining_sql_command)
+
+ if not match_object:
+ log_message = "No bracket match found for {} name in sql for {} row name: {} and sql: {}."
+ log_message = log_message.format(row_type, row_type, name, sql)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ # Set the parsed name and strip the brackets
+ parsed_name = remaining_sql_command[match_object.start():match_object.end()].strip("[]")
+
+ # Set the remaining sql
+ remaining_sql_command = remaining_sql_command[match_object.end():]
+
+ # Return the parsed name and remaining sql command
+ return parsed_name, remaining_sql_command
+
+ elif remaining_sql_command[0] == "`":
+
+ # The table name or index name is surrounded by backticks
+ match_object = match("^`(.*?)`", remaining_sql_command)
+
+ if not match_object:
+ log_message = "No backtick match found for {} name in sql for {} row name: {} and sql: {}."
+ log_message = log_message.format(row_type, row_type, name, sql)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ # Set the parsed name and strip the backticks
+ parsed_name = remaining_sql_command[match_object.start():match_object.end()].strip("`")
+
+ # Set the remaining sql
+ remaining_sql_command = remaining_sql_command[match_object.end():]
+
+ # Return the parsed name and remaining sql command
+ return parsed_name, remaining_sql_command
+
+ elif remaining_sql_command[0] == "\'":
+
+ # The table name or index name is surrounded by single quotes
+ match_object = match("^\'(.*?)\'", remaining_sql_command)
+
+ if not match_object:
+ log_message = "No single quote match found for {} name in sql for {} row name: {} and sql: {}."
+ log_message = log_message.format(row_type, row_type, name, sql)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ # Set the parsed name and strip the single quotes
+ parsed_name = remaining_sql_command[match_object.start():match_object.end()].strip("\'")
+
+ # Set the remaining sql
+ remaining_sql_command = remaining_sql_command[match_object.end():]
+
+ # Return the parsed name and remaining sql command
+ return parsed_name, remaining_sql_command
+
+ elif remaining_sql_command[0] == "\"":
+
+ # The table name or index name is surrounded by double quotes
+ match_object = match("^\"(.*?)\"", remaining_sql_command)
+
+ if not match_object:
+ log_message = "No double quote match found for {} name in sql for {} row name: {} and sql: {}."
+ log_message = log_message.format(row_type, row_type, name, sql)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ # Set the parsed name and strip the double quotes
+ parsed_name = remaining_sql_command[match_object.start():match_object.end()].strip("\"")
+
+ # Set the remaining sql
+ remaining_sql_command = remaining_sql_command[match_object.end():]
+
+ # Return the parsed name and remaining sql command
+ return parsed_name, remaining_sql_command
+
+ else:
+
+ # Iterate through the characters in the remaining sql command
+ for index, character in enumerate(remaining_sql_command):
+
+ """
+
+ This works for both table and index since with indexes:
+ 1.) Indexes: Following the index name there has to be a newline, space or a comment indicator.
+ There is no use case for it to be anything else such as the opening parenthesis.
+ 2.) Tables: Following the table name, there may or may not be a space between the table name and
+ opening parenthesis. There may also be a comment (with or without a space) directly
+ after the table name. Here we only care about the case where a comment indicator appears directly
+ after the table name without a space. We also check for newlines.
+
+ Note: This may be a bit more time consuming for virtual table module names since at this point you
+ could just parse out the name by finding the next " " character index as the ending index for
+ the name.
+
+ Note: A single "-" character is not allowed here as it is within the column definitions such as
+ default negative integer values, etc.
+
+ """
+
+ # See if the character is a newline, single space, opening parenthesis, or comment indicator
+ if character == '\n' or character == ' ' or character == '(' or character == '-' or character == '/':
+
+ # Check to make sure the full comment indicators were found for "--" and "/*"
+ if (character == '-' and remaining_sql_command[index + 1] != '-') or \
+ (character == '/' and remaining_sql_command[index + 1] != '*'):
+
+ log_message = "Comment indicator '{}' found followed by an invalid secondary comment " \
+ "indicator: {} found in {} name in sql for {} row name: {} and sql: {}."
+ log_message = log_message.format(character, remaining_sql_command[index + 1],
+ row_type, row_type, name, sql)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ # Set the table name or index name
+ parsed_name = remaining_sql_command[:index]
+
+ # Set the remaining sql
+ remaining_sql_command_start_offset = remaining_sql_command.index(parsed_name) + len(parsed_name)
+ remaining_sql_command = remaining_sql_command[remaining_sql_command_start_offset:]
+
+ # Return the parsed name and remaining sql command
+ return parsed_name, remaining_sql_command
+
+ # See if the character is a "." since this would apply a schema name which we know shouldn't exist.
+ elif character == '.':
+ log_message = "Invalid \'.\' character found in {} name in sql for " \
+ "{} row name: {} and sql: {}."
+ log_message = log_message.format(row_type, row_type, name, sql)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ """
+
+ Note: The index method could throw an exception if the table name or index name is not found but this
+ use case is ignored here since we just retrieved it from the remaining SQL command itself.
+
+ """
+
+ log_message = "No {} name found in sql for {} row name: {} and sql: {}."
+ log_message = log_message.format(row_type, row_type, name, sql)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+
+class TableRow(MasterSchemaRow):
+
+ def __init__(self, version, b_tree_table_leaf_page_number, b_tree_table_leaf_cell, record_columns):
+
+ # Call the superclass to initialize this object
+ super(TableRow, self).__init__(version, b_tree_table_leaf_page_number, b_tree_table_leaf_cell, record_columns)
+
+ # Initialize the logger
+ logger = getLogger(LOGGER_NAME)
+
+ # Make sure this is the table row type after being initialized by its superclass
+ if self.row_type != MASTER_SCHEMA_ROW_TYPE.TABLE:
+ log_message = "Invalid row type: {} when expecting: {} with name: {}."
+ log_message = log_message.format(self.row_type, MASTER_SCHEMA_ROW_TYPE.TABLE, self.name)
+ logger.error(log_message)
+ raise ValueError(log_message)
+
+ """
+
+ The SQL is always specified for tables (as well as triggers and views). The majority of indexes also have
+ the SQL specified. However, "internal indexes" created by "unique" or "primary key" constraints on ordinary
+ tables do not have SQL.
+
+ """
+
+ # The sql statement must exist for table rows
+ if not self.sql:
+ log_message = "SQL does not exist for table row with name: {}."
+ log_message = log_message.format(self.name)
+ logger.error(log_message)
+ raise ValueError(log_message)
+
+ def stringify(self, padding="", print_record_columns=True):
+ return super(TableRow, self).stringify(padding, print_record_columns)
+
+ @staticmethod
+ def _get_module_name_and_remaining_sql(name, sql, remaining_sql_command):
+ return MasterSchemaRow._get_master_schema_row_name_and_remaining_sql(MASTER_SCHEMA_ROW_TYPE.TABLE, name, sql,
+ remaining_sql_command)
+
+
+class OrdinaryTableRow(TableRow):
+
+ def __init__(self, version, b_tree_table_leaf_page_number, b_tree_table_leaf_cell, record_columns):
+
+ # Call the superclass to initialize this object
+ super(OrdinaryTableRow, self).__init__(version, b_tree_table_leaf_page_number,
+ b_tree_table_leaf_cell, record_columns)
+
+ # Initialize the logger
+ logger = getLogger(LOGGER_NAME)
+
+ # Make sure this is a create table statement
+ if not self.sql.startswith(CREATE_TABLE_CLAUSE):
+ log_message = "Invalid sql for create ordinary table statement: {} with name: {}."
+ log_message = log_message.format(self.sql, self.name)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ # Declare the column definitions and table constraints
+ self.column_definitions = []
+ self.table_constraints = []
+
+ """
+
+ Note: The "without rowid" option can not be used in virtual tables.
+
+ Note: Virtual tables do not have any "internal schema objects".
+
+ """
+
+ self.without_row_id = False
+ self.internal_schema_object = False
+
+ # Retrieve the sql command to this table and replace all multiple spaces with a single space
+ sql_command = sub("[\t\r\f\v ][\t\r\f\v ]+", " ", self.sql)
+
+ # Set the create command offset to point to the end of the "create table" statement
+ create_command_offset = len(CREATE_TABLE_CLAUSE)
+
+ """
+
+ We take off the "create table" beginning portion of the command here leaving the table name followed by
+ the column definitions and table constraints with an optional "without rowid" at the end.
+
+ Note: The schema names are never included in the statements themselves since they just redirect which file
+ the data will be stored in. Schemas act more as file handles to open SQLite files in the driver.
+
+ """
+
+ # Left strip the "create table" command from the beginning of the create table statement removing any whitespace
+ remaining_sql_command = str(sql_command[create_command_offset:]).lstrip()
+
+ """
+
+ We now parse through the remaining SQL command to find the table name. Once we find the table name and set it,
+ we remove the table name from the remaining SQL command.
+
+ Note: The table and/or column names may be in single or double quotes. For example, quotes need to be used
+ if a table name has spaces. This is only seen in the SQL statement. These quotes are removed in the
+ name and table name fields.
+
+ Note: It was observed that there may be or may not be a space between the table name and opening parenthesis.
+
+ Note: There may also be a comment directly following the table name (with or without a space character) before
+ the column definitions. The name parsing function above checks for this use
+ case but does not remove the comment from the returned string. Therefore, comments need to be
+ checked for here.
+
+ Note: The above was noticed when one of the sequence tables automatically created by SQLite in some use cases
+ was parsed. The following tables are examples of this in the documentation:
+ 1.) CREATE TABLE sqlite_sequence(name,seq);
+ 2.) CREATE TABLE sqlite_stat1(tbl,idx,stat);
+ 3.) CREATE TABLE sqlite_stat2(tbl,idx,sampleno,sample)
+ 4.) CREATE TABLE sqlite_stat3(tbl,idx,nEq,nLt,nDLt,sample)
+ 5.) CREATE TABLE sqlite_stat4(tbl,idx,nEq,nLt,nDLt,sample);
+
+ These use cases are "internal schema objects", and any master schema objects with a name beginning
+ with "sqlite_" are these types of objects. The prefix "sqlite_" used in the name of SQLite master schema
+ rows is reserved for use by SQLite.
+
+ Note: There is no current use case of having "internal schema objects" for virtual tables and therefore
+ no virtual table name will start with "sqlite_".
+
+ """
+
+ # Retrieve the table name and remaining sql after the table name is removed
+ table_name, remaining_sql_command = \
+ MasterSchemaRow._get_master_schema_row_name_and_remaining_sql(self.row_type, self.name, self.sql,
+ remaining_sql_command)
+
+ # Left strip the remaining sql command
+ remaining_sql_command = remaining_sql_command.lstrip()
+
+ # Make sure the table name was set, which may not be the case if for some reason the remaining sql command
+ # did not contain a single space character, which would not be an acceptable create table statement
+ if not table_name:
+ log_message = "The table name was not set while parsing sql for table row name: {} and sql: {}."
+ log_message = log_message.format(self.name, self.sql)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ # Check the table name is equal to the name as specified in the sqlite documentation
+ if table_name.lower() != self.name.lower():
+ log_message = "For table master schema row: {}, the derived table name: {} from the sql: {} " \
+ "does not match the name: {},"
+ log_message = log_message.format(self.row_id, table_name, self.sql, self.name)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ # Check the table name is equal to the table name as specified in the sqlite documentation
+ if table_name.lower() != self.table_name.lower():
+ log_message = "For table master schema row: {}, the derived table name: {} from the sql: {} " \
+ "does not match the table name: {},"
+ log_message = log_message.format(self.row_id, table_name, self.sql, self.table_name)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ """
+
+ Check the table name to see if it is an internal schema object starting with "sqlite_". More investigation is
+ needed for these objects if there are any different use cases that may apply to them. It appears that these
+ can be parsed just as normal tables. Therefore, we only throw an info message to the logging framework and
+ continue on.
+
+ """
+
+ if self.table_name.startswith(INTERNAL_SCHEMA_OBJECT_PREFIX):
+ self.internal_schema_object = True
+
+ log_message = "Master schema ordinary table row found as internal schema object with name: {}, " \
+ "table name: {} and sql: {} and may have use cases that still need to be addressed."
+ log_message = log_message.format(self.name, self.table_name, self.sql)
+ logger.info(log_message)
+
+ """
+
+ The remaining SQL command must now either start with an opening parenthesis "(", a comment indicator, or "AS".
+ Comment indicators would be either the "--" or "/*" character sequences.
+
+ Note: At this moment the "as [select-stmt]" is not addressed and if detected, a NotImplementedError
+ will be thrown.
+
+ Note: Comments are parsed differently for each row. In the case of a normal table row comments can be
+ anywhere in the create table statement following the name. Therefore the beginning statement:
+ "CREATE TABLE [NAME]" cannot include any comments, but comments can directly follow the name,
+ with or without a space. It was also noted that comments will not appear after the ending ")"
+ parenthesis after the column definitions unless "WITHOUT ROWID" is specified in which case they
+ will occur even after the "WITHOUT ROWID" SQL.
+
+ """
+
+ # Check for comments after the table name, before the column definitions
+ while remaining_sql_command.startswith(("--", "/*")):
+ comment, remaining_sql_command = parse_comment_from_sql_segment(remaining_sql_command)
+ self.comments.append(comment.rstrip())
+ remaining_sql_command = remaining_sql_command.lstrip()
+
+ # See if the opening parenthesis is not the first character
+ if remaining_sql_command.find("(") != 0:
+
+ # Check if this remaining sql statement starts with "AS"
+ if remaining_sql_command[:len(ORDINARY_TABLE_AS_CLAUSE)].upper() == ORDINARY_TABLE_AS_CLAUSE:
+ log_message = "Create table statement has an \"AS\" clause for master schema table row with " \
+ "name: {} and sql: {} and is not implemented."
+ log_message = log_message.format(self.name, self.sql)
+ logger.error(log_message)
+ raise NotImplementedError(log_message)
+
+ # If the remaining sql statement does not hit the above two use cases then this is an erroneous statement
+ else:
+ log_message = "Create table statement has an unknown clause for master schema table row with " \
+ "name: {} and sql: {}."
+ log_message = log_message.format(self.name, self.sql)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ """
+
+ Due to the checks above and the fact that the "AS" use case is not handled yet, we can be assured that
+ this create statement remaining SQL command is now in the form of: "(...) ...".
+
+ Next we will parse out the column definitions and table constraints between the "(" and ")". After this is done,
+ we will investigate the trailing portion of the create statement past the closing parenthesis if it exists.
+
+ Note: If the "AS" statement was used instead of the opening parenthesis here, the create table statement
+ would be needed to be parsed differently and not in the form of: "(...) ...". Due to this, there
+ is not the same concept of a trailing portion of the create statement past the closing parenthesis.
+ Instead the remaining statement following the "AS" would be a select statement and need to be parsed
+ as such.
+
+ """
+
+ # The first thing is to get the closing parenthesis index to the column definitions and table constraints
+ closing_parenthesis_index = get_index_of_closing_parenthesis(remaining_sql_command)
+
+ # Declare the definitions to be the "(...)" section of the "(...) ..." explained above
+ definitions = remaining_sql_command[:closing_parenthesis_index + 1]
+
+ # Double check the definitions has a beginning opening parenthesis and ends with a closing parenthesis
+ if definitions.find("(") != 0 or definitions.rfind(")") != len(definitions) - 1:
+ log_message = "The definitions are not surrounded by parenthesis as expected for table row with name: {}" \
+ "and sql: {} with definitions: {}."
+ log_message = log_message.format(self.name, self.sql, definitions)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ # Remove the beginning and ending parenthesis and left strip the string in case whitespace characters
+ # appear directly after the opening parenthesis, and set the result back to the definitions. Whitespace before
+ # the ending parenthesis is kept since there could be a "\n" character ending a "--" comment.
+ definitions = definitions[1:len(definitions) - 1].lstrip()
+
+ """
+
+ At this point the column definitions, column constraints, and table constraints should be in the format of:
+ ( column-name [[type-name] [column constraint]] [, ...,] [, table constraint] [, ...,] )
+ where the brackets [] represent optional declaration and [, ...,] represents repeats of the previous argument.
+
+ A definition can be a column definition or table constraint:
+ 1.) A column definition is in the form of: column-name [[type-name] [column constraint]]
+ column-name [type-name] [COLUMN-CONSTRAINT ....]
+ 2.) A table constraint is in the form of: [table-constraint]
+ [TABLE-CONSTRAINT ...]
+
+ In order to parse the column definitions and table constraints we need to break them up in their respective
+ segments. Since parentheses and commas exist in their respective segments, we cannot simply do a split on
+ a comma to divide up the sections. In order to break up the sections correctly, we iterate through the
+ definitions string looking for the commas but if we find an opening parenthesis, skip to the closing
+ parenthesis ignoring commas if they exist as well as other characters in that portion of the string. Also,
+ if we find a quote character such as " or ', we need to skip to the following " or ' character.
+
+ According to the documentation it appears that commas separate each segment defining the column definitions
+ and table constraints and only appear within a pair of opening/closing parenthesis within the segment
+ otherwise. Rather than making an assumption when this does not hold, an exception is raised.
+
+ As we move along parsing the individual segments, we check whether the beginning of each new section (minus
+ leading whitespace that is removed) begins with one of the table constraint prefaces. If it does, we know
+ that is the end of the column definitions and the start of the table constraints. Once the first (if any)
+ segment matches one of the table constraint prefaces, then that and any following definitions should all be
+ table constraints and no more column definitions should show up. If any definition following the first table
+ constraint does not begin with a table constraint preface, then an exception will be thrown.
+
+ To note, if the first definition found is a table constraint, then an exception will be thrown as well. Also,
+ at least one column definition must be present in the definitions in order to be a proper create statement.
+ According to the documentation, this appears to be true and therefore if this use case is detected, an exception
+ is thrown.
+
+ Note: When a table is created it must have at least one column.
+
+ Note: The above documentation does not account for comments. Comments may be found anywhere within the
+ definitions. However, if quotes are used to define a default value, data type, etc. the comment is
+ ignored.
+
+ Example: CREATE TABLE example (text_field "TEXT -- I am a text field")
+ In the above example, the data type is "TEXT -- I am a text field" which resolves to a TEXT
+ storage class and from SQLite's perspective, there is no comment.
+
+ Note: The above also gives merit to the following use case:
+
+ Example: CREATE TABLE example (text_field "TEXT -- maintenance information")
+ In the above example, the storage class IS NOT TEXT. It is INTEGER since "int" appears
+ in the string and is checked for first by SQLite when checking the storage class.
+
+ Note: If a value, or other field has a "," in it, it also gets ignored in the same manner if inside single or
+ double quotes. As an example usage, this was first noticed in the DEFAULT clause of a column definition
+ which contained "," characters in the default text string.
+
+ """
+
+ # Make sure the definitions is not an empty string
+ if not definitions:
+ log_message = "No definitions parsed for the table row name: {} and sql: {}."
+ log_message = log_message.format(self.name, self.sql)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ # Define an index for the column definitions and table constraints
+ definition_index = 0
+
+ # Define an index for parsing the definitions and the beginning definition index
+ character_index = 0
+ beginning_definition_index = 0
+
+ # Define a boolean for when the table constraints are found
+ table_constraints_found = False
+
+ # Initialize comments
+ column_definition_comments = []
+
+ # Iterate through all of the characters in the definitions
+ while character_index < len(definitions):
+
+ # Get the current indexed character
+ character = definitions[character_index]
+
+ """
+
+ Check to make sure we are not encountering a comment.
+
+ Note: A single "-" is allowed since it can be before a negative default value for example in the create
+ statement.
+
+ """
+
+ if character is "-":
+
+ # Check to make sure the full comment indicator was found for "--"
+ if definitions[character_index + 1] == "-":
+ character_index = definitions.index("\n", character_index)
+
+ elif character is "/":
+
+ # Check to make sure the full comment indicator was found for "/*"
+ if definitions[character_index + 1] != "*":
+ log_message = "Comment indicator '{}' found followed by an invalid secondary comment " \
+ "indicator: {} found in {}."
+ log_message = log_message.format(character, definitions[character_index + 1], definitions)
+ logger.error(log_message)
+ raise MasterSchemaParsingError(log_message)
+
+ character_index = definitions.index("*/", character_index) + 1
+
+ """
+
+ Below, we account for column definition comments that may have commas or parenthesis in them in order to
+ make sure a particular portion of a comment doesn't cause the column definition to be parsed incorrectly.
+
+ This is also done with backticks, single, and double quotes.
+
+ Note: SQLite allows backticks for compatibility with MySQL and allows brackets for compatibility with
+ Microsoft databases.
+
+ """
+
+ # Check if the character is an opening bracket, [, and skip to the closing bracket if so
+ if character == "[":
+
+ try:
+
+ # Set the character index to the closing bracket to this opening one
+ character_index = definitions.index("]", character_index + 1)
+
+ except ValueError:
+
+ log_message = "No ending \"]\" character found in the definitions: {} starting from index: {} " \
+ "while parsing the remaining sql: {} for the table row name: {}."
+ log_message = log_message.format(definitions, character_index + 1, remaining_sql_command, self.name)
+ logger.error(log_message)
+ raise
+
+ # Check if the character is an opening backtick, `, and skip to the closing backtick if so
+ if character == "`":
+
+ try:
+
+ # Set the character index to the closing backtick to this opening one
+ character_index = definitions.index("`", character_index + 1)
+
+ except ValueError:
+
+ log_message = "No ending \"`\" character found in the definitions: {} starting from index: {} " \
+ "while parsing the remaining sql: {} for the table row name: {}."
+ log_message = log_message.format(definitions, character_index + 1, remaining_sql_command, self.name)
+ logger.error(log_message)
+ raise
+
+ # Check if the character is an opening single quote, ', and skip to the closing single quote if so
+ if character == "'":
+
+ try:
+
+ # Set the character index to the closing single quote to this opening one
+ character_index = definitions.index("'", character_index + 1)
+
+ except ValueError:
+
+ log_message = "No ending \"'\" character found in the definitions: {} starting from index: {} " \
+ "while parsing the remaining sql: {} for the table row name: {}."
+ log_message = log_message.format(definitions, character_index + 1, remaining_sql_command, self.name)
+ logger.error(log_message)
+ raise
+
+ # Check if the character is an opening double quote, ", and skip to the closing double quote if so
+ if character == "\"":
+
+ try:
+
+ # Set the character index to the closing double quote to this opening one
+ character_index = definitions.index("\"", character_index + 1)
+
+ except ValueError:
+
+ log_message = "No ending \"\"\" character found in the definitions: {} starting from index: {} " \
+ "while parsing the remaining sql: {} for the table row name: {}."
+ log_message = log_message.format(definitions, character_index + 1, remaining_sql_command, self.name)
+ logger.error(log_message)
+ raise
+
+ # Check if the character is an opening parenthesis and skip to the closing parenthesis if so
+ if character == "(":
+
+ # Set the character index to the closing parenthesis to this opening one and increment the index
+ character_index = get_index_of_closing_parenthesis(definitions, character_index)
+
+ # If we find a closing parenthesis character then something went wrong and an exception is thrown
+ elif character == ")":
+ log_message = "An error occurred while parsing the remaining sql: {} for the table row name: {}."
+ log_message = log_message.format(remaining_sql_command, self.name)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ """
+
+ Above we update the index in the case that we find an opening parenthesis to the closing parenthesis index.
+ Below we check if the character is a comma or at the end of the definition string in order to make the next
+ and/or possibly final definition.
+
+ 1.) If the character is a comma then we know we reached the end of the portion of the definition in the
+ definition string and there are more that follow.
+ 2.) If the character index + 1 == len(definitions), the character index is pointing to the last
+ element in the string and the while loop will break on the next iteration. In this case we make the
+ remaining segment into a definition.
+
+ """
+
+ # Check if we find a comma character and if so we know we reached the end of the current definition or
+ # if the character index is either at the end of the definitions string.
+ if character == "," or character_index + 1 == len(definitions):
+
+ # Initialize a variable to add to the character index if comments are found after the comma
+ ending_comments_length = 0
+
+ # If the character index is one less than the length of the definitions (at the end of the
+ # definitions string) then we want to increment it by one in order to pick up the last character.
+ # This is due to string slicing being exclusive of the ending index specified.
+ if character_index + 1 == len(definitions):
+ character_index += 1
+
+ # Otherwise a comma was found, so check if there are comments following it
+ else:
+
+ """
+
+ For column definitions and table constraints, we will only parse out the comments and send them
+ into the constructor if they start out the definition or directly follow a ",". Any other comments
+ will not be parsed here and will instead be sent into the column definition or table constraint
+ class for parsing. This was decided to be the best way to associate comments based on their
+ location in the create table statement.
+
+ Example 1: CREATE TABLE example_1 ( -- field for text
+ text_field)
+ Here the comment will be parsed out and sent in to the column definition constructor.
+
+ Example 2: CREATE TABLE example_2 ( text_field, /* text field */ integer_field,
+ /* integer field */ )
+ Here the "/* text field */" comment will be sent in as a comment to the text_field
+ column definition. The same will be true for the "/* integer field */" comment for the
+ integer_field.
+
+ Example 3: CREATE TABLE example_3 ( text_field
+ -- field for text)
+ Here the comment will be included in the column definition string and not parsed as a
+ separate comment since there is no "," character even though it's on the next line.
+
+ Example 4: CREATE TABLE example_4 ( text_field,
+ -- field for text
+ integer_field
+ -- field for integer)
+ Here, both comments will be sent in the column definition string for the integer_field
+ and not parsed separately since the first comment is after the "," and the second comment
+ is before (although no following fields are specified here) the next ",". Even though
+ this may not seem correct, the example above does not follow a consistent pattern
+ and goes against what was considered the best way to parse schema comments.
+
+ Example 5: CREATE TABLE example_5 (text_field, -- field for text
+ /* this is a field for text */
+ integer_field -- field for integer)
+ Here, the "-- field for text" comment on the first line will be parsed and sent into the
+ column definition for the text_field. However the "/* this is a field for text */"
+ comment will be parsed and sent into the second column definition. The final comment
+ "-- field for integer" will be sent in along with the integer_field as part of the
+ column definition string.
+
+ In summation, comments right at the beginning or directly following a "," in the definitions will be
+ parsed separately and sent in through the constructor of the corresponding column definition or table
+ constraint. Otherwise, the comment will be sent in as part of the definition string to the
+ appropriate class, leaving it up to that class to parse the inner comments of that definition.
+
+ Note: The reason why comments preceding the definition had to be parsed was to pull out extra
+ content from the beginning of the column definition or table constraint in order to be able
+ to detect if it was a table constraint or not.
+
+ Note: This means in the above form of parsing comments that there can be many "/* ... */" comments
+ as long as a "\n" does not appear following the ",". This means that as soon as there is a
+ "-- ... \n" comment, the parsing will end. This also means that there will always be at most
+ one "-- ... \n" comment and the end of the statement following the ",".
+
+ """
+
+ # Get the remaining definition past the comma
+ remaining_definition = definitions[character_index + 1:]
+ left_stripped_character_length = len(remaining_definition)
+ remaining_definition = sub("^[\t\r\f\v ]+", "", remaining_definition)
+ left_stripped_character_length -= len(remaining_definition)
+
+ # See if any comments in the form "/* ... */" exist and remove them if so (there may be 0 ... *)
+ while remaining_definition.startswith("/*"):
+ comment, remaining_definition = parse_comment_from_sql_segment(remaining_definition)
+ left_stripped_character_length += len(remaining_definition)
+ remaining_definition = remaining_definition.lstrip(" ")
+ left_stripped_character_length -= len(remaining_definition)
+ ending_comments_length += len(comment) + left_stripped_character_length
+ column_definition_comments.append(comment)
+
+ # See if any comments in the form "-- ... \n" exist and remove them if so (there may be 0 ... 1)
+ if remaining_definition.startswith("--"):
+ comment, remaining_definition = parse_comment_from_sql_segment(remaining_definition)
+ left_stripped_character_length += len(remaining_definition)
+ remaining_definition = remaining_definition.lstrip(" ")
+ left_stripped_character_length -= len(remaining_definition)
+ ending_comments_length += len(comment) + left_stripped_character_length
+ column_definition_comments.append(comment)
+
+ # Initialize a current definition index to validate against later
+ current_definition_index = definition_index
+
+ # Get the definition string and strip the beginning characters since we do not need any
+ # default whitespace characters there, but may need them at the end (for example in the case
+ # of a "--" comment that ends in "\n".
+ definition = definitions[beginning_definition_index:character_index].lstrip()
+
+ # Check for comments at the beginning of the definition
+ while definition.startswith(("--", "/*")):
+ comment, remaining_definition = parse_comment_from_sql_segment(definition)
+ column_definition_comments.append(comment.rstrip())
+ definition = remaining_definition.lstrip()
+
+ # Iterate through the table constraint prefaces to see if one of them starts off the definition
+ for table_constraint_preface in TABLE_CONSTRAINT_PREFACES:
+
+ # Make sure the length of the definition is at least as long as the table constraint preface
+ if len(definition) >= len(table_constraint_preface):
+
+ """
+
+ Note: Even though the column and table constraint share some of the same prefaces for
+ their constraints, this check is safe since the column definitions will never
+ start out directly with a column constraint preface name that could be confused with
+ a table constraint preface name.
+
+ Note: When the check is done on the definition, we check the next character is not one of the
+ allowed characters in a column name to make sure the constraint preface is not the
+ beginning of a longer column name where it is not actually a constraint preface
+ (example: primaryEmail). The "\w" regular expression when no LOCALE and UNICODE flags
+ are set will be equivalent to the set: [a-zA-Z0-9_].
+
+ """
+
+ # Check to see if the definition starts with the table constraint preface
+ if definition[:len(table_constraint_preface)].upper() == table_constraint_preface:
+
+ if not (len(table_constraint_preface) + 1 <= len(definition)
+ and match("\w", definition[len(table_constraint_preface)])):
+
+ # We have found a table constraint here and make sure this is not the first definition
+ if definition_index == 0:
+
+ # The first definition is a table constraint which should not occur
+ log_message = "First definition found: {} in table row with name: {} and sql: {} " \
+ "is a table constraint."
+ log_message = log_message.format(definition[:len(table_constraint_preface)],
+ self.name, self.sql)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ # The definition is a table constraint and not the first definition
+ else:
+
+ """
+
+ Note: Since we are here we assume the first column definition has already been made
+ because at least one of them had to be parsed successfully before reaching
+ this portion of the code. Therefore no additional checks need to be done
+ for checking at least one column definition existing.
+
+ """
+
+ # Create the table constraint
+ self.table_constraints.append(TableConstraint(definition_index, definition,
+ column_definition_comments))
+
+ # Set the table constraints found variable to true now
+ table_constraints_found = True
+
+ # Reinitialize the comments
+ column_definition_comments = []
+
+ # Increment the definition index
+ definition_index += 1
+
+ """
+
+ After each parsing of the definition we check if that was a table constraint. If it was we make sure
+ that the first table constraint and all ones following it are. If this iteration is not a table
+ constraint, that means no table constraints should have been found yet and it is a normal column
+ definition.
+
+ """
+
+ # Check if table constraint has not been found yet (previously or on this iteration)
+ if not table_constraints_found:
+
+ """
+
+ This definition is a column definition.
+
+ Make sure the index was not incremented since no table constraint was made.
+
+ """
+
+ # Make sure the definition index has not changed
+ if current_definition_index != definition_index:
+ log_message = "The definition index: {} was updated indicating a table constraint was " \
+ "made when it should be: {} for a column definition in table row with " \
+ "name: {} and sql: {}."
+ log_message = log_message.format(definition_index, current_definition_index,
+ self.name, self.sql)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ # Create the column definition
+ self.column_definitions.append(ColumnDefinition(definition_index, definition,
+ column_definition_comments))
+
+ # Reinitialize the comments to the next segments columns
+ column_definition_comments = []
+
+ # Increment the definition index
+ definition_index += 1
+
+ # Make sure the table constraint was made
+ else:
+
+ """
+
+ This definition is a table constraint.
+
+ Make sure the index was incremented since the table constraint was made.
+
+ """
+
+ # Check that the definition index was incremented meaning a table constraint was made
+ if current_definition_index + 1 != definition_index:
+ log_message = "The definition index: {} was not updated indicating a column definition was " \
+ "made when it should be: {} for a table constraint in table row with " \
+ "name: {} and sql: {}."
+ log_message = log_message.format(definition_index, current_definition_index + 1,
+ self.name, self.sql)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ # Update the beginning definition and character indexes
+ character_index += ending_comments_length + 1
+ beginning_definition_index = character_index
+
+ # The character is just a normal character
+ else:
+
+ # Increment the character index
+ character_index += 1
+
+ """
+
+ Lastly, if there is remaining SQL, we check to make sure it is the "without rowid" statement. If it is not,
+ then an exception will be thrown since that is the only use case allowed here according to the SQLite
+ documentation.
+
+ """
+
+ # Last get the remaining sql command to check for the "without rowid" use case
+ remaining_sql_command = remaining_sql_command[closing_parenthesis_index + 1:].lstrip()
+
+ # See if the remaining sql command has any content left
+ if len(remaining_sql_command) != 0:
+
+ """
+
+ Note: Below we check for comments before, in between and after the "without rowid" statement. We only
+ check for comments assuming we have the "without rowid" specified. This is due to the fact that
+ if the "without rowid" is not specified, any comments following the end of the column definitions
+ are ignored in the create table statement by SQLite. Only when "without rowid" is specified are
+ comments recognized.
+
+ """
+
+ # Check for comments after the end of the column definitions before the "without rowid"
+ while remaining_sql_command.startswith(("--", "/*")):
+ comment, remaining_sql_command = parse_comment_from_sql_segment(remaining_sql_command)
+ self.comments.append(comment.rstrip())
+ remaining_sql_command = remaining_sql_command.lstrip()
+
+ # If there is content left, check if it is the "without rowid" string by seeing if it starts with "without"
+ if remaining_sql_command.upper().startswith("WITHOUT"):
+
+ remaining_sql_command = remaining_sql_command[len("WITHOUT"):].lstrip()
+
+ # Check for comments after the end of the column definitions before the "without rowid"
+ while remaining_sql_command.startswith(("--", "/*")):
+ comment, remaining_sql_command = parse_comment_from_sql_segment(remaining_sql_command)
+ self.comments.append(comment.rstrip())
+ remaining_sql_command = remaining_sql_command.lstrip()
+
+ if remaining_sql_command.upper().startswith("ROWID"):
+
+ remaining_sql_command = remaining_sql_command[len("ROWID"):].lstrip()
+
+ # Set the without row id variable to true
+ self.without_row_id = True
+
+ # Check for comments at the end
+ while remaining_sql_command.startswith(("--", "/*")):
+ comment, remaining_sql_command = parse_comment_from_sql_segment(remaining_sql_command)
+ self.comments.append(comment.rstrip())
+ remaining_sql_command = remaining_sql_command.lstrip()
+
+ # Make sure we are at the end
+ if len(remaining_sql_command) != 0:
+ log_message = "Invalid sql ending: {} found when nothing more expected in " \
+ "table row with name: {} and sql: {}."
+ log_message = log_message.format(remaining_sql_command, self.name, self.sql)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ else:
+ log_message = "Invalid sql ending: {} found after \"WITHOUT\" when \"ROWID\" expected in " \
+ "table row with name: {} and sql: {}."
+ log_message = log_message.format(remaining_sql_command, self.name, self.sql)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ # The remaining string is not the "without rowid" string which, according to sqlite documentation,
+ # should not occur
+ else:
+ log_message = "Invalid sql ending: {} found in table row with name: {} and sql: {}."
+ log_message = log_message.format(remaining_sql_command, self.name, self.sql)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ """
+
+ Until the "without rowid" is fully implemented, we will throw a warning here. Tables without a row id have
+ all of their data stored in index b-tree pages rather than table b-tree pages. Also, the ordering of the
+ columns are switched around depending on what field(s) the primary key is comprised of and where those fields
+ are in the column definitions.
+
+ """
+
+ if self.without_row_id:
+ log_message = "A table specified without a row id was found in table row with name: {} and sql: {}. " \
+ "This use case is not fully implemented."
+ log_message = log_message.format(self.name, self.sql)
+ logger.warn(log_message)
+ warn(log_message, RuntimeWarning)
+
+ def stringify(self, padding="", print_record_columns=True,
+ print_column_definitions=True, print_table_constraints=True):
+ string = "\n" \
+ + padding + "Without Row ID: {}\n" \
+ + padding + "Internal Schema Object: {}\n" \
+ + padding + "Column Definitions Length: {}\n" \
+ + padding + "Table Constraints Length: {}"
+ string = string.format(self.without_row_id,
+ self.internal_schema_object,
+ len(self.column_definitions),
+ len(self.table_constraints))
+ string = super(OrdinaryTableRow, self).stringify(padding, print_record_columns) + string
+ if print_column_definitions:
+ for column_definition in self.column_definitions:
+ string += "\n" \
+ + padding + "Column Definition:\n{}".format(column_definition.stringify(padding + "\t"))
+ if print_table_constraints:
+ for table_constraint in self.table_constraints:
+ string += "\n" \
+ + padding + "Table Constraint:\n{}".format(table_constraint.stringify(padding + "\t"))
+ return string
+
+
+class VirtualTableRow(TableRow):
+
+ def __init__(self, version, b_tree_table_leaf_page_number, b_tree_table_leaf_cell, record_columns):
+
+ # Call the superclass to initialize this object
+ super(VirtualTableRow, self).__init__(version, b_tree_table_leaf_page_number,
+ b_tree_table_leaf_cell, record_columns)
+
+ # Initialize the logger
+ logger = getLogger(LOGGER_NAME)
+
+ # Make sure this is a create virtual table statement
+ if not self.sql.startswith(CREATE_VIRTUAL_TABLE_CLAUSE):
+ log_message = "Invalid sql for create virtual table statement: {} with name: {}."
+ log_message = log_message.format(self.sql, self.name)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ """
+
+ Note: The "without rowid" option can not be used in virtual tables.
+
+ Note: Virtual tables do not have any "internal schema objects".
+
+ """
+
+ # Retrieve the sql command to this table and replace all multiple spaces with a single space
+ sql_command = sub("[\t\r\f\v ][\t\r\f\v ]+", " ", self.sql)
+
+ # Set the create command offset to point to the end of the "create virtual table" statement
+ create_command_offset = len(CREATE_VIRTUAL_TABLE_CLAUSE)
+
+ """
+
+ We take off the "create virtual table" beginning portion of the command here leaving the table name followed by
+ the "using" statement and then the module arguments.
+
+ Note: The schema names are never included in the statements themselves since they just redirect which file
+ the data will be stored in. Schemas act more as file handles to open sqlite files in the driver.
+
+ """
+
+ # Left strip the "create table" command from the beginning of the create table statement removing any whitespace
+ remaining_sql_command = str(sql_command[create_command_offset:]).lstrip()
+
+ """
+
+ We now parse through the remaining SQL command to find the table name. Once we find the table name and set it,
+ we remove the table name from the remaining SQL command.
+
+ Note: The table and/or column names may be in single or double quotes. For example, quotes need to be used
+ if a table name has spaces. This is only seen in the SQL statement. These quotes are removed in the
+ name and table name fields.
+
+ Note: It was observed that there may be or may not be a space between the table name and opening parenthesis.
+
+ Note: The above was noticed when one of the sequence tables automatically created by SQLite in some use cases
+ was parsed. The following tables are examples of this in the documentation:
+ 1.) CREATE TABLE sqlite_sequence(name,seq);
+ 2.) CREATE TABLE sqlite_stat1(tbl,idx,stat);
+ 3.) CREATE TABLE sqlite_stat2(tbl,idx,sampleno,sample)
+ 4.) CREATE TABLE sqlite_stat3(tbl,idx,nEq,nLt,nDLt,sample)
+ 5.) CREATE TABLE sqlite_stat4(tbl,idx,nEq,nLt,nDLt,sample);
+
+ These use cases are "internal schema objects", and any master schema objects with a name beginning
+ with "sqlite_" are these types of objects. The prefix "sqlite_" used in the name of SQLite master schema
+ rows is reserved for use by SQLite.
+
+ Note: There is no current use case of having "internal schema objects" for virtual tables and therefore
+ no virtual table name will start with "sqlite_".
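+
+        Note: A minimal sketch of the quoted/unquoted name handling described above is shown here for illustration
+              only (this is not the helper used below, and escaped quotes inside names are ignored):
+
+                  import re
+                  remaining = '"my table" USING fts4(content)'
+                  if remaining[0] in ("'", '"'):
+                      quote = remaining[0]
+                      closing = remaining.index(quote, 1)
+                      table_name = remaining[1:closing]
+                      remaining = remaining[closing + 1:]
+                  else:
+                      table_name = re.match(r"[^\s(]+", remaining).group(0)
+                      remaining = remaining[len(table_name):]
+
+              The result would be table_name == "my table" and remaining == " USING fts4(content)".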
+
+ """
+
+ # Retrieve the table name and remaining sql after the table name is removed
+ table_name, remaining_sql_command = \
+ MasterSchemaRow._get_master_schema_row_name_and_remaining_sql(self.row_type, self.name, self.sql,
+ remaining_sql_command)
+
+ # Left strip the remaining sql command
+ remaining_sql_command = remaining_sql_command.lstrip()
+
+ # Check the table name is equal to the name as specified in the sqlite documentation
+ if table_name.lower() != self.name.lower():
+ log_message = "For virtual table master schema row: {}, the derived table name: {} from the sql: {} " \
+                          "does not match the name: {}."
+ log_message = log_message.format(self.row_id, table_name, self.sql, self.name)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ # Check the table name is equal to the table name as specified in the sqlite documentation
+ if table_name.lower() != self.table_name.lower():
+ log_message = "For virtual table master schema row: {}, the derived table name: {} from the sql: {} " \
+                          "does not match the table name: {}."
+ log_message = log_message.format(self.row_id, table_name, self.sql, self.table_name)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ """
+
+        Check the virtual table name to see if it is an internal schema object starting with "sqlite_". Since this
+        is not expected for a virtual table, an error will be raised if detected.
+
+ """
+
+ if self.table_name.startswith(INTERNAL_SCHEMA_OBJECT_PREFIX):
+ log_message = "Master schema virtual table row found as internal schema object with name: {}, " \
+ "table name: {} and sql: {} which should not occur."
+ log_message = log_message.format(self.name, self.table_name, self.sql)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ """
+
+        The remaining SQL command must now start with either the "using" keyword, which may be mixed-case, or a
+        comment indicator. Comment indicators are either the "--" or "/*" character sequences.
+
+ """
+
+ # Check for comments after the virtual table name, before the using clause
+ while remaining_sql_command.startswith(("--", "/*")):
+ comment, remaining_sql_command = parse_comment_from_sql_segment(remaining_sql_command)
+ self.comments.append(comment.rstrip())
+ remaining_sql_command = remaining_sql_command.lstrip()
+
+ # Declare the module arguments
+ self.module_arguments = []
+
+        # Check if this remaining sql statement starts with "USING"
+ if remaining_sql_command[:len(VIRTUAL_TABLE_USING_CLAUSE)].upper() != VIRTUAL_TABLE_USING_CLAUSE:
+ log_message = "Create virtual table statement does not have a \"USING\" clause for master schema " \
+ "table row with name: {} and sql: {}."
+ log_message = log_message.format(self.name, self.sql)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ # Remove the using prefix and left strip any whitespace
+ remaining_sql_command = remaining_sql_command[len(VIRTUAL_TABLE_USING_CLAUSE):].lstrip()
+
+ # Check for comments after the using clause, before the module name
+ while remaining_sql_command.startswith(("--", "/*")):
+ comment, remaining_sql_command = parse_comment_from_sql_segment(remaining_sql_command)
+ self.comments.append(comment.rstrip())
+ remaining_sql_command = remaining_sql_command.lstrip()
+
+ # Retrieve the module name and remaining sql after the module name is removed
+ self.module_name, remaining_sql_command = TableRow._get_module_name_and_remaining_sql(self.name, self.sql,
+ remaining_sql_command)
+
+ # Left strip the remaining sql command
+ remaining_sql_command = remaining_sql_command.lstrip()
+
+ # Check for comments after the module name, before the module arguments
+ while remaining_sql_command.startswith(("--", "/*")):
+ comment, remaining_sql_command = parse_comment_from_sql_segment(remaining_sql_command)
+ self.comments.append(comment.rstrip())
+ remaining_sql_command = remaining_sql_command.lstrip()
+
+ """
+
+ At this point the remaining portion of the SQL command should be in the form of "( module-argument, ... )".
+
+ """
+
+ # The first thing is to get the closing parenthesis index to the module arguments
+ closing_parenthesis_index = get_index_of_closing_parenthesis(remaining_sql_command)
+
+ # Declare the arguments to be the "(...)" section
+ arguments = remaining_sql_command[:closing_parenthesis_index + 1]
+
+ # Double check the module arguments has a beginning opening parenthesis and ends with a closing parenthesis
+ if arguments.find("(") != 0 or arguments.rfind(")") != len(arguments) - 1:
+            log_message = "The arguments are not surrounded by parenthesis as expected for table row with " \
+                          "name: {} and sql: {} with arguments: {}."
+ log_message = log_message.format(self.name, self.sql, arguments)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ # Remove the beginning and ending parenthesis and left strip the string in case single whitespace characters
+        # appear directly after the opening parenthesis and set it back to the arguments.  The characters before
+ # the ending parenthesis are allowed since there could be a "\n" character corresponding to a "--" comment.
+
+ """
+
+        The next step here would be to strip the arguments of the parentheses and continue parsing the module
+        arguments:
+ arguments = arguments[1:len(arguments) - 1].lstrip()
+ ...
+
+ Support for virtual table modules and module arguments is not yet implemented.
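+
+        A minimal sketch of how the module arguments could eventually be split on top-level commas is shown below
+        for illustration only (a hypothetical helper, not implemented in this library):
+
+            def split_module_arguments(arguments):
+                # The arguments string excludes the surrounding parentheses, for example: "content, tokenize=porter"
+                parts, depth, current = [], 0, ""
+                for character in arguments:
+                    if character == "(":
+                        depth += 1
+                    elif character == ")":
+                        depth -= 1
+                    if character == "," and depth == 0:
+                        parts.append(current.strip())
+                        current = ""
+                    else:
+                        current += character
+                if current.strip():
+                    parts.append(current.strip())
+                return parts
+
+            split_module_arguments("content, tokenize=porter")    # ["content", "tokenize=porter"]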
+
+ """
+
+ """
+
+ At this point we have the SQL down to the remaining module arguments. Since the module arguments are different
+ depending on the module, many use cases will need to be investigated and addressed. For now a warning is
+ thrown that a virtual table was found.
+
+ """
+
+ log_message = "Virtual table name: {} was found with module name: {} and sql: {}. Virtual table modules are " \
+ "not fully implemented."
+ log_message = log_message.format(self.name, self.module_name, self.sql)
+ logger.warn(log_message)
+ warn(log_message, RuntimeWarning)
+
+ """
+
+ The last thing to do is make sure there is nothing remaining in the SQL after the closing parenthesis of the
+ module arguments.
+
+        Note: Similar to the create table statement, any comments placed after the module name (when there are no
+              module arguments) or after the module arguments are ignored by SQLite.
+
+ """
+
+        # Last, get the remaining sql command to check for additional content after the module arguments
+ remaining_sql_command = remaining_sql_command[closing_parenthesis_index + 1:].lstrip()
+
+ # See if the remaining sql command has any content left
+ if len(remaining_sql_command) != 0:
+            log_message = "Additional content found in virtual table sql after module arguments in table row " \
+                          "with name: {} found with module name: {} and sql: {}."
+ log_message = log_message.format(self.name, self.module_name, self.sql)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ def stringify(self, padding="", print_record_columns=True, print_module_arguments=True):
+ string = "\n" \
+ + padding + "Module Name: {}\n" \
+ + padding + "Module Arguments Length: {}"
+ string = string.format(self.module_name,
+ len(self.module_arguments))
+ string = super(VirtualTableRow, self).stringify(padding, print_record_columns) + string
+ if print_module_arguments:
+ for module_argument in self.module_arguments:
+ string += "\n" \
+ + padding + "Module Argument:\n{}".format(module_argument.stringify(padding + "\t"))
+ return string
+
+
+class IndexRow(MasterSchemaRow):
+
+ def __init__(self, version_interface, b_tree_table_leaf_page_number,
+ b_tree_table_leaf_cell, record_columns, tables):
+
+ super(IndexRow, self).__init__(version_interface, b_tree_table_leaf_page_number,
+ b_tree_table_leaf_cell, record_columns)
+
+ # Initialize the logger
+ logger = getLogger(LOGGER_NAME)
+
+        # Make sure this is the index row type after being initialized by its superclass
+ if self.row_type != MASTER_SCHEMA_ROW_TYPE.INDEX:
+ log_message = "Invalid row type: {} when expecting: {} with name: {}."
+ log_message = log_message.format(self.row_type, MASTER_SCHEMA_ROW_TYPE.INDEX, self.name)
+ logger.error(log_message)
+ raise ValueError(log_message)
+
+ """
+
+ Three boolean fields are declared below:
+
+        1.) internal_schema_object:
+        An internal schema object is an index created implicitly by SQLite through the create table statement,
+        such as for a primary key or unique constraint.
+
+        2.) unique:
+        If the index is not an internal schema object, then it is either a regular index or a unique index. A unique
+        index only enforces that duplicate entries are not allowed.
+
+        Note: NULL values are considered unique to each other in SQLite, therefore there may be multiple NULL values
+              in any index including unique indexes.
+
+        3.) partial_index:
+        An index with a WHERE clause is a partial index. In ordinary indexes, there is exactly one entry in the
+        index for every row in the table, but in partial indexes only some subset of the rows in the table have
+        corresponding index entries. For example, a WHERE clause requiring a column to be non-null results in an
+        index that only covers the rows where that column is non-null.
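+
+        For illustration, assuming a simple (hypothetical) schema, the statements below map to the three fields:
+
+            CREATE TABLE example (id INTEGER PRIMARY KEY, email TEXT UNIQUE, note TEXT);
+            -- the UNIQUE constraint creates the internal schema object "sqlite_autoindex_example_1"
+            CREATE UNIQUE INDEX example_email_index ON example (email);                     -- unique
+            CREATE INDEX example_note_index ON example (note) WHERE note IS NOT NULL;       -- partial_index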
+
+ """
+
+ self.internal_schema_object = False
+ self.unique = False
+ self.partial_index = False
+
+ # Check if this index is an internal schema object
+ if self.name.startswith(INTERNAL_SCHEMA_OBJECT_PREFIX):
+ self.internal_schema_object = True
+
+ """
+
+ Note: Currently the only internal schema objects for indexes begin with "sqlite_autoindex_" according
+ to SQLite documentation from version 3.9.2. Therefore, if any index starts with "sqlite_" but
+ without the following "autoindex_" portion, an error will be raised.
+
+ """
+
+ if self.internal_schema_object and not self.name.startswith(INTERNAL_SCHEMA_OBJECT_INDEX_PREFIX):
+ log_message = "Internal schema object detected but invalid prefix for index row with name: {}."
+ log_message = log_message.format(self.name)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ """
+
+ If this index is an internal schema object index, then it will have no SQL.
+
+ """
+
+ if self.internal_schema_object and self.sql:
+ log_message = "Internal schema object detected for index row with name: {} but found sql: {}."
+ log_message = log_message.format(self.name, self.sql)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ if not self.internal_schema_object and not self.sql:
+ log_message = "Index row with name: {} found with no sql and is not an internal schema object."
+ log_message = log_message.format(self.name)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ # Make sure the table name this index refers to is in the tables and retrieve that table row.
+ if self.table_name not in tables:
+            log_message = "Index row with name: {} and table name: {} has no correlating table in the tables."
+ log_message = log_message.format(self.name, self.table_name)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ table_row = tables[self.table_name]
+
+ if table_row.without_row_id:
+ log_message = "Index row with name: {} and table name: {} was found to rely on a table without a row id."
+ log_message = log_message.format(self.name, self.table_name)
+ logger.warn(log_message)
+ warn(log_message, RuntimeWarning)
+
+ """
+
+        Since internal schema objects do not have SQL, we need to handle them differently. Internal schema objects
+        need to have their names parsed rather than their SQL. For index internal schema objects, the names are of
+        the form "sqlite_autoindex_TABLE_N" where TABLE is the table name they refer to (this should also match the
+        table name field) and N is the index of the primary key or unique constraint as defined in the schema.
+
+ Note: The INTEGER PRIMARY KEY does not get an index. For older versions of SQLite it would get a
+ "sqlite_sequence" table created for it if it did not already exist, but this is no longer done unless
+ the AUTOINCREMENT clause is added which is not recommended per the SQLite documentation. However,
+              it has been noticed that there are cases where there may be an INTEGER PRIMARY KEY UNIQUE clause on a
+ column which would cause a unique index internal schema object to be made. This could be confusing
+ since the naming nomenclature would be the same for either primary key or unique and may at first
+ appear to be a created primary key index internal schema object.
+
+        Note: Index internal schema objects are created as side effects of create table statements. An index internal
+              schema object cannot be created outside the create table statement.
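+
+        A minimal sketch of how such a name could be split apart is shown here for illustration only (assuming the
+        documented "sqlite_autoindex_TABLE_N" form; this parsing is not performed below):
+
+            name = "sqlite_autoindex_example_1"
+            remainder = name[len("sqlite_autoindex_"):]                    # "example_1"
+            referenced_table, _, constraint_index = remainder.rpartition("_")
+
+        The result would be referenced_table == "example" and int(constraint_index) == 1.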
+
+ """
+
+ if self.internal_schema_object:
+
+ """
+
+ Note: An index internal schema object will not be a partial index but may be unique depending on the
+ clause that created it from the create table statement.
+
+ """
+
+ """
+
+ Until the index internal schema objects are fully implemented, we will throw a warning here. The index
+ internal schema objects are only made on primary key or unique constraints created in the table according
+            to current documentation as of SQLite 3.9.2. These names are in the form of "sqlite_autoindex_TABLE_N"
+ where TABLE is the table name the auto index belongs to (which should also be mirrored in the table name)
+ and N is the counter for where it appears in the create statement.
+
+ """
+
+            log_message = "An index internal schema object was found in index row with name: {} " \
+ "and sql: {}. This is not fully implemented and may cause issues with index pages."
+ log_message = log_message.format(self.name, self.sql)
+ logger.warn(log_message)
+ warn(log_message, RuntimeWarning)
+
+ else:
+
+ # Retrieve the sql command to this table and replace all multiple spaces with a single space
+ sql_command = sub("[\t\r\f\v ][\t\r\f\v ]+", " ", self.sql)
+
+ """
+
+ At the beginning of the create index statement there can be two use cases to account for:
+ 1.) CREATE INDEX [INDEX_NAME] ...
+ 2.) CREATE UNIQUE INDEX [INDEX_NAME] ...
+
+            The spacing and capitalization will always match one of the two create [...] index statements above due
+ to the way SQLite works with the SQL. (Also, see documentation in the MasterSchemaRow class.)
+
+            The unique keyword only means that the index is unique and no two entries in the index may be
+            equivalent. Keep in mind that NULL values are considered unique to each other in SQLite. This use case does
+ not concern us since we are merely parsing the data, creating signatures, carving data, etc. We are not
+ adding to the index here and therefore this is nothing more than informative for us. However, it may be
+ helpful to keep in mind in the future for trying to rebuild carved entries in some way.
+
+ """
+
+ if sql_command.startswith(CREATE_INDEX_CLAUSE):
+
+ # Set the create command offset to point to the end of the "create index" statement
+ create_command_offset = len(CREATE_INDEX_CLAUSE)
+
+ elif sql_command.startswith(CREATE_UNIQUE_INDEX_CLAUSE):
+
+ self.unique = True
+
+ # Set the create command offset to point to the end of the "create unique index" statement
+ create_command_offset = len(CREATE_UNIQUE_INDEX_CLAUSE)
+
+ else:
+ log_message = "Invalid sql for create index statement: {} with name: {}."
+ log_message = log_message.format(self.sql, self.name)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ if not create_command_offset:
+ log_message = "The create command offset was not set while parsing sql for index row name: {} " \
+ "and sql: {}."
+ log_message = log_message.format(self.name, self.sql)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ """
+
+ We take off the "create [unique] index" beginning portion of the command here leaving the index name next.
+ At this point we have the create index command in the following structure:
+
+ [INDEX_NAME] ON [TABLE_NAME] ( [INDEXED_COLUMN], ... ) [WHERE [EXPR]]
+
+ Note: An INDEXED_COLUMN (specified above) can be either a column-name or expr that may be followed by
+ either a COLLATE command or ASC/DESC command (or both).
+
+ Note: Capitalization of commands does not matter and checks on exact string commands need to take into
+ account case insensitivity.
+
+ Note: Following the index name, comments may appear from that point after in the index SQL.
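+
+            For example, a (hypothetical) statement of this structure would be:
+
+                CREATE INDEX example_note_index ON example (note COLLATE NOCASE DESC) WHERE note IS NOT NULL
+
+            Here the index name is "example_note_index", the table name is "example", the single indexed column is
+            "note COLLATE NOCASE DESC", and the trailing "WHERE" clause makes this a partial index.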
+
+ """
+
+ # Strip off the "create [unique] index" command from the beginning of the create index statement
+ remaining_sql_command = str(sql_command[create_command_offset + 1:])
+
+ # Get the index name and remaining sql
+ index_name, remaining_sql_command = \
+ MasterSchemaRow._get_master_schema_row_name_and_remaining_sql(self.row_type, self.name,
+ self.sql, remaining_sql_command)
+
+ # Left strip the remaining sql command
+ remaining_sql_command = remaining_sql_command.lstrip()
+
+ # Check if this remaining sql statement starts with "ON"
+ if remaining_sql_command[:len(INDEX_ON_COMMAND)].upper() != INDEX_ON_COMMAND:
+ log_message = "Create index statement does not have a \"ON\" clause for master schema " \
+ "index row with name: {} and sql: {}."
+ log_message = log_message.format(self.name, self.sql)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+            # Remove the "on" prefix and strip any whitespace from the beginning
+ remaining_sql_command = remaining_sql_command[len(INDEX_ON_COMMAND):].lstrip()
+
+ # Get the table name and remaining sql
+ table_name, remaining_sql_command = \
+ MasterSchemaRow._get_master_schema_row_name_and_remaining_sql(self.row_type, self.name,
+ self.sql, remaining_sql_command)
+
+ # Left strip the remaining sql command
+ remaining_sql_command = remaining_sql_command.lstrip()
+
+ # Check the index name is equal to the name as specified in the sqlite documentation
+ if index_name.lower() != self.name.lower():
+                log_message = "For index master schema row: {}, the index name: {} does not match the derived " \
+                              "index name: {} from the sql: {}."
+ log_message = log_message.format(self.row_id, self.name, index_name, self.sql)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ # Check the table name is equal to the index table name as specified in the sqlite documentation
+ if table_name.lower() != self.table_name.lower():
+ log_message = "For index master schema row: {}, the table name: {} does not match the derived table " \
+ "name: {} from the sql: {}."
+ log_message = log_message.format(self.row_id, self.table_name, table_name, self.sql)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ """
+
+ Note: Since we already checked above that the table name was in the table master schema entries sent in,
+ we do not check again here.
+
+ """
+
+ """
+
+ The remaining SQL command must now either start with an opening parenthesis "(", or a comment indicator.
+ Comment indicators would be either the "--" or "/*" character sequences.
+
+ """
+
+ # Check for comments after the index name, before the indexed columns
+ while remaining_sql_command.startswith(("--", "/*")):
+ comment, remaining_sql_command = parse_comment_from_sql_segment(remaining_sql_command)
+ self.comments.append(comment.rstrip())
+ remaining_sql_command = remaining_sql_command.lstrip()
+
+ # The first thing to be done is get the closing parenthesis index to the indexed columns
+ closing_parenthesis_index = get_index_of_closing_parenthesis(remaining_sql_command)
+
+ # Declare the indexed columns to be the "( [INDEXED_COLUMN], ... )" explained above
+ indexed_columns = remaining_sql_command[:closing_parenthesis_index + 1]
+
+ # Double check the indexed columns has a beginning opening parenthesis and ends with a closing parenthesis.
+ if indexed_columns.find("(") != 0 or indexed_columns.rfind(")") != len(indexed_columns) - 1:
+                log_message = "The indexed columns are not surrounded by parenthesis as expected for index row " \
+                              "with name: {} and sql: {} with definitions: {}."
+ log_message = log_message.format(self.name, self.sql, indexed_columns)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ # Remove the beginning and ending parenthesis and left strip the string in case single whitespace characters
+ # appear directly after the opening parenthesis and set it back to the index columns. The characters before
+ # the ending parenthesis are allowed since there could be a "\n" character corresponding to a "--" comment.
+
+ """
+
+ The next step here is to parse and left strip the indexed columns and continue parsing:
+ indexed_columns = indexed_columns[1:len(indexed_columns) - 1].lstrip()
+ ...
+
+ Support for indexed columns has not been implemented yet.
+
+ """
+
+ """
+
+ Lastly, if there is remaining SQL, we check to make sure it is a "WHERE" statement. If it is not,
+ then an exception will be thrown since that is the only use case allowed here according to the SQLite
+ documentation.
+
+ """
+
+ # Last get the remaining sql command to check for the "where" use case
+ remaining_sql_command = remaining_sql_command[closing_parenthesis_index + 1:].lstrip()
+
+ """
+
+ The create index statements work differently than the create table statements in respect to comments
+ and the clauses after the column definitions/indexed columns. In a create table statement, any comments
+ after the end of the column definitions is ignored by SQLite unless the "without rowid" clause is
+ stated which then recognizes comments before, in between, and after the clause.
+
+ For create index statements, comments are not ignored by SQLite no matter if the "where" clause
+ is specified after the indexed columns or not. Therefore, if the remaining SQL command has any
+ more content, it may either be a comment, a "where" clause, or both.
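+
+            For example, the comments in both of these (hypothetical) statements are retained by SQLite:
+
+                CREATE INDEX i1 ON example (note) -- trailing comment
+                CREATE INDEX i2 ON example (note) /* comment */ WHERE note IS NOT NULL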
+
+ """
+
+ # Check for comments after the end of the index columns
+ while remaining_sql_command.startswith(("--", "/*")):
+ comment, remaining_sql_command = parse_comment_from_sql_segment(remaining_sql_command)
+ self.comments.append(comment.rstrip())
+ remaining_sql_command = remaining_sql_command.lstrip()
+
+ # See if the remaining sql command has any content left
+ if len(remaining_sql_command) != 0:
+
+ """
+
+ Since we removed any previous comments above, if we still have content at this point, we know that the
+ only allowed use case in this scenario is to have the "where" statement next in the SQL.
+
+ Note: The "where" clause may be mixed-case.
+
+ """
+
+ # Check if this remaining sql statement starts with "WHERE"
+ if remaining_sql_command[:len(INDEX_WHERE_CLAUSE)].upper() != INDEX_WHERE_CLAUSE:
+                    log_message = "Create index statement does not have a \"WHERE\" clause for master schema " \
+ "index row with name: {} and sql: {} when expected."
+ log_message = log_message.format(self.name, self.sql)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+                # Set the partial index flag
+ self.partial_index = True
+
+ # Check for comments after the where clause
+ while remaining_sql_command.startswith(("--", "/*")):
+ comment, remaining_sql_command = parse_comment_from_sql_segment(remaining_sql_command)
+ self.comments.append(comment.rstrip())
+ remaining_sql_command = remaining_sql_command.lstrip()
+
+ """
+
+ The next step here is to parse the "WHERE" clause:
+ remaining_sql_command = remaining_sql_command[len(INDEX_WHERE_CLAUSE):].lstrip()
+ ...
+
+ Support for partial indexes has not been implemented yet.
+
+ """
+
+ """
+
+ Until the partial index is fully implemented, we will throw a warning here. Partial indexes are only
+ made on a subset of rows depending on the "WHERE" clause which would need to be parsed to be exact.
+
+ """
+
+ if self.partial_index:
+                log_message = "An index specified as a partial index was found in index row with name: {} " \
+ "and sql: {}. This use case is not fully implemented."
+ log_message = log_message.format(self.name, self.sql)
+ logger.warn(log_message)
+ warn(log_message, RuntimeWarning)
+
+ def stringify(self, padding="", print_record_columns=True):
+ string = "\n" \
+ + padding + "Internal Schema Object: {}\n" \
+ + padding + "Unique: {}\n" \
+ + padding + "Partial Index: {}"
+ string = string.format(self.internal_schema_object,
+ self.unique,
+ self.partial_index)
+ string = super(IndexRow, self).stringify(padding, print_record_columns) + string
+ return string
+
+
+class ViewRow(MasterSchemaRow):
+
+ def __init__(self, version_interface, b_tree_table_leaf_page_number,
+ b_tree_table_leaf_cell, record_columns, tables):
+
+ super(ViewRow, self).__init__(version_interface, b_tree_table_leaf_page_number,
+ b_tree_table_leaf_cell, record_columns)
+
+ logger = getLogger(LOGGER_NAME)
+
+ if self.row_type != MASTER_SCHEMA_ROW_TYPE.VIEW:
+ log_message = "Invalid row type: {} when expecting: {} with name: {}."
+ log_message = log_message.format(self.row_type, MASTER_SCHEMA_ROW_TYPE.VIEW, self.name)
+ logger.error(log_message)
+ raise ValueError(log_message)
+
+
+class TriggerRow(MasterSchemaRow):
+
+ def __init__(self, version_interface, b_tree_table_leaf_page_number,
+ b_tree_table_leaf_cell, record_columns, tables, views):
+
+ super(TriggerRow, self).__init__(version_interface, b_tree_table_leaf_page_number,
+ b_tree_table_leaf_cell, record_columns)
+
+ logger = getLogger(LOGGER_NAME)
+
+ if self.row_type != MASTER_SCHEMA_ROW_TYPE.TRIGGER:
+ log_message = "Invalid row type: {} when expecting: {} with name: {}."
+ log_message = log_message.format(self.row_type, MASTER_SCHEMA_ROW_TYPE.TRIGGER, self.name)
+ logger.error(log_message)
+ raise ValueError(log_message)
diff --git a/sqlite_dissect/file/schema/table.py b/sqlite_dissect/file/schema/table.py
new file mode 100644
index 0000000..ba5b1c1
--- /dev/null
+++ b/sqlite_dissect/file/schema/table.py
@@ -0,0 +1,47 @@
+from logging import getLogger
+from re import sub
+from sqlite_dissect.constants import LOGGER_NAME
+from sqlite_dissect.exception import MasterSchemaRowParsingError
+
+"""
+
+table.py
+
+This script holds the objects needed for parsing table related objects to the master schema.
+
+This script holds the following object(s):
+TableConstraint(object)
+
+"""
+
+
+class TableConstraint(object):
+
+ def __init__(self, index, constraint, comments=None):
+
+ logger = getLogger(LOGGER_NAME)
+
+ self.index = index
+ self.constraint = constraint
+
+ if comments:
+ for comment in comments:
+                if not comment.startswith(("--", "/*")):
+ log_message = "Comment specified does not start with the schema comment prefix: {}.".format(comment)
+ logger.error(log_message)
+ raise MasterSchemaRowParsingError(log_message)
+
+ self.comments = [comment.strip() for comment in comments] if comments else []
+
+ def __repr__(self):
+ return self.__str__().encode("hex")
+
+ def __str__(self):
+ return sub("\t", "", sub("\n", " ", self.stringify()))
+
+ def stringify(self, padding=""):
+ string = padding + "Index: {}\n" \
+ + padding + "Constraint: {}"
+ for comment in self.comments:
+ string += "\n" + padding + "Comment: {}".format(comment)
+ return string.format(self.index, self.constraint)
diff --git a/sqlite_dissect/file/schema/utilities.py b/sqlite_dissect/file/schema/utilities.py
new file mode 100644
index 0000000..78d1401
--- /dev/null
+++ b/sqlite_dissect/file/schema/utilities.py
@@ -0,0 +1,187 @@
+from logging import getLogger
+from sqlite_dissect.constants import LOGGER_NAME
+from sqlite_dissect.exception import MasterSchemaParsingError
+
+"""
+
+utilities.py
+
+This script holds utility functions for dealing with schema specific objects such as parsing comments from sql rather
+than more general utility methods.
+
+This script holds the following function(s):
+get_index_of_closing_parenthesis(string, opening_parenthesis_offset=0)
+parse_comment_from_sql_segment(sql_segment)
+
+"""
+
+
+def get_index_of_closing_parenthesis(string, opening_parenthesis_offset=0):
+
+ """
+
+    This function returns the index of the closing parenthesis that matches the opening parenthesis found at the
+    opening parenthesis offset in the string.
+
+    Note: Comments are skipped.
+
+ Note: The string to find the index of the closing parenthesis in requires there to be the opening parenthesis
+ at the index of the opening parenthesis offset. This can be 0 by default representing the opening
+ parenthesis at the beginning of the string or a specified index.
+
+ :param string: str The string to find the index of the closing parenthesis in.
+ :param opening_parenthesis_offset: int The index of the first opening parenthesis.
+
+    :return: int  The index of the matching closing parenthesis in the string.
+
+    :raise: ValueError or MasterSchemaParsingError in the case the string cannot be parsed as expected.
+
+ """
+
+ logger = getLogger(LOGGER_NAME)
+
+ if string[opening_parenthesis_offset] != "(":
+ log_message = "The opening parenthesis offset specifies a \"{}\" character and not the " \
+ "expected \"(\" in {} with opening parenthesis offset: {}."
+ log_message = log_message.format(string[opening_parenthesis_offset], string, opening_parenthesis_offset)
+ logger.error(log_message)
+ raise ValueError(log_message)
+
+ """
+
+    We need to find the matching ")" character to the opening parenthesis specified by the offset. To
+ do this we search looking for the ")" character but skip one for every matching "(" we find from the
+ first occurrence.
+
+ We also have to skip all comments indicated by "--" and "/*" and terminated by "\n" and "*/" respectively.
+ In order to skip comments, we have to flag when we are in a comment. In the case that we find:
+ 1.) "--" comment: We set the comment_indicator field to 1 and back to 0 once the "\n" is found
+ 2.) "/*" comment: We set the comment_indicator field to 2 and back to 0 once the "*/" is found
+
+ Note: If we are in a comment already, we ignore other comment indicators.
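+
+    For example (illustrative only):
+
+        get_index_of_closing_parenthesis("(a, b(c), d) e")      # returns 11
+        get_index_of_closing_parenthesis("x (a -- )\n)", 2)     # returns 10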
+
+ """
+
+ closing_parenthesis_offset = opening_parenthesis_offset
+ embedded_parentheses = 0
+ comment_indicator = 0
+
+ for index, character in enumerate(string[opening_parenthesis_offset + 1:], opening_parenthesis_offset + 1):
+
+ closing_parenthesis_offset = index
+
+ if comment_indicator:
+
+ if (comment_indicator == 1 and character == '\n') or \
+ (comment_indicator == 2 and character == '/' and string[index - 1] == '*'):
+ comment_indicator = 0
+
+ else:
+
+            if character == "(":
+
+ embedded_parentheses += 1
+
+            elif character == ")":
+
+ if embedded_parentheses == 0:
+ break
+ else:
+ embedded_parentheses -= 1
+
+            elif character == "-":
+
+ """
+
+ Check to make sure we are encountering a comment.
+
+ Note: A single "-" is allowed since it can be before a negative default value for example in the
+ create statement.
+
+ """
+
+ # Check to make sure the full comment indicator was found for "--"
+ if string[index + 1] == "-":
+
+ # Set the comment indicator
+ comment_indicator = 1
+
+ elif character == "/":
+
+                # Check to make sure the full comment indicator was found for "/*"
+                if string[index + 1] != "*":
+ log_message = "Comment indicator '{}' found followed by an invalid secondary comment " \
+ "indicator: {} found in {}."
+ log_message = log_message.format(character, string[index + 1], string)
+ logger.error(log_message)
+ raise MasterSchemaParsingError(log_message)
+
+ # Set the comment indicator
+ comment_indicator = 2
+
+ # Check to make sure the closing parenthesis was found
+ if closing_parenthesis_offset == len(string) - 1 and string[closing_parenthesis_offset] != ")":
+ log_message = "The closing parenthesis was not found in {} with opening parenthesis offset: {}."
+ log_message = log_message.format(string, opening_parenthesis_offset)
+ logger.error(log_message)
+ raise MasterSchemaParsingError(log_message)
+
+ return closing_parenthesis_offset
+
+
+def parse_comment_from_sql_segment(sql_segment):
+
+ """
+
+ This function will parse out the comment from the sql_segment. This function assumes that a comment
+ was already detected and needs to be parsed and therefore the sql_segment should start with:
+ 1.) --
+ 2.) /*
+
+ If the sql_segment does not start with either, then an exception will be raised. If a comment is
+ found then the comment will be parsed out and returned along with the remaining sql. Only the first comment
+ will be stripped and returned in the case that there are multiple comments within the supplied sql_segment.
+
+    If either of the two use cases above is found, then it will be parsed in the following manner:
+ 1.) --: The comment will be parsed from the "--" until the newline "\n" character is found:
+ ... [-- ... \n] ...
+    2.) /*: The comment will be parsed from the "/*" until the matching "*/" character sequence is found:
+ ... [/* ... */] ...
+ Note: The "/* ... */" comment tags can have new lines within them.
+
+    Note: The returned comment will include the "--" or "/*" and "*/" strings. If the comment was started with the
+ "--" comment indicator, the ending '\n' character is included in the comment string. It is up to the caller
+ to call rstrip() or a likewise operation if needed.
+
+ Note: The returned remaining_sql_segment will not have strip() called on it.
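+
+    For example (illustrative only):
+
+        parse_comment_from_sql_segment("-- note\nname TEXT")      # returns ("-- note\n", "name TEXT")
+        parse_comment_from_sql_segment("/* note */ name TEXT")    # returns ("/* note */", " name TEXT")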
+
+ :param sql_segment:
+
+ :return: tuple(comment, remaining_sql_segment)
+
+ :raise: MasterSchemaParsingError
+
+ """
+
+ logger = getLogger(LOGGER_NAME)
+
+ # Check if the sql segment starts with "--"
+ if sql_segment.startswith("--"):
+
+ comment = sql_segment[:sql_segment.index('\n') + 1]
+ remaining_sql_segment = sql_segment[sql_segment.index('\n') + 1:]
+
+ return comment, remaining_sql_segment
+
+ # Check if the sql segment starts with "/*"
+ elif sql_segment.startswith("/*"):
+
+ comment = sql_segment[:sql_segment.index("*/") + 2]
+ remaining_sql_segment = sql_segment[sql_segment.index("*/") + 2:]
+
+ return comment, remaining_sql_segment
+
+ # The remaining sql command does not start with "--" or "/*" as expected
+ else:
+ log_message = "The sql segment: {} did not start with the expected \"--\" or \"/*\" strings."
+        log_message = log_message.format(sql_segment)
+ logger.error(log_message)
+ raise MasterSchemaParsingError(log_message)
diff --git a/sqlite_dissect/file/utilities.py b/sqlite_dissect/file/utilities.py
new file mode 100644
index 0000000..5c38524
--- /dev/null
+++ b/sqlite_dissect/file/utilities.py
@@ -0,0 +1,29 @@
+from sqlite_dissect.file.wal.commit_record import WriteAheadLogCommitRecord
+
+"""
+
+utilities.py
+
+This script holds utility functions for dealing with the version classes rather than more general utility methods.
+
+This script holds the following function(s):
+validate_page_version_history(version_history)
+
+"""
+
+
+def validate_page_version_history(version_history):
+ for version_number, version in version_history.versions.iteritems():
+ for page_number, page in version.pages.iteritems():
+ if page.page_version_number != version.page_version_index[page.number]:
+ return False
+ if page.version_number != version.version_number:
+ return False
+ if isinstance(version, WriteAheadLogCommitRecord):
+ if page_number in version.updated_page_numbers:
+ page_frame_index = version.page_frame_index
+ page_frame = page_frame_index[page.number]
+ actual_page_frame = version.frames[page.number].frame_number
+ if page_frame != actual_page_frame:
+ return False
+ return True
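+
+
+# A minimal usage sketch (assuming an already parsed version history object; the variable name below is
+# illustrative only):
+#
+#     from sqlite_dissect.file.utilities import validate_page_version_history
+#
+#     if not validate_page_version_history(version_history):
+#         print("The page version index is inconsistent with the parsed pages.")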
diff --git a/sqlite_dissect/file/version.py b/sqlite_dissect/file/version.py
new file mode 100644
index 0000000..684caf3
--- /dev/null
+++ b/sqlite_dissect/file/version.py
@@ -0,0 +1,388 @@
+from abc import ABCMeta
+from abc import abstractmethod
+from binascii import hexlify
+from logging import getLogger
+from re import sub
+from sqlite_dissect.constants import INDEX_INTERIOR_PAGE_HEX_ID
+from sqlite_dissect.constants import INDEX_LEAF_PAGE_HEX_ID
+from sqlite_dissect.constants import LOGGER_NAME
+from sqlite_dissect.constants import MASTER_PAGE_HEX_ID
+from sqlite_dissect.constants import PAGE_TYPE
+from sqlite_dissect.constants import PAGE_TYPE_LENGTH
+from sqlite_dissect.constants import SQLITE_DATABASE_HEADER_LENGTH
+from sqlite_dissect.constants import SQLITE_MASTER_SCHEMA_ROOT_PAGE
+from sqlite_dissect.constants import TABLE_INTERIOR_PAGE_HEX_ID
+from sqlite_dissect.constants import TABLE_LEAF_PAGE_HEX_ID
+from sqlite_dissect.exception import VersionParsingError
+from sqlite_dissect.file.database.header import DatabaseHeader
+from sqlite_dissect.file.database.page import IndexInteriorPage
+from sqlite_dissect.file.database.page import IndexLeafPage
+from sqlite_dissect.file.database.page import TableInteriorPage
+from sqlite_dissect.file.database.page import TableLeafPage
+from sqlite_dissect.file.database.utilities import get_pages_from_b_tree_page
+from sqlite_dissect.file.schema.master import MasterSchema
+
+"""
+
+version.py
+
+This script holds the superclass objects used for parsing the database and write ahead log.
+
+This script holds the following object(s):
+Version(object)
+
+"""
+
+
+class Version(object):
+
+ __metaclass__ = ABCMeta
+
+ def __init__(self, file_handle, version_number, store_in_memory, strict_format_checking):
+
+ self._logger = getLogger(LOGGER_NAME)
+
+ self.file_handle = file_handle
+ self.version_number = version_number
+ self.store_in_memory = store_in_memory
+ self.strict_format_checking = strict_format_checking
+ self.page_size = self.file_handle.header.page_size
+
+ self._database_header = None
+ self.database_size_in_pages = None
+
+ self._root_page = None
+
+ self.first_freelist_trunk_page = None
+ self.freelist_page_numbers = None
+ self.pointer_map_pages = None
+ self.pointer_map_page_numbers = None
+
+ self._master_schema = None
+
+ self.updated_page_numbers = None
+ self.page_version_index = None
+
+ """
+
+ The _pages variable is only for the use case that the pages are requested to be stored in memory.
+
+ """
+
+ self._pages = None
+
+ """
+
+ The following variables are to track across the versions (database and wal commit records) what portions of the
+ file are changed.
+
+ For the Database:
+
+ The database header, root b-tree page, and master schema will be set to True since these objects are always
+ considered modified in the database file. As a side note, the master schema could not have any entries if there
+        considered modified in the database file. As a side note, the master schema may have no entries if there
+        is no schema, but it is still considered modified.
+ The freelist pages modified flag will be set to True if freelist pages exist, otherwise False since there are no
+ freelist pages.
+
+ The pointer map pages modified flag will be set to True if the largest b-tree root page is set in the header
+ indicating that auto-vacuuming is turned on. Otherwise, if this field is 0, auto-vacuuming is turned off and
+ the pointer map pages modified flag will be set to False. As a side note, if this is set to False, then it
+ will continue to be False throughout all following versions since the auto-vacuuming must be set before the
+ schema creation and cannot be turned off if enabled, or turned on if not enabled initially. (Switching between
+ full (0) and incremental (1) auto-vacuuming modes is allowed.)
+
+ The updated b-tree page numbers array are all the schema root page numbers including all pages of the b-tree.
+ These will represent all of the b-tree and overflow pages (excluding the master schema related pages) updated.
+ All of the b-tree pages for the database will be included in this array.
+
+ For the WriteAheadLogCommitRecord:
+
+ The database header will be set to True if the database header was updated. This should always occur if any
+ change was made to the root page (although the root page may be in the commit record with no changes).
+
+ The root b-tree page modified flag will be set if the content on the root b-tree portion (not including the
+        database header) is modified. This will also result in the master schema modified flag being set to True.
+ However, the inverse is not true as described next.
+
+ The master schema modified flag will be set if the master schema is updated. This includes any of the master
+ schema pages being updated in the b-tree. If the master schema updated pages did not include the sqlite master
+        schema root page (1), then the master schema modified flag will still be set to True, but the root b-tree page
+ modified flag will be False.
+
+        The freelist pages modified and pointer map pages modified flags will be set to True when the respective
+        pages are updated in any way.
+
+ The updated b-tree page numbers array are all the schema root page numbers including all pages of the b-tree.
+ These will represent all of the b-tree and overflow pages (excluding the master schema related pages) updated.
+ Only the b-tree pages for the wal commit record that were updated will be included in this array.
+
+        Note: The database header modified flag and the root b-tree page modified flags refer to different areas of the
+ sqlite root page. The database header may be modified without the root b-tree page being modified.
+ However, if the root b-tree page is modified, then the header should always be modified since the header
+ contains a change counter that is incremented whenever changes to the database are done.
+
+ Note: The following variables below specify if the root b-tree page was modified and if the master
+ schema was modified. Although the master schema is on the database root b-tree page, the
+ master schema changes may not be directly on the root page itself. Therefore, the master schema
+ may be modified without modifying the root page but if the root b-tree page is modified, then the
+ master schema modified flag will always be set.
+
+ """
+
+ self.database_header_modified = False
+ self.root_b_tree_page_modified = False
+ self.master_schema_modified = False
+ self.freelist_pages_modified = False
+ self.pointer_map_pages_modified = False
+
+ self.updated_b_tree_page_numbers = None
+
+ def __repr__(self):
+ return self.__str__().encode("hex")
+
+ def __str__(self):
+ return sub("\t", "", sub("\n", " ", self.stringify()))
+
+ def stringify(self, padding="", print_pages=True, print_schema=True):
+ string = padding + "File Type: {}\n" \
+ + padding + "Version Number: {}\n" \
+ + padding + "Store in Memory: {}\n" \
+ + padding + "Strict Format Checking: {}\n" \
+ + padding + "Page Size: {}\n" \
+ + padding + "File Handle:\n{}\n" \
+ + padding + "Database Header:\n{}\n" \
+ + padding + "Database Size in Pages: {}\n" \
+ + padding + "Freelist Page Numbers: {}\n" \
+ + padding + "Pointer Map Page Numbers: {}\n" \
+ + padding + "Updated Page Numbers: {}\n" \
+ + padding + "Page Version Index: {}\n" \
+ + padding + "Database Header Modified: {}\n" \
+ + padding + "Root B-Tree Page Modified: {}\n" \
+ + padding + "Master Schema Modified: {}\n" \
+ + padding + "Freelist Pages Modified: {}\n" \
+ + padding + "Pointer Map Pages Modified: {}\n" \
+ + padding + "Updated B-Tree Page Numbers: {}"
+ string = string.format(self.file_type,
+ self.version_number,
+ self.store_in_memory,
+ self.strict_format_checking,
+ self.page_size,
+ self.file_handle.stringify(padding + "\t"),
+ self.database_header.stringify(padding + "\t"),
+ self.database_size_in_pages,
+ self.freelist_page_numbers,
+ self.pointer_map_page_numbers,
+ self.updated_page_numbers,
+ self.page_version_index,
+ self.database_header_modified,
+ self.root_b_tree_page_modified,
+ self.master_schema_modified,
+ self.freelist_pages_modified,
+ self.pointer_map_pages_modified,
+ self.updated_b_tree_page_numbers)
+ if print_pages:
+ for page in self.pages.itervalues():
+ string += "\n" + padding + "Page:\n{}".format(page.stringify(padding + "\t"))
+ if print_schema:
+ string += "\n" \
+ + padding + "Master Schema:\n{}".format(self.master_schema.stringify(padding + "\t", print_pages))
+ return string
+
+ @property
+ def file_type(self):
+ return self.file_handle.file_type
+
+ @property
+ def database_text_encoding(self):
+ return self.file_handle.database_text_encoding
+
+ @database_text_encoding.setter
+ def database_text_encoding(self, database_text_encoding):
+ self.file_handle.database_text_encoding = database_text_encoding
+
+ @property
+ def database_header(self):
+ if not self._database_header:
+ return DatabaseHeader(self.get_page_data(SQLITE_MASTER_SCHEMA_ROOT_PAGE)[:SQLITE_DATABASE_HEADER_LENGTH])
+ return self._database_header
+
+ @property
+ def root_page(self):
+ if not self._root_page:
+ return self.get_b_tree_root_page(SQLITE_MASTER_SCHEMA_ROOT_PAGE)
+ return self._root_page
+
+ @property
+ def master_schema(self):
+ if not self._master_schema:
+ return MasterSchema(self, self.root_page)
+ return self._master_schema
+
+ @property
+ def pages(self):
+
+ # Return the pages if they are being stored in memory and already parsed
+ if self._pages:
+ return self._pages
+
+ pages = {}
+
+ # Populate the freelist pages into the pages dictionary
+ freelist_trunk_page = self.first_freelist_trunk_page
+ while freelist_trunk_page:
+ pages[freelist_trunk_page.number] = freelist_trunk_page
+ for freelist_leaf_page in freelist_trunk_page.freelist_leaf_pages:
+ pages[freelist_leaf_page.number] = freelist_leaf_page
+ freelist_trunk_page = freelist_trunk_page.next_freelist_trunk_page
+
+ # Populate the pointer map pages into the pages dictionary
+ for pointer_map_page in self.pointer_map_pages:
+ pages[pointer_map_page.number] = pointer_map_page
+
+ """
+
+ Since the WAL commit record may not have the master schema parsed and needs to parse it, we store the master
+ schema to a variable so it is only parsed once, if need be.
+
+ """
+
+ master_schema = self.master_schema
+
+ # Populate the master schema page into the pages dictionary including the root page
+ for master_schema_page in master_schema.master_schema_pages:
+ pages[master_schema_page.number] = master_schema_page
+
+ # Populate the b-trees from the master schema including the root page
+ for b_tree_root_page_number in master_schema.master_schema_b_tree_root_page_numbers:
+ b_tree_root_page = self.get_b_tree_root_page(b_tree_root_page_number)
+ for b_tree_page in get_pages_from_b_tree_page(b_tree_root_page):
+ pages[b_tree_page.number] = b_tree_page
+
+ # Set the number of pages that were found
+ number_of_pages = len(pages)
+
+ if number_of_pages != self.database_size_in_pages:
+ log_message = "The number of pages: {} did not match the database size in pages: {} for version: {}."
+ log_message = log_message.format(number_of_pages, self.database_size_in_pages, self.version_number)
+ self._logger.error(log_message)
+ raise VersionParsingError(log_message)
+
+ for page_number in [page_index + 1 for page_index in range(self.database_size_in_pages)]:
+ if page_number not in pages:
+ log_message = "Page number: {} was not found in the pages: {} for version: {}."
+ log_message = log_message.format(page_number, pages.keys(), self.version_number)
+ self._logger.error(log_message)
+ raise VersionParsingError(log_message)
+
+ return pages
+
+ @abstractmethod
+ def get_page_data(self, page_number, offset=0, number_of_bytes=None):
+ log_message = "The abstract method get_page_data was called directly and is not implemented."
+ self._logger.error(log_message)
+ raise NotImplementedError(log_message)
+
+ @abstractmethod
+ def get_page_offset(self, page_number):
+ log_message = "The abstract method get_page_offset was called directly and is not implemented."
+ self._logger.error(log_message)
+ raise NotImplementedError(log_message)
+
+ def get_b_tree_root_page(self, b_tree_page_number):
+
+ """
+
+        This function parses and returns the b-tree page for the specified page number, treating it as the root
+        page of its b-tree.
+
+        Note: There is no real way of efficiently checking if this page is a root page or not and it does not
+              really matter for the purpose of this library. Therefore, any b-tree page requested is considered a
+              root page in relation to its position in the b-tree that it is a part of for the purposes of this
+              function.
+
+        :param b_tree_page_number: int  The page number of the b-tree root page to parse.
+
+        :return: The parsed b-tree page (table or index, interior or leaf).
+
+ """
+
+        # Return the page if it is already stored in memory and already parsed
+ if self._pages:
+
+ b_tree_root_page = self._pages[b_tree_page_number]
+
+ # Make sure the page is a b-tree page
+ if b_tree_root_page.page_type not in [PAGE_TYPE.B_TREE_TABLE_INTERIOR, PAGE_TYPE.B_TREE_TABLE_LEAF,
+ PAGE_TYPE.B_TREE_INDEX_INTERIOR, PAGE_TYPE.B_TREE_INDEX_LEAF]:
+ log_message = "The b-tree page number: {} is not a b-tree page but instead has a type of: {}."
+ log_message = log_message.format(b_tree_page_number, b_tree_root_page.page_type)
+ self._logger.error(log_message)
+ raise ValueError(log_message)
+
+ # Return the b-tree page
+ return b_tree_root_page
+
+ page_hex_type = self.get_page_data(b_tree_page_number, 0, PAGE_TYPE_LENGTH)
+
+ if page_hex_type == MASTER_PAGE_HEX_ID:
+
+ # Make sure this is the sqlite master schema root page
+ if b_tree_page_number != SQLITE_MASTER_SCHEMA_ROOT_PAGE:
+                log_message = "The b-tree page number: {} contains the master page hex but is not page number: {}."
+                log_message = log_message.format(b_tree_page_number, SQLITE_MASTER_SCHEMA_ROOT_PAGE)
+ self._logger.error(log_message)
+ raise VersionParsingError(log_message)
+
+ page_hex_type = self.get_page_data(b_tree_page_number, SQLITE_DATABASE_HEADER_LENGTH, PAGE_TYPE_LENGTH)
+
+ # If this is the sqlite master schema root page then this page has to be a table interior or leaf page
+ if page_hex_type not in [TABLE_INTERIOR_PAGE_HEX_ID, TABLE_LEAF_PAGE_HEX_ID]:
+ log_message = "The b-tree page number: {} contains the master page hex but has hex type: {} which " \
+ "is not the expected table interior or table leaf page hex."
+ log_message = log_message.format(b_tree_page_number, hexlify(page_hex_type))
+ self._logger.error(log_message)
+ raise VersionParsingError(log_message)
+
+ # Check if it was a b-tree table interior
+ if page_hex_type == TABLE_INTERIOR_PAGE_HEX_ID:
+
+ # Create the table interior page
+ return TableInteriorPage(self, b_tree_page_number)
+
+ # Check if it was a b-tree table leaf
+ elif page_hex_type == TABLE_LEAF_PAGE_HEX_ID:
+
+ # Create the table leaf page
+ return TableLeafPage(self, b_tree_page_number)
+
+ # Check if it was a b-tree index interior
+ elif page_hex_type == INDEX_INTERIOR_PAGE_HEX_ID:
+
+            # Create the index interior page
+ return IndexInteriorPage(self, b_tree_page_number)
+
+ # Check if it was a b-tree index leaf
+ elif page_hex_type == INDEX_LEAF_PAGE_HEX_ID:
+
+            # Create the index leaf page
+ return IndexLeafPage(self, b_tree_page_number)
+
+ # Throw an exception since the type of the b-tree page was not a b-tree hex type
+ else:
+
+            log_message = "The b-tree page number: {} did not refer to a b-tree page but rather a page of hex type: {}."
+            log_message = log_message.format(b_tree_page_number, hexlify(page_hex_type))
+ self._logger.error(log_message)
+ raise ValueError(log_message)
+
+ def get_page_version(self, page_number):
+
+ try:
+
+ return self.page_version_index[page_number]
+
+ except KeyError:
+
+ log_message = "The page number: {} was not found in the page version index: {} for version: {}."
+ log_message = log_message.format(page_number, self.page_version_index, self.version_number)
+ self._logger.error(log_message)
+ raise
diff --git a/sqlite_dissect/file/version_parser.py b/sqlite_dissect/file/version_parser.py
new file mode 100644
index 0000000..c2e23ae
--- /dev/null
+++ b/sqlite_dissect/file/version_parser.py
@@ -0,0 +1,338 @@
+from abc import ABCMeta
+from logging import getLogger
+from re import sub
+from warnings import warn
+from sqlite_dissect.constants import BASE_VERSION_NUMBER
+from sqlite_dissect.constants import LOGGER_NAME
+from sqlite_dissect.constants import MASTER_SCHEMA_ROW_TYPE
+from sqlite_dissect.constants import PAGE_TYPE
+from sqlite_dissect.exception import VersionParsingError
+from sqlite_dissect.file.schema.master import OrdinaryTableRow
+from sqlite_dissect.file.schema.master import VirtualTableRow
+
+"""
+
+version_parser.py
+
+This script holds the objects for parsing through the version history for master schema entries. This can be used
+for retrieving cells (records), carving, signature generation, etc.
+
+This script holds the following object(s):
+VersionParser(object)
+
+"""
+
+
+class VersionParser(object):
+
+ __metaclass__ = ABCMeta
+
+ def __init__(self, version_history, master_schema_entry, version_number=None, ending_version_number=None):
+
+ """
+
+
+
+ The version history will be iterated through and the respective subclass will use the master schema entry
+        parsed from every version where that master schema entry is found. The first version number where the master
+        schema entry is found and the last version number it is found in (if applicable) will be set as the parser
+        starting version number and parser ending version number.
+
+ In addition, the version number may be set for a specific version to be parsed. This way if you only want a
+ specific version to be parsed, you can specify the version number. If you want the range between two specific
+ versions, the version number and ending version number can be specified to parse the versions in between
+ (including the specified version number and ending version number). If these fields are set the parser
+ starting and ending version number will be set accordingly to be within the range of these versions, if
+ existing, otherwise None. If the master schema entry does not exist in between the versions, a warning will
+        be raised and the subclass will handle the use case accordingly (either by creating empty object(s) or an
+        "empty" class depending on implementation).
+
+ The md5_hash_identifier field is used from the master schema entry to identify it across the versions. Due
+ to this, it does not matter what master schema entry from what version you choose. The md5_hash_identifier
+ is derived from the row id, name, table name, type, and sql to ensure uniqueness. (Root page numbers can be
+ updated.)
+
+ Note: The use case where the same master schema entry is removed and re-added needs to be addressed in the wal
+ file and is not fully supported here.
+
+ :param version_history:
+ :param master_schema_entry:
+ :param version_number:
+ :param ending_version_number:
+
+ :return:
+
+ :raise:
+
+ """
+
+ logger = getLogger(LOGGER_NAME)
+
+ if version_number is None and ending_version_number:
+ log_message = "Version number not specified where ending version number was specified as: {} for " \
+ "master schema entry with root page number: {} row type: {} name: {} table name: {} " \
+ "and sql: {}."
+ log_message = log_message.format(ending_version_number, master_schema_entry.root_page_number,
+ master_schema_entry.row_type, master_schema_entry.name,
+ master_schema_entry.table_name, master_schema_entry.sql)
+ logger.error(log_message)
+ raise ValueError(log_message)
+
+ if version_number is not None and version_number == ending_version_number:
+ log_message = "Version number: {} specified where ending version number was also specified as: {} for " \
+ "master schema entry with root page number: {} row type: {} name: {} table name: {} and " \
+ "sql: {}."
+ log_message = log_message.format(version_number, ending_version_number,
+ master_schema_entry.root_page_number, master_schema_entry.row_type,
+ master_schema_entry.name, master_schema_entry.table_name,
+ master_schema_entry.sql)
+ logger.error(log_message)
+ raise ValueError(log_message)
+
+ number_of_versions = version_history.number_of_versions
+
+ """
+
+ The ending version number needs to be less than the number of versions since version numbers start from
+ 0 and go to the last version. Therefore, the number of versions will be one greater than the last version
+ number.
+
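+
+        For example, with four versions (numbered 0 through 3), 3 is the largest valid ending version number.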
+ """
+
+ if ending_version_number is not None and (ending_version_number >= number_of_versions or
+ ending_version_number <= version_number):
+ log_message = "Invalid ending version number: {} with {} number of versions with version number: {} for " \
+ "master schema entry with root page number: {} row type: {} name: {} table name: {} " \
+ "and sql: {}."
+ log_message = log_message.format(ending_version_number, number_of_versions, version_number,
+ master_schema_entry.root_page_number, master_schema_entry.row_type,
+ master_schema_entry.name, master_schema_entry.table_name,
+ master_schema_entry.sql)
+ logger.error(log_message)
+ raise ValueError(log_message)
+
+ self.version_number = version_number
+ self.ending_version_number = ending_version_number
+
+ self.parser_starting_version_number = version_number if version_number is not None else BASE_VERSION_NUMBER
+ self.parser_ending_version_number = ending_version_number \
+ if ending_version_number is not None else number_of_versions - 1
+
+ """
+
+        According to the sqlite documentation, the only master schema row types with a root page are table and index
+        types (excluding virtual tables). Therefore, we can only parse cells from these types. In the case that
+        trigger or view master schema entry row types were specified, we raise a warning here. This will result in
+        having no entries to parse through.
+
+        Note: Support for virtual table modules that may or may not have database b-tree pages needs to be accounted
+ for. A warning will be displayed if a virtual table is encountered.
+
+ Note: Support for "without rowid" tables are not accounted for properly. For now, a warning will be displayed.
+
+ """
+
+ if master_schema_entry.row_type not in [MASTER_SCHEMA_ROW_TYPE.TABLE, MASTER_SCHEMA_ROW_TYPE.INDEX]:
+ log_message = "Invalid master schema entry row type: {} for master schema entry with root page " \
+ "number: {} name: {} table name: {} and sql: {}. Only table and index master " \
+ "schema entries have associated cells to be parsed."
+ log_message = log_message.format(master_schema_entry.row_type, master_schema_entry.root_page_number,
+ master_schema_entry.name, master_schema_entry.table_name,
+ master_schema_entry.sql)
+ logger.warn(log_message)
+ warn(log_message, RuntimeWarning)
+
+ # Set the page type and update it as appropriate
+ self.page_type = PAGE_TYPE.B_TREE_TABLE_LEAF
+
+ if isinstance(master_schema_entry, VirtualTableRow):
+ log_message = "A virtual table row type was found for the version parser which is not fully supported " \
+ "for master schema entry root page number: {} type: {} name: {} table name: {} and sql: {}."
+ log_message = log_message.format(master_schema_entry.root_page_number,
+ master_schema_entry.row_type, master_schema_entry.name,
+ master_schema_entry.table_name, master_schema_entry.sql)
+ logger.warn(log_message)
+ warn(log_message, RuntimeWarning)
+
+ elif isinstance(master_schema_entry, OrdinaryTableRow) and master_schema_entry.without_row_id:
+ log_message = "A \"without rowid\" table row type was found for the version parser which is not " \
+ "supported for master schema entry root page number: {} row type: {} name: {} " \
+ "table name: {} and sql: {}. Erroneous cells may be generated."
+ log_message = log_message.format(master_schema_entry.root_page_number,
+ master_schema_entry.row_type, master_schema_entry.name,
+ master_schema_entry.table_name, master_schema_entry.sql)
+ logger.warn(log_message)
+ warn(log_message, RuntimeWarning)
+
+ self.page_type = PAGE_TYPE.B_TREE_INDEX_LEAF
+
+        # Set the page type if the master schema row type is an index
+ if master_schema_entry.row_type == MASTER_SCHEMA_ROW_TYPE.INDEX:
+ self.page_type = PAGE_TYPE.B_TREE_INDEX_LEAF
+
+ """
+
+        Set the master schema entry fields we care about in this class.  Since root page numbers can differ
+        between versions, the root page numbers are stored in a dictionary of the form:
+        root_page_number_version_index[VERSION_NUMBER] = ROOT_PAGE_NUMBER(VERSION)
+
+ """
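+        # For illustration only (hypothetical values): once populated below, the index may look like
+        #     {0: 5, 1: 5, 2: 9}
+        # meaning this entry's b-tree root stayed on page 5 for versions 0 and 1 and moved to page 9 in version 2.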
+
+ self.row_type = master_schema_entry.row_type
+ self.name = master_schema_entry.name
+ self.table_name = master_schema_entry.table_name
+ self.sql = master_schema_entry.sql
+ self.root_page_number_version_index = {}
+
+ # Get the md5_hash_identifier from the master schema entry
+ self.master_schema_entry_md5_hash_identifier = master_schema_entry.md5_hash_identifier
+
+ """
+
+ Setup the version numbers to parse through for the version history.
+
+ Note: If the master schema entry is either not found, or stops being found and then re-found, a warning will
+ be raised. The master schema entry uniqueness is determined by the master schema entry md5 hash
+ identifier from the MasterSchemaRow class.
+
+ """
+
+ versions = version_history.versions
+ starting_version_number = None
+ ending_version_number = None
+ for version_number in range(self.parser_starting_version_number, self.parser_ending_version_number + 1):
+
+ version = versions[version_number]
+
+ if version.master_schema_modified:
+ master_schema = version.master_schema
+ else:
+ master_schema = version.last_master_schema
+
+ if not master_schema:
+                log_message = "The master schema could not be found in version number: {} while parsing the " \
+                              "version history for master schema entry with name: {} table name: {} " \
+                              "row type: {} and sql: {} between parser starting version number: {} and parser " \
+                              "ending version number: {}."
+ log_message = log_message.format(version_number, self.name, self.table_name, self.row_type, self.sql,
+ self.parser_starting_version_number,
+ self.parser_ending_version_number)
+ logger.error(log_message)
+ raise VersionParsingError(log_message)
+
+ entries = master_schema.master_schema_entries
+ entries_dictionary = dict(map(lambda entry: [entry.md5_hash_identifier, entry], entries))
+
+ if self.master_schema_entry_md5_hash_identifier in entries_dictionary:
+
+ if ending_version_number is None:
+
+ if starting_version_number is not None:
+ log_message = "The starting version number was set already when it should not have been " \
+ "since the ending version number was still not set for master schema entry " \
+ "row type: {} with root page number: {} name: {} table name: {} and sql: {}."
+ log_message = log_message.format(master_schema_entry.row_type,
+ master_schema_entry.root_page_number, master_schema_entry.name,
+ master_schema_entry.table_name, master_schema_entry.sql)
+ logger.error(log_message)
+ raise VersionParsingError(log_message)
+
+ starting_version_number = version_number
+ ending_version_number = version_number
+
+ if self.root_page_number_version_index:
+ log_message = "The root page number version index has already been populated with values " \
+ "when it should not have been for master schema entry row type: {} with root " \
+ "page number: {} name: {} table name: {} and sql: {}."
+ log_message = log_message.format(master_schema_entry.row_type,
+ master_schema_entry.root_page_number, master_schema_entry.name,
+ master_schema_entry.table_name, master_schema_entry.sql)
+ logger.error(log_message)
+ raise VersionParsingError(log_message)
+
+ # Add the first version number and b-tree root page number into the root page number version index
+ root_page_number = entries_dictionary[self.master_schema_entry_md5_hash_identifier].root_page_number
+ self.root_page_number_version_index[version_number] = root_page_number
+
+ elif ending_version_number == version_number - 1:
+ ending_version_number = version_number
+
+ if not self.root_page_number_version_index:
+ log_message = "The root page number version index has not already been populated with values " \
+ "when it should have been for master schema entry row type: {} with root " \
+ "page number: {} name: {} table name: {} and sql: {}."
+ log_message = log_message.format(master_schema_entry.row_type,
+ master_schema_entry.root_page_number, master_schema_entry.name,
+ master_schema_entry.table_name, master_schema_entry.sql)
+ logger.error(log_message)
+ raise VersionParsingError(log_message)
+
+ # Add the version number and b-tree root page number into the root page number version index
+ root_page_number = entries_dictionary[self.master_schema_entry_md5_hash_identifier].root_page_number
+ self.root_page_number_version_index[version_number] = root_page_number
+
+ else:
+ log_message = "Version number: {} did not have a master schema entry for the previous " \
+ "version number for master schema entry with name: {} table name: {} " \
+ "row type: {} and sql: {} for version number: {} and ending version number: {}."
+ log_message = log_message.format(version_number, self.name, self.table_name, self.row_type,
+ self.sql, self.parser_starting_version_number,
+ self.parser_ending_version_number)
+ logger.warn(log_message)
+ warn(log_message, RuntimeWarning)
+
+ if starting_version_number is None and ending_version_number is None:
+ log_message = "Was unable to find any matching schema entries between version numbers {} " \
+ "and {}. The version parser will not parse anything for master schema entry with " \
+ "name: {} table name: {} row type: {} and sql: {}."
+ log_message = log_message.format(self.parser_starting_version_number,
+ self.parser_ending_version_number, self.name, self.table_name,
+ self.row_type, self.sql)
+ logger.warn(log_message)
+ warn(log_message, RuntimeWarning)
+
+ self.parser_starting_version_number = starting_version_number
+ self.parser_ending_version_number = ending_version_number
+
+ """
+
+        We now have the parser starting and ending version numbers that we need to parse between and a root
+        page number version index referring to each version and its root b-tree page in case it was updated.
+
+ Note: The root pages to the master schema entries are generated on demand from the version which will return
+ the b-tree page if it is already in memory, or parse it and then return it if it is not. Versions can
+              either be stored in memory or read out on demand for b-tree pages.  This allows for conserving
+              memory and speeding up parsing (so each b-tree page does not need to be re-parsed in the case
+              where it does not change).
+
+ """
+
+ def __repr__(self):
+ return self.__str__().encode("hex")
+
+ def __str__(self):
+ return sub("\t", "", sub("\n", " ", self.stringify()))
+
+ def stringify(self, padding=""):
+ string = padding + "Row Type: {}\n" \
+ + padding + "Page Type: {}\n" \
+ + padding + "Name: {}\n" \
+ + padding + "Table Name: {}\n" \
+ + padding + "SQL: {}\n" \
+ + padding + "Root Page Number Version Index: {}\n" \
+ + padding + "Master Schema Entry MD5 Hash Identifier: {}\n" \
+ + padding + "Version Number: {}\n" \
+ + padding + "Ending Version Number: {}\n" \
+ + padding + "Parser Starting Version Number: {}\n" \
+ + padding + "Parser Ending Version Number: {}"
+ string = string.format(self.row_type,
+ self.page_type,
+ self.name,
+ self.table_name,
+ self.sql,
+ self.root_page_number_version_index,
+ self.master_schema_entry_md5_hash_identifier,
+ self.version_number,
+ self.ending_version_number,
+ self.parser_starting_version_number,
+ self.parser_ending_version_number)
+ return string
diff --git a/sqlite_dissect/file/wal/README.md b/sqlite_dissect/file/wal/README.md
new file mode 100644
index 0000000..26182a0
--- /dev/null
+++ b/sqlite_dissect/file/wal/README.md
@@ -0,0 +1,130 @@
+
+# sqlite_dissect.file.wal
+
+This package will control parsing and access to the SQLite WAL files.
+
+- commit_record.py
+- frame.py
+- header.py
+- utilities.py
+- wal.py
+
+TODO items for the "wal" package:
+
+- [ ] Finish UML class diagrams.
+
+
+
+### commit_record.py
+This script holds the objects used for parsing the write ahead log commit records.
+
+This script holds the following object(s):
+- WriteAheadLogCommitRecord(Version)
+
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+- [ ] Better exception handling when creating objects, etc.
+- [ ] Investigate where a database file has empty space beyond the page size (wal checkpoints were set).
+- [ ] Is there a need (or way) to implement this without an initial database (just wal file)?
+ ##### WriteAheadLogCommitRecord Class:
+ - [ ] Check lists and dictionaries for fields before adding.
+ - [ ] Is there a better way to handle pointer map pages (parse on demand)?
+ - [ ] Investigate when a set of frames does not have a commit frame. (Warning currently thrown.)
+ - [ ] Investigate root pages in commit records with no changes. (Warning currently thrown.)
+ - [ ] The incremental vacuum mode can change in the header from 1 to 2 or 2 to 1.
+ - [ ] Investigate if the database text encoding/schema format number can change after set.
+ - [ ] Investigate if the size in pages can differ on first update if last version < 3.7.0.
+
+
+
+### frame.py
+This script holds the objects used for parsing the WAL frame.
+
+> Note: The WriteAheadLogFrame class is not responsible for parsing the page data itself. It is meant to give
+> information on the WAL frame and offsets of the page data but in order to parse the page data, the set of all
+> page changes to the commit record this frame belongs in is needed. Therefore the commit record class
+> (WriteAheadLogCommitRecord) will be responsible for parsing pages.
+>
+> There was some discussion about the page being stored back in the WriteAheadLogFrame once parsed but it was
+> decided that this made little to no difference and should just be retrieved from the commit record.
+>
+> As a side note, there are some basic things parsed from the page such as the page type. This is only for
+> debugging and logging purposes.
+
+This script holds the following object(s):
+- WriteAheadLogFrame(object)
+
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+- [ ] Finish try/except exception handling for struct.error and ord in classes.
+ ##### WriteAheadLogFrame Class:
+ - [ ] Are both the frame index and frame number needed? Should the "frame" prefix be removed?
+ - [ ] Handle exceptions that may be raised from creating the wal frame header.
+ - [ ] The contains_sqlite_database_header attribute should apply to table b-trees, not all b-trees.
+ - [ ] Document that the root page is not parsed or contained in the frame and why.
+
+
+
+### header.py
+This script holds the header objects used for parsing the header of the WAL file and WAL frames.
+
+This script holds the following object(s):
+- WriteAheadLogHeader(SQLiteHeader)
+- WriteAheadLogFrameHeader(object)
+
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+- [ ] Finish try/except exception handling for struct.error and ord in classes.
+- [ ] Implement checking of the salt values.
+- [ ] Implement checking of checksums in either big/little endian.
+- [ ] Investigate if the big/little endian applies to both checksums in the file header and frame header.
+- [ ] Create arrays for salt and checksum values rather than separate variables? They are arrays in the sqlite c code.
+ ##### WriteAheadLogHeader Class:
+ - [ ] Investigate use cases where the checkpoint != 0. A warning is thrown currently.
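+
+Below is a minimal, self-contained sketch (not this package's API) of how the fixed-size WAL structures
+described above could be unpacked, assuming only the documented SQLite format: a 32-byte WAL file header
+and a 24-byte frame header, both made up of big-endian 32-bit fields.
+
+```python
+import struct
+
+
+# WAL file header: magic, format version, page size, checkpoint sequence,
+# salt-1, salt-2, checksum-1 and checksum-2 (eight big-endian 32-bit integers).
+def parse_wal_header(data):
+    names = ("magic", "format_version", "page_size", "checkpoint_sequence",
+             "salt_1", "salt_2", "checksum_1", "checksum_2")
+    return dict(zip(names, struct.unpack(">8I", data[:32])))
+
+
+# WAL frame header: page number, database size in pages after commit
+# (non-zero only for commit frames), salt-1, salt-2, checksum-1 and checksum-2.
+def parse_wal_frame_header(data):
+    names = ("page_number", "page_size_after_commit",
+             "salt_1", "salt_2", "checksum_1", "checksum_2")
+    fields = dict(zip(names, struct.unpack(">6I", data[:24])))
+    fields["commit_frame"] = fields["page_size_after_commit"] != 0
+    return fields
+```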
+
+
+
+### utilities.py
+This script holds utility functions for dealing with WAL specific objects, such as comparing database headers,
+rather than more general utility methods.
+
+This script holds the following function(s):
+- compare_database_headers(previous_database_header, new_database_header)
+
+
+TODO:
+- [ ] Documentation improvements.
+ ##### compare_database_headers Function:
+ - [ ] The \_\_dict\_\_ also returns class objects that may cause issues.
+
+
+
+### wal.py
+This script holds the WAL objects used for parsing the WAL file.
+
+This script holds the following object(s):
+- WriteAheadLog(object)
+
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+ ##### WriteAheadLog Class:
+ - [ ] Note that this does not extend the version object, instead the commit record does.
+ - [ ] Handle exceptions that may be raised from creating the wal frame.
+ - [ ] Check the salts and checksums across the frames to the header.
+ - [ ] Address the use case of having additional frames past the last committed frame.
+ - [ ] Update the commit record number when invalid frames are implemented.
+ - [ ] Implement wal files with invalid frames.
+ - [ ] Expand on salt 1 and checkpoint referencing documentation and in stringify() functions.
+ - [ ] Check the last valid frame index matches that in the wal index file (if found).
+ - [ ] Check the database size in pages in the wal index file (if found) against the last commit record.
+ - [ ] The file_size arg may not be needed since it is in the file handle and may be removed.
+
\ No newline at end of file
diff --git a/sqlite_dissect/file/wal/__init__.py b/sqlite_dissect/file/wal/__init__.py
new file mode 100644
index 0000000..721329e
--- /dev/null
+++ b/sqlite_dissect/file/wal/__init__.py
@@ -0,0 +1,10 @@
+
+"""
+
+__init__.py
+
+This init script will initialize any needed logic for this package.
+
+This package will control parsing and access to the SQLite WAL files.
+
+"""
diff --git a/sqlite_dissect/file/wal/commit_record.py b/sqlite_dissect/file/wal/commit_record.py
new file mode 100644
index 0000000..b3f1051
--- /dev/null
+++ b/sqlite_dissect/file/wal/commit_record.py
@@ -0,0 +1,1314 @@
+from copy import copy
+from warnings import warn
+from sqlite_dissect.constants import DATABASE_HEADER_VERSIONED_FIELDS
+from sqlite_dissect.constants import FIRST_FREELIST_TRUNK_PARENT_PAGE_NUMBER
+from sqlite_dissect.constants import FIRST_FREELIST_TRUNK_PAGE_INDEX
+from sqlite_dissect.constants import SQLITE_DATABASE_HEADER_LENGTH
+from sqlite_dissect.constants import SQLITE_MASTER_SCHEMA_ROOT_PAGE
+from sqlite_dissect.constants import UTF_8
+from sqlite_dissect.constants import UTF_8_DATABASE_TEXT_ENCODING
+from sqlite_dissect.constants import UTF_16BE
+from sqlite_dissect.constants import UTF_16BE_DATABASE_TEXT_ENCODING
+from sqlite_dissect.constants import UTF_16LE
+from sqlite_dissect.constants import UTF_16LE_DATABASE_TEXT_ENCODING
+from sqlite_dissect.constants import WAL_FRAME_HEADER_LENGTH
+from sqlite_dissect.constants import WAL_HEADER_LENGTH
+from sqlite_dissect.exception import WalCommitRecordParsingError
+from sqlite_dissect.file.database.header import DatabaseHeader
+from sqlite_dissect.file.database.page import FreelistTrunkPage
+from sqlite_dissect.file.database.utilities import create_pointer_map_pages
+from sqlite_dissect.file.schema.master import MasterSchema
+from sqlite_dissect.file.version import Version
+from sqlite_dissect.file.wal.utilities import compare_database_headers
+from sqlite_dissect.constants import BASE_VERSION_NUMBER
+from sqlite_dissect.utilities import get_md5_hash
+
+"""
+
+commit_record.py
+
+This script holds the objects used for parsing the write ahead log commit records.
+
+This script holds the following object(s):
+WriteAheadLogCommitRecord(Version)
+
+"""
+
+
+class WriteAheadLogCommitRecord(Version):
+
+ """
+
+ This class extends the Version class and represents a version based on a commit record in the WAL file. The
+    database is not considered "committed" until a frame appears in the WAL file with its size of database in
+    pages field set, declaring it a commit frame.  The SQLite drivers do not read any information out past the
+    last commit frame (if there is any information).  Therefore we structure each set of frames up to and
+    including a commit frame as a commit record
+ version and parse it as such.
+
+ Due to the way only parts of the commit record are updated, only parts of the SQLite database will be parsed and
+ stored in this class. For instance, the database header and master schema will only be parsed if they are changed
+ from the previous version. Otherwise, the last database header and last master schema will be set with the previous
+ version's for reference. If the database header and/or master schema is modified, then the objects will be parsed.
+ Also, their respective modified flags will be set. This is to reduce memory and parsing time.
+
+ The idea here is that the database header or master schema should never be needed unless changes were done which
+ can be checked by their respective modified flags which are set in the version and set to true for the original
+ database.
+
+ However, in order to support the version class, functions have been put in place that will pull the master schema,
+ root page, and database header for this version if needed, on demand (unless the "store in memory flag" is set).
+
+    The freelist pages and pointer map pages are always parsed since the overhead to do so is minimal, and the
+    freelist pages need to be parsed in order to detect changes in those pages.
+
+ If the "store in memory" flag is set, the commit record will be fully parsed and stored in memory. This includes
+ the database header and master schema, regardless of changes, and all pages including b-tree pages. This flag is
+    defaulted to False here, rather than True as it is in the database class, due to the nature of how the
+    commit records are parsed versus the original database.
+
+ Note: The version number of the first commit record defined must start at 1. The previous version to the first
+ WAL commit record is 0 and will be the base SQLite database file.
+
+ Note: The following fields will be parsed on demand unless this commit record has specific updated pages with
+ regards to them (unless the "store in memory" flag is set):
+ 1.) self._database_header
+ 2.) self._root_page
+
+ Note: The root page may not be set if the database header is set since the root page refers to the master
+ schema and not the database header. However, the root page will always be set if the master schema
+ is set and vice-versa.
+
+ 3.) self._master_schema
+
+ """
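+    # For illustration only (hypothetical frames): a WAL file containing the frames
+    #     f1, f2, f3 (commit), f4, f5 (commit), f6
+    # yields commit record version 1 from {f1, f2, f3}, commit record version 2 from {f4, f5}, and f6 starts a
+    # trailing set of frames with no commit frame that is not considered committed.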
+
+ def __init__(self, version_number, database, write_ahead_log, frames, page_frame_index, page_version_index,
+ last_database_header, last_master_schema, store_in_memory=False, strict_format_checking=True):
+
+ super(WriteAheadLogCommitRecord, self).__init__(write_ahead_log.file_handle, version_number,
+ store_in_memory, strict_format_checking)
+
+ """
+
+ Note: The database is needed to refer to the file handle in order to read page data out of the database file
+ if the particular page being requested has not been updated in the WAL file frames yet.
+
+ Note: The write ahead log is needed only for the use case of setting the database text encoding if it was
+ not previously set by the database file (Due to a database file with "no content").
+
+ """
+
+ self._database = database
+
+ for page_version_number in page_version_index.itervalues():
+ if page_version_number >= version_number:
+ log_message = "Page version number: {} is greater than the commit record specified version: {}."
+ log_message = log_message.format(page_version_number, version_number)
+ self._logger.error(log_message)
+ raise WalCommitRecordParsingError(log_message)
+
+ max_version_number_in_page_version_index = max(page_version_index.values())
+ if self.version_number != max_version_number_in_page_version_index + 1:
+            log_message = "Version number: {} is not the next version number from the max version: {} in the " \
+                          "page version index: {}."
+ log_message = log_message.format(version_number, max_version_number_in_page_version_index,
+ page_version_index)
+ self._logger.error(log_message)
+ raise WalCommitRecordParsingError(log_message)
+
+ """
+
+ Below we declare a boolean value for committed which explains if this commit record was "committed" to the
+ database. There should be at most one commit record where committed would be false. As the frames are
+ parsed, if a commit frame is found, the committed flag is set to true. If there are multiple commit frames,
+ then an exception is thrown since this is not allowed.
+
+        Note: If there is more than one commit frame, then that use case needs to be checked outside of this class.
+
+        Note: The use case where there is a set of frames with no commit frame has not yet been seen, and
+              therefore the committed flag determines whether this commit record was committed to the WAL file
+              or not.  In the creating class (VersionHistory), a warning will be thrown if this use case is
+              detected since it has not been investigated and handled correctly.
+
+ The committed page size is determined from the commit frame in the frames and may be left as None if this is
+ the commit record at the end of the file (if it exists) that was not committed and does not have a commit frame.
+
+ The frames variable is a dictionary of page number to frame:
+ self.frames[FRAME.PAGE_NUMBER] = FRAME
+
+ """
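+        # For illustration only (hypothetical page numbers): a commit record touching pages 1, 12 and 40 ends up
+        # with self.frames = {1: <frame>, 12: <frame>, 40: <frame>}, where at most one of those frames is the
+        # commit frame that sets self.committed and self.committed_page_size.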
+
+ self.committed = False
+ self.committed_page_size = None
+ self.frames = {}
+
+ # Iterate through the frames
+ for frame in frames:
+
+ # Make sure the page number to the current frame doesn't already exist in the previous frames
+ if frame.header.page_number in self.frames:
+ log_message = "Frame page number: {} found already existing in frame page numbers: {} in version: {}."
+ log_message = log_message.format(frame.header.page_number, self.frames.keys(), self.version_number)
+ self._logger.error(log_message)
+ raise WalCommitRecordParsingError(log_message)
+
+ # Check if the frame is a commit frame
+ if frame.commit_frame:
+
+ # Make sure this commit frame hasn't already been committed
+ if self.committed:
+ log_message = "Frame page number: {} is a commit frame when commit record was already committed " \
+ "with frame page numbers: {} in version: {}."
+ log_message = log_message.format(frame.header.page_number, self.frames.keys(), self.version_number)
+ self._logger.error(log_message)
+ raise WalCommitRecordParsingError(log_message)
+
+ # Set the committed flag to true
+ self.committed = True
+
+ # Make sure the committed page size has not already been set and set it
+ if self.committed_page_size:
+ log_message = "Frame page number: {} has a committed page size of: {} when it was already set " \
+ "to: {} with frame page numbers: {} in version: {}."
+ log_message = log_message.format(frame.header.page_number, frame.header.page_size_after_commit,
+ self.committed_page_size, self.frames.keys(), self.version_number)
+ self._logger.error(log_message)
+ raise WalCommitRecordParsingError(log_message)
+
+ self.committed_page_size = frame.header.page_size_after_commit
+
+ # Add this frame to the frames dictionary
+ self.frames[frame.header.page_number] = frame
+
+        # Set the updated page numbers derived from this commit record's frame keys
+ self.updated_page_numbers = copy(self.frames.keys())
+
+ log_message = "Commit Record Version: {} has the updated page numbers: {}."
+ log_message = log_message.format(self.version_number, self.updated_page_numbers)
+ self._logger.debug(log_message)
+
+ """
+
+        Here we set up the updated b-tree page numbers.  This list is pruned as we parse through the file so
+        that, at the end, it contains only the b-tree pages of the commit record that were updated.
+
+ """
+
+ self.updated_b_tree_page_numbers = copy(self.updated_page_numbers)
+
+ self.page_frame_index = dict.copy(page_frame_index)
+ self.page_version_index = dict.copy(page_version_index)
+ for updated_page_number in self.updated_page_numbers:
+ self.page_version_index[updated_page_number] = self.version_number
+ self.page_frame_index[updated_page_number] = self.frames[updated_page_number].frame_number
+
+ self.database_size_in_pages = self.committed_page_size
+
+ """
+
+        Check to make sure the page version index length matches the database size in pages as it should.
+
+        Note: The database size in pages can be, and has been found to be, wrong in some cases where the database
+              size in pages is specified and the version valid for number equals the file change counter.  It is
+              still unclear why this can occur, but in the use cases where this was seen, the database size in
+              pages was correct and the file was inflated (padded) with empty space at the end, indicating
+              additional pages.  For this reason a warning is thrown instead of an exception (in the case that
+              the version valid for number equals the file change counter and the database size in pages is set).
+
+ This may involve the WAL file and checkpoints as the file referred to above had a checkpoint sequence
+ number that was not 0. More investigation is needed.
+
+ """
+
+ if len(self.page_version_index) != self.database_size_in_pages:
+ log_message = "The page version index of length: {} does not equal the database size in pages: {} " \
+ "in version: {} for page version index: {}. Possibly erroneous use cases may occur " \
+ "when parsing."
+ log_message = log_message.format(len(self.page_version_index), self.database_size_in_pages,
+ self.version_number, self.page_version_index)
+ self._logger.warn(log_message)
+ warn(log_message, RuntimeWarning)
+
+ """
+
+ Initialize the root page and master schema to none.
+
+ Note: These are only initialized if the SQLite master schema root page is in the updated pages and the root
+ b-tree (not including the header) is updated or the master schema is updated. If the root page is set
+ the master schema will always be set, and vice-versa.
+
+ """
+
+ self._root_page = None
+ self._master_schema = None
+
+ """
+
+ Here we check to see if the SQLite root page was updated or if any of the master schema pages were
+ updated since the previous version. This is done by keeping track of the master schema pages (which
+ will always include the root page SQLITE_MASTER_SCHEMA_ROOT_PAGE (1)) and checking if the new
+ commit record contains any of these pages in the frame array.
+
+        If the root page is in the frame array, that means one of the following:
+            a.) The database header was updated and the rest of the root page remained unchanged.
+            b.) Both the database header and the root page were changed.
+            c.) Neither the database header nor the root page was changed.
+
+ The most observed case is a.) since the schema itself does not seem to change often but rather the
+ freelist pages, database size in pages, and other fields found in the database header.
+
+ If any of the non-root master schema pages are in the frame array then the master schema was
+ updated. The master schema is assumed to be able to be updated without always updating the root
+ page. However, any change in the master schema should result in the schema cookie being updated
+ in the database header meaning that there should never be a case where the master schema is updated
+ without updating the database header.
+
+        First we will check to see if the root page is in this commit record's updated page numbers.  If it is,
+        then we will check the database header md5 against the last database header md5, and the root page only
+        md5 hex digest against the previous master schema root page's root page only md5 hex digest.
+
+        This tells us whether the database header changed and gives insight into whether the master schema
+        changed.  We will not know for certain whether the master schema changed until we check all master schema
+        pages against the updated pages in this commit record.  However, if we already determined that the master
+        schema changed, this last step is not needed.
+
+ """
+
+ if SQLITE_MASTER_SCHEMA_ROOT_PAGE in self.updated_page_numbers:
+
+ # Remove it from the updated b-tree pages
+ self.updated_b_tree_page_numbers.remove(SQLITE_MASTER_SCHEMA_ROOT_PAGE)
+
+ """
+
+ Note: There is a redundancy here in calculating these md5 hash values but the trade off is to
+ parse the objects when not needed versus calculating md5s of a small portion of that data.
+ Keep in mind this only occurs when the SQLite master schema root page is in the updated page numbers.
+
+ """
+
+ root_page_data = self.get_page_data(SQLITE_MASTER_SCHEMA_ROOT_PAGE)
+ database_header_md5_hex_digest = get_md5_hash(root_page_data[:SQLITE_DATABASE_HEADER_LENGTH])
+ root_page_only_md5_hex_digest = get_md5_hash(root_page_data[SQLITE_DATABASE_HEADER_LENGTH:])
+
+ if last_database_header.md5_hex_digest != database_header_md5_hex_digest:
+ self.database_header_modified = True
+ self._database_header = DatabaseHeader(root_page_data[:SQLITE_DATABASE_HEADER_LENGTH])
+
+ if self._database_header.md5_hex_digest != database_header_md5_hex_digest:
+ log_message = "The database header md5 hex digest: {} did not match the previously retrieved " \
+ "calculated database header md5 hex digest: {} in commit record version: {} " \
+ "on updated pages: {}."
+ log_message = log_message.format(self._database_header.md5_hex_digest,
+ database_header_md5_hex_digest, self.version_number,
+ self.updated_page_numbers)
+ self._logger.error(log_message)
+ raise WalCommitRecordParsingError(log_message)
+
+ """
+
+ Note: The root b-tree page modified flag may be False where the master schema modified flag may be True
+ depending on if the pages in the master schema updated included the SQLite master schema root
+ page (1) or not.
+
+ """
+
+ if last_master_schema.root_page.header.root_page_only_md5_hex_digest != root_page_only_md5_hex_digest:
+ self.root_b_tree_page_modified = True
+ self.master_schema_modified = True
+
+ """
+
+            The root page may be in the updated page numbers of the WAL commit record even if neither the
+            database header nor the root page itself (i.e. the page in general) was modified.  It is not clear
+            why this occurs and more research needs to be done into the exact reasoning.  One theory is that if
+            pointer map pages are updated, then the root page is automatically included.  This could be a flag in
+            the SQLite source code that marks the root page as modified, for instance if the largest b-tree root
+            page number is updated, but updated to the same number.  For this reason, we throw a warning below.
+
+ """
+
+ if not self.database_header_modified and not self.root_b_tree_page_modified:
+ log_message = "The sqlite database root page was found in version: {} in the updated pages: {} when " \
+ "both the database header and the root b-tree page were not modified."
+ log_message = log_message.format(self.version_number, self.updated_page_numbers)
+ self._logger.warn(log_message)
+ warn(log_message, RuntimeWarning)
+
+ if not self.master_schema_modified:
+
+ for last_master_schema_page_number in last_master_schema.master_schema_page_numbers:
+
+ """
+
+                    Since we have already handled the SQLite master schema root page and are checking for master
+                    schema modifications on the other pages, finding at least one updated page here satisfies our
+                    use case and we can break.
+
+                    Note: We could argue that we should parse the master schema again to make sure it did not
+                          change, but we can achieve the same by checking whether any of the previous master
+                          schema pages were updated (as they would have to be if any change was made), without
+                          the extra overhead of parsing the master schema.
+
+ """
+
+ if last_master_schema_page_number != SQLITE_MASTER_SCHEMA_ROOT_PAGE:
+ if last_master_schema_page_number in self.updated_page_numbers:
+ self.master_schema_modified = True
+ break
+
+ if not self.database_header_modified and self.master_schema_modified:
+ log_message = "The database header was not modified when the master schema was modified in version: {}."
+ log_message = log_message.format(self.version_number)
+ self._logger.error(log_message)
+ raise WalCommitRecordParsingError(log_message)
+
+ """
+
+ The database header differences will be a dictionary with the key being within the
+ DATABASE_HEADER_VERSIONED_FIELDS Enum constant variables and value will be a tuple where
+ the first element will be the value that field held previously and the second element will
+ be the new value of that field.
+
+ """
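+        # For illustration only (hypothetical values): a differences dictionary might look like
+        #     {DATABASE_HEADER_VERSIONED_FIELDS.SCHEMA_COOKIE: (4, 5),
+        #      DATABASE_HEADER_VERSIONED_FIELDS.DATABASE_SIZE_IN_PAGES: (12, 14)}
+        # where each tuple is (previous value, new value).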
+
+ if self.database_header_modified:
+
+ if not self._database_header:
+ log_message = "The database header does not exist when the database header was modified in commit " \
+ "record version: {} on updated pages: {}."
+ log_message = log_message.format(self.version_number, self.updated_page_numbers)
+ self._logger.error(log_message)
+ raise WalCommitRecordParsingError(log_message)
+
+ self.database_header_differences = compare_database_headers(last_database_header, self._database_header)
+
+ log_message = "Database header was modified in version: {} with differences: {}."
+ log_message = log_message.format(self.version_number, self.database_header_differences)
+ self._logger.info(log_message)
+
+ else:
+
+ self.database_header_differences = {}
+
+ """
+
+        Note: Below we do not need to worry about the database size in pages being 0 since this is a write ahead
+              log file being parsed, which requires SQLite version >= 3.7.0.  There may still be a use case where
+              the page count is wrong if the file was previously opened with a SQLite version < 3.7.0 and has not
+              been updated yet, but this use case may not occur and has still yet to be seen.  For now, an
+              exception is raised.
+
+        Note: Below a warning is thrown instead of an exception because the committed page size has been found
+              to be wrong in some cases where the database size in pages is specified and the version valid for
+              number equals the file change counter.  It is still unclear why this can occur, but in the use
+              cases where this was seen, the committed page size was correct and the file was inflated (padded)
+              with empty space at the end, indicating additional pages when calculating the page count from the
+              file size.  The database class has additional documentation on this occurring and allows it since
+              it has not been determined why exactly this occurs.
+
+ """
+
+ # Make sure the database size in pages remained the same as the committed page size
+ if self.committed_page_size != last_database_header.database_size_in_pages:
+
+ log_message = "Database header for version: {} specifies a database size in pages of {} but the " \
+ "committed page size is {}. Possibly erroneous use cases may occur when parsing."
+ log_message = log_message.format(self.version_number, last_database_header.database_size_in_pages,
+ self.committed_page_size)
+ self._logger.warn(log_message)
+ warn(log_message, RuntimeWarning)
+
+ if self.master_schema_modified:
+
+ log_message = "Master schema was modified in version: {}."
+ log_message = log_message.format(self.version_number)
+ self._logger.info(log_message)
+
+ """
+
+ Below are fields that are set in the case that the database header is modified.
+
+ These variables are set by the parse database header differences private function. If the value is
+ not a boolean, then it will only be set if it was updated in the header.
+
+        Note: Even though the modified number of freelist pages may not be set, that does not mean there have
+              been no updates to the freelist pages.  The same applies to the modified first freelist trunk page
+              number, and to both fields together.
+
+ Note: Pointer map pages may still be updated even if the modified largest root b-tree page number was not
+ modified. (Assuming it was not 0 and auto-vacuuming is turned on.)
+
+ Note: If the database text encoding was not previously set in the versions, it will be set here.
+
+ """
+
+ self.file_change_counter_incremented = False
+ self.version_valid_for_number_incremented = False
+ self.database_size_in_pages_modified = False
+ self.modified_first_freelist_trunk_page_number = None
+ self.modified_number_of_freelist_pages = None
+ self.modified_largest_root_b_tree_page_number = None
+ self.schema_cookie_modified = False
+ self.schema_format_number_modified = False
+ self.database_text_encoding_modified = False
+ self.user_version_modified = False
+
+ """
+
+ Call the _parse_database_header_differences method to setup the above variables and check header use cases.
+
+ """
+
+ self._parse_database_header_differences()
+
+ """
+
+ Create the root page and master schema if the master schema was detected to be modified. Also, remove all
+ master schema page numbers from the updated b-tree pages.
+
+ """
+
+ if self.master_schema_modified:
+
+ self._root_page = self.get_b_tree_root_page(SQLITE_MASTER_SCHEMA_ROOT_PAGE)
+
+ self._master_schema = MasterSchema(self, self._root_page)
+
+ # Remove the master schema page numbers from the updated b-tree pages
+ for master_schema_page_number in self._master_schema.master_schema_page_numbers:
+ if master_schema_page_number in self.updated_b_tree_page_numbers:
+ self.updated_b_tree_page_numbers.remove(master_schema_page_number)
+
+ """
+
+        Since we do not know whether the freelist pages could have been updated or not, we always set them here.
+        We also set the pointer map pages if the largest root b-tree page number is specified.
+
+ Note: If there are no freelist pages, the first freelist trunk page will be None and there will be an empty
+ array for the freelist page numbers.
+
+ Note: We could check and only set the pointer map pages if they were updated but it was decided to do that
+ regardless in order to fit the object structure of the version and database better and due to the low
+ overhead of doing this.
+
+ """
+
+ first_freelist_trunk_page_number = last_database_header.first_freelist_trunk_page_number
+ if self._database_header:
+ first_freelist_trunk_page_number = self._database_header.first_freelist_trunk_page_number
+
+ if first_freelist_trunk_page_number:
+ self.first_freelist_trunk_page = FreelistTrunkPage(self, first_freelist_trunk_page_number,
+ FIRST_FREELIST_TRUNK_PARENT_PAGE_NUMBER,
+ FIRST_FREELIST_TRUNK_PAGE_INDEX)
+
+ self.freelist_page_numbers = []
+ observed_freelist_pages = 0
+ freelist_trunk_page = self.first_freelist_trunk_page
+ while freelist_trunk_page:
+ self.freelist_page_numbers.append(freelist_trunk_page.number)
+ observed_freelist_pages += 1
+ for freelist_leaf_page in freelist_trunk_page.freelist_leaf_pages:
+ self.freelist_page_numbers.append(freelist_leaf_page.number)
+ observed_freelist_pages += 1
+ freelist_trunk_page = freelist_trunk_page.next_freelist_trunk_page
+
+ number_of_freelist_pages = last_database_header.number_of_freelist_pages
+ if self._database_header:
+ number_of_freelist_pages = self._database_header.number_of_freelist_pages
+
+ if observed_freelist_pages != number_of_freelist_pages:
+ log_message = "The number of observed freelist pages: {} does not match the number of freelist pages " \
+ "specified in the header: {} for version: {}."
+ log_message = log_message.format(observed_freelist_pages, number_of_freelist_pages, self.version_number)
+ self._logger.error(log_message)
+ raise WalCommitRecordParsingError(log_message)
+
+ for freelist_page_number in self.freelist_page_numbers:
+ if freelist_page_number in self.updated_page_numbers:
+ self.freelist_pages_modified = True
+
+ # Remove the freelist page numbers from the updated b-tree pages
+ if freelist_page_number in self.updated_b_tree_page_numbers:
+ self.updated_b_tree_page_numbers.remove(freelist_page_number)
+
+ """
+
+ Create the pointer map pages.
+
+ Note: If there are no pointer map pages, both the pointer map pages and pointer map page numbers will be an
+ empty array.
+
+ """
+
+ largest_root_b_tree_page_number = last_database_header.largest_root_b_tree_page_number
+ if self._database_header:
+ largest_root_b_tree_page_number = self._database_header.largest_root_b_tree_page_number
+
+ if largest_root_b_tree_page_number:
+ self.pointer_map_pages = create_pointer_map_pages(self, self.database_size_in_pages, self.page_size)
+ else:
+ self.pointer_map_pages = []
+
+ self.pointer_map_page_numbers = []
+ for pointer_map_page in self.pointer_map_pages:
+ self.pointer_map_page_numbers.append(pointer_map_page.number)
+
+ for pointer_map_page_number in self.pointer_map_page_numbers:
+ if pointer_map_page_number in self.updated_page_numbers:
+ self.pointer_map_pages_modified = True
+
+ # Remove the pointer map page numbers from the updated b-tree pages
+ if pointer_map_page_number in self.updated_b_tree_page_numbers:
+ self.updated_b_tree_page_numbers.remove(pointer_map_page_number)
+
+ """
+
+ Note: At this point the updated_b_tree_page_numbers has all of the page numbers that refer to updated b-trees
+ in this commit record with all master schema, freelist, and pointer map pages filtered out.
+
+ """
+
+ """
+
+ The last database header and last master schema are set if no database header or master schema was parsed from
+ this commit record for reference.
+
+ """
+
+ self.last_database_header = None
+ if not self.database_header_modified:
+ self.last_database_header = last_database_header
+
+ self.last_master_schema = None
+ if not self.master_schema_modified:
+ self.last_master_schema = last_master_schema
+
+ """
+
+ If the version information is being stored in memory, parse out the database header, root page, and master
+ schema (if it was already not parsed out) and pages and store them as a private variable.
+
+ """
+
+ if self.store_in_memory:
+
+ if not self._database_header:
+ root_page_data = self.get_page_data(SQLITE_MASTER_SCHEMA_ROOT_PAGE)
+ self._database_header = DatabaseHeader(root_page_data[:SQLITE_DATABASE_HEADER_LENGTH])
+
+ if not self._root_page:
+ self._root_page = self.get_b_tree_root_page(SQLITE_MASTER_SCHEMA_ROOT_PAGE)
+
+ if not self._master_schema:
+ self._master_schema = MasterSchema(self, self._root_page)
+
+ self._pages = self.pages
+
+ log_message = "Commit record: {} on page numbers: {} successfully created."
+ log_message = log_message.format(self.version_number, self.updated_page_numbers)
+ self._logger.info(log_message)
+
+ def stringify(self, padding="", print_pages=True, print_schema=True, print_frames=True):
+
+ # Create the initial string
+ string = "\n" \
+ + padding + "Committed: {}\n" \
+ + padding + "Committed Page Size: {}\n" \
+ + padding + "Frames Length: {}\n" \
+ + padding + "Page Frame Index: {}\n" \
+ + padding + "File Change Counter Incremented: {}\n" \
+ + padding + "Version Valid for Number Incremented: {}\n" \
+ + padding + "Database Size in Pages Modified: {}\n" \
+ + padding + "Modified First Freelist Trunk Page Number: {}\n" \
+ + padding + "Modified Number of Freelist Pages: {}\n" \
+ + padding + "Modified Largest Root B-Tree Page Number: {}\n" \
+ + padding + "Schema Cookie Modified: {}\n" \
+ + padding + "Schema Format Number Modified: {}\n" \
+ + padding + "Database Text Encoding Modified: {}\n" \
+ + padding + "User Version Modified: {}"
+
+ # Format the string
+ string = string.format(self.committed,
+ self.committed_page_size,
+ self.frames_length,
+ self.page_frame_index,
+ self.file_change_counter_incremented,
+ self.version_valid_for_number_incremented,
+ self.database_size_in_pages_modified,
+ self.modified_first_freelist_trunk_page_number,
+ self.modified_number_of_freelist_pages,
+ self.modified_largest_root_b_tree_page_number,
+ self.schema_cookie_modified,
+ self.schema_format_number_modified,
+ self.database_text_encoding_modified,
+ self.user_version_modified)
+
+ # Add the database header differences
+ string += "\n" + padding + "Database Header Differences:"
+
+ # Parse the database header differences
+ for field, difference in self.database_header_differences.iteritems():
+ difference_string = "\n" + padding + "\t" + "Field: {} changed from previous Value: {} to new Value: {}"
+ string += difference_string.format(field, difference[0], difference[1])
+
+ # Print the frames if specified
+ if print_frames:
+ for page_number in self.frames:
+ string += "\n" + padding + "Frame:\n{}".format(self.frames[page_number].stringify(padding + "\t"))
+
+ # Get the super stringify information and concatenate it with this string and return it
+ return super(WriteAheadLogCommitRecord, self).stringify(padding, print_pages, print_schema) + string
+
+ @property
+ def frames_length(self):
+ return len(self.frames)
+
+ def get_page_data(self, page_number, offset=0, number_of_bytes=None):
+
+ page_version = self.page_version_index[page_number]
+
+ if page_version == BASE_VERSION_NUMBER:
+
+ return self._database.get_page_data(page_number, offset, number_of_bytes)
+
+ else:
+
+ # Set the number of bytes to the rest of the page if it was not set
+ number_of_bytes = self.page_size - offset if not number_of_bytes else number_of_bytes
+
+ if offset >= self.page_size:
+ log_message = "Requested offset: {} is >= the page size: {} for page: {}."
+ log_message = log_message.format(offset, self.page_size, page_number)
+ self._logger.error(log_message)
+ raise ValueError(log_message)
+
+ if offset + number_of_bytes > self.page_size:
+ log_message = "Requested length of data: {} at offset {} to {} is > than the page size: {} " \
+ "for page: {}."
+ log_message = log_message.format(number_of_bytes, offset, number_of_bytes + offset,
+ self.page_size, page_number)
+ self._logger.error(log_message)
+ raise ValueError(log_message)
+
+ page_offset = self.get_page_offset(page_number)
+
+ return self.file_handle.read_data(page_offset + offset, number_of_bytes)
+
+ def get_page_offset(self, page_number):
+
+ """
+
+        Returns the offset at which the data for the given page number begins for this commit record, either
+        within the WAL file or within the original database file.
+
+        Note: This method will return the correct page offset depending on where the page last showed up in
+              relation to this commit record.  Therefore the page offset may be very close to the beginning of
+              the WAL file even when this commit record's frames are near the end of the WAL file.  This could
+              also return an offset in the database file if the WAL file does not have the page updated in its
+              frames yet.
+
+ This is presumed safe since the get_page_data takes in a page number and unless people are using the
+ read method directly from the file handles, this function is more for informative purposes. If someone
+ was reading directly from the file handles, it is assumed they would know the inner workings of this
+ library.
+
+ :param page_number:
+
+ :return:
+
+ """
+
+ if page_number < 1 or page_number > self.database_size_in_pages:
+ log_message = "Invalid page number: {} for version: {} with database size in pages: {}."
+ log_message = log_message.format(page_number, self.version_number, self.database_size_in_pages)
+ self._logger.error(log_message)
+ raise ValueError(log_message)
+
+ page_version = self.page_version_index[page_number]
+
+ if page_version == BASE_VERSION_NUMBER:
+
+ return (page_number - 1) * self.page_size
+
+ else:
+
+ if page_version == self.version_number:
+
+ if page_number not in self.frames:
+                    log_message = "Page number: {} has this commit record's version but is not in the frame " \
+                                  "pages: {}."
+ log_message = log_message.format(page_number, self.frames.keys())
+ self._logger.error(log_message)
+ raise WalCommitRecordParsingError(log_message)
+
+ if page_number not in self.page_frame_index:
+ log_message = "Page number: {} with version: {} is not in the page frame index: {}."
+ log_message = log_message.format(page_number, page_version, self.page_frame_index)
+ self._logger.error(log_message)
+ raise KeyError(log_message)
+
+ frame_number = self.page_frame_index[page_number]
+
+ """
+
+            The WAL file is structured with a file header followed by a series of frames, each consisting of a
+            frame header and a page.  The offset of this page is therefore the WAL header length, plus one frame
+            header length for every frame up to and including this one, plus the page size multiplied by the
+            number of preceding frames (the frame number minus one).
+
+ """
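+            # Worked example (hypothetical values): with the standard 32-byte WAL header, 24-byte frame headers
+            # and a 4096-byte page size, the page carried in frame number 3 starts at
+            #     32 + 24 * 3 + 4096 * (3 - 1) = 8296
+            # bytes into the WAL file.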
+
+ # Return where the offset of the page to this commit record in the WAL file would start at
+ return WAL_HEADER_LENGTH + WAL_FRAME_HEADER_LENGTH * frame_number + self.page_size * (frame_number - 1)
+
+ def _parse_database_header_differences(self):
+
+ """
+
+        This private function checks for differences between this commit record's database header and the last
+        database header and sets the corresponding flags and fields on this commit record.
+
+ Note: The database header differences will be a dictionary keyed by the DATABASE_HEADER_VERSIONED_FIELDS
+ which will refer to a tuple where the first value will be the previous database header value and the
+ second value will be the new database header value.
+
+ :param self:
+
+ :raise:
+
+ """
+
+ # Make sure there are database header differences
+ if not self.database_header_differences:
+
+ # There are no differences so return
+ return
+
+ # Make a copy of the database header differences to work with
+ database_header_differences = dict.copy(self.database_header_differences)
+
+ """
+
+        This shows that the database headers are different and therefore at least one of the database header
+        fields has been updated.  There is only a specific set of database header fields we expect to change here.
+ These are found in the DATABASE_HEADER_VERSIONED_FIELDS constant as the following properties of
+ the database header class:
+ 1.) MD5_HEX_DIGEST: md5_hex_digest
+ 2.) FILE_CHANGE_COUNTER: file_change_counter
+ 3.) VERSION_VALID_FOR_NUMBER: version_valid_for_number
+ 4.) DATABASE_SIZE_IN_PAGES: database_size_in_pages
+ 5.) FIRST_FREELIST_TRUNK_PAGE_NUMBER: first_freelist_trunk_page_number
+ 6.) NUMBER_OF_FREE_LIST_PAGES: number_of_freelist_pages
+ 7.) LARGEST_ROOT_B_TREE_PAGE_NUMBER: largest_root_b_tree_page_number
+ 8.) SCHEMA_COOKIE: schema_cookie
+ 9.) SCHEMA_FORMAT_NUMBER: schema_format_number
+ 10.) DATABASE_TEXT_ENCODING: database_text_encoding
+ 11.) USER_VERSION: user_version
+
+ In order to check these fields we first compare the two headers to get back a dictionary keyed by
+ the property name (above in capitals) with a tuple value where the first element is the previous
+ database header value and the second element is the modified database header value. The property will
+ only exist in the dictionary if the values between the two headers are different. If additional
+ fields not defined above are found to be different, an exception is thrown in order to alert us
+ to the "assumed not to happen" use case.
+
+ Note: The MD5_HEX_DIGEST: md5_hex_digest is a field of the database header class but not a field in the
+ actual database header itself.
+
+ """
+
+ """
+
+ 1.) MD5_HEX_DIGEST: md5_hex_digest:
+ This will be different between both database headers since it was checked in order to enter this
+ area of code. However, this is still a property of the database header class and therefore needs to
+ be accounted for. If the md5 hex digests are not different (are not in the returned database
+ header differences dictionary), then a very weird use case has shown up.
+
+ """
+
+ if DATABASE_HEADER_VERSIONED_FIELDS.MD5_HEX_DIGEST not in database_header_differences:
+ log_message = "The database header md5 hex digests are not different in the database headers " \
+ "for version: {}."
+ log_message = log_message.format(self.version_number)
+ self._logger.error(log_message)
+ raise WalCommitRecordParsingError(log_message)
+
+ # Delete the entry from the dictionary
+ del database_header_differences[DATABASE_HEADER_VERSIONED_FIELDS.MD5_HEX_DIGEST]
+
+ """
+
+ The next two fields we will check together are:
+ 2.) FILE_CHANGE_COUNTER: file_change_counter
+ 3.) VERSION_VALID_FOR_NUMBER: version_valid_for_number
+
+ These fields are changed whenever the database file is unlocked after having been modified. However,
+ since this is parsed in a commit record, WAL mode will be in use. In WAL mode, changes to the database
+ are instead detected using the wal-index (shm) file so this change counter is not needed. Therefore,
+ the change counter may not be incremented on each transaction.
+
+ Previously, an assumption was made that these fields were incremented only when a checkpoint occurred in
+ a WAL file. However, these fields were found incremented in commit records of the WAL file outside of
+        checkpoints occurring.  It is still not clear exactly what does or does not cause these fields to
+        increment in the WAL commit record itself.
+
+ If either one of these fields is incremented, then the other field must also be incremented and both
+        must be equal.  If one has been modified and the other has not, an exception will be thrown.
+
+ """
+
+ # Check that the file change counter was not modified without the version valid for number
+ if DATABASE_HEADER_VERSIONED_FIELDS.FILE_CHANGE_COUNTER in database_header_differences \
+ and DATABASE_HEADER_VERSIONED_FIELDS.VERSION_VALID_FOR_NUMBER not in database_header_differences:
+ log_message = "The database header file change counter: {} was found in the database header " \
+ "differences but the version valid for number was not for version: {}."
+ log_message = log_message.format(database_header_differences[
+ DATABASE_HEADER_VERSIONED_FIELDS.FILE_CHANGE_COUNTER],
+ self.version_number)
+ self._logger.error(log_message)
+ raise WalCommitRecordParsingError(log_message)
+
+ # Check that the version valid for number was not modified without the file change counter
+ elif DATABASE_HEADER_VERSIONED_FIELDS.VERSION_VALID_FOR_NUMBER in database_header_differences \
+ and DATABASE_HEADER_VERSIONED_FIELDS.FILE_CHANGE_COUNTER not in database_header_differences:
+ log_message = "The database header version valid for number: {} was found in the database header " \
+ "differences but the file change counter was not for version: {}."
+ log_message = log_message.format(database_header_differences[
+ DATABASE_HEADER_VERSIONED_FIELDS.VERSION_VALID_FOR_NUMBER],
+ self.version_number)
+ self._logger.error(log_message)
+ raise WalCommitRecordParsingError(log_message)
+
+        # Check if both the file change counter and version valid for number were modified
+ elif DATABASE_HEADER_VERSIONED_FIELDS.FILE_CHANGE_COUNTER in database_header_differences \
+ and DATABASE_HEADER_VERSIONED_FIELDS.VERSION_VALID_FOR_NUMBER in database_header_differences:
+
+ """
+
+            Note: We check that both fields are incremented by exactly one from their values in the previous
+                  version.  If they are not, an exception is thrown.  This may be incorrect and the values may
+                  be able to increment by more than one, but more investigation is needed on this.
+
+ """
+
+ # Get the file change counter difference
+ file_change_counter_difference = database_header_differences[
+ DATABASE_HEADER_VERSIONED_FIELDS.FILE_CHANGE_COUNTER]
+
+            # Check the file change counter difference against its previous value as stated above
+ if file_change_counter_difference[0] + 1 != file_change_counter_difference[1]:
+ log_message = "The previous database header file change counter: {} is more than one off from the " \
+ "new database header file change counter: {} for version: {}."
+ log_message = log_message.format(file_change_counter_difference[0], file_change_counter_difference[1],
+ self.version_number)
+ self._logger.error(log_message)
+ raise WalCommitRecordParsingError(log_message)
+
+ # Get the version valid for number difference
+ version_valid_for_number_difference = database_header_differences[
+ DATABASE_HEADER_VERSIONED_FIELDS.VERSION_VALID_FOR_NUMBER]
+
+            # Check the version valid for number difference against its previous value as stated above
+ if version_valid_for_number_difference[0] + 1 != version_valid_for_number_difference[1]:
+ log_message = "The previous database header version valid for number: {} is more than one off from " \
+ "the new database header version valid for number: {} for version: {}."
+ log_message = log_message.format(version_valid_for_number_difference[0],
+ version_valid_for_number_difference[1], self.version_number)
+ self._logger.error(log_message)
+ raise WalCommitRecordParsingError(log_message)
+
+ # Update the class variables to signify these fields were incremented
+ self.file_change_counter_incremented = True
+ self.version_valid_for_number_incremented = True
+
+ # Delete the entries from the dictionary
+ del database_header_differences[DATABASE_HEADER_VERSIONED_FIELDS.FILE_CHANGE_COUNTER]
+ del database_header_differences[DATABASE_HEADER_VERSIONED_FIELDS.VERSION_VALID_FOR_NUMBER]
+
+ """
+
+        4.) DATABASE_SIZE_IN_PAGES: database_size_in_pages:
+
+        Here we check if the database size in pages was updated from its previous size. If it was, we check it
+        against the committed page size for the commit record.
+
+        Note: We check that the committed page size obtained from the "size of the database file in pages" field
+              in the commit record frame is equal to the database size in pages. These should always be equal
+              unless the use case checked above occurs, where the version valid for number does not match the
+              file change counter, but that case raises an exception before the code reaches this point. This is
+              additionally checked since, if the database size in pages field did not change, the committed page
+              size should equal the value in the previous version's database header.
+
+ """
+
+ if DATABASE_HEADER_VERSIONED_FIELDS.DATABASE_SIZE_IN_PAGES in database_header_differences:
+
+ # Get the database size in pages difference
+ database_size_in_pages_difference = database_header_differences[
+ DATABASE_HEADER_VERSIONED_FIELDS.DATABASE_SIZE_IN_PAGES]
+
+ # The committed page size is checked here but should also be checked at the end of this process
+ if self.committed_page_size != database_size_in_pages_difference[1]:
+                log_message = "The committed page size: {} of commit record version: {} does not match the " \
+                              "database header size in pages: {} changed from {} on updated pages: {}."
+ log_message = log_message.format(self.committed_page_size, self.version_number,
+ database_size_in_pages_difference[1],
+ database_size_in_pages_difference[0],
+ self.updated_page_numbers)
+ self._logger.error(log_message)
+ raise WalCommitRecordParsingError(log_message)
+
+ # Set the database size in pages modified flag
+ self.database_size_in_pages_modified = True
+
+ # Delete the entry from the dictionary
+ del database_header_differences[DATABASE_HEADER_VERSIONED_FIELDS.DATABASE_SIZE_IN_PAGES]
+
+ """
+
+        The next two fields we are going to pay attention to relate to freelist pages:
+        5.) FIRST_FREELIST_TRUNK_PAGE_NUMBER: first_freelist_trunk_page_number
+        6.) NUMBER_OF_FREELIST_PAGES: number_of_freelist_pages
+
+        If either of these two fields is different, it signifies that the freelist pages in the database were
+        changed. If there were no freelist pages previously, then both of these values should be 0 and not
+        included in the database header differences dictionary after comparison.
+
+        Additional use cases:
+
+        1.) The first freelist trunk page number could be 0, as well as the number of freelist pages, whereas
+            previously at least one freelist trunk page existed. This is checked by making sure every previous
+            freelist page is either accounted for in this version's freelist page set or, if not, appears in the
+            pages of this commit record as another page type. If not, an exception is thrown.
+
+        2.) The freelist pages could be updated without changing the number of freelist pages and/or the first
+            freelist trunk page number, which additionally needs to be checked. This means freelist pages can
+            change without any update to the database header itself.
+
+        3.) If the database size in pages changed, then the freelist pages could be out of range if the modified
+            size is less than the previous size. However, this use case applies to all other page types as well
+            and will be checked when the database size is checked against all of the page numbers in the
+            database/WAL commit record, so it does not need to be handled here.
+
+ """
+
+ if DATABASE_HEADER_VERSIONED_FIELDS.FIRST_FREELIST_TRUNK_PAGE_NUMBER in database_header_differences:
+ value = database_header_differences[DATABASE_HEADER_VERSIONED_FIELDS.FIRST_FREELIST_TRUNK_PAGE_NUMBER]
+ self.modified_first_freelist_trunk_page_number = value[1]
+
+ # Delete the entry from the dictionary
+ del database_header_differences[DATABASE_HEADER_VERSIONED_FIELDS.FIRST_FREELIST_TRUNK_PAGE_NUMBER]
+
+ if DATABASE_HEADER_VERSIONED_FIELDS.NUMBER_OF_FREE_LIST_PAGES in database_header_differences:
+ value = database_header_differences[DATABASE_HEADER_VERSIONED_FIELDS.NUMBER_OF_FREE_LIST_PAGES]
+ self.modified_number_of_freelist_pages = value[1]
+
+ # Delete the entry from the dictionary
+ del database_header_differences[DATABASE_HEADER_VERSIONED_FIELDS.NUMBER_OF_FREE_LIST_PAGES]
+
+ """
+
+ 7.) LARGEST_ROOT_B_TREE_PAGE_NUMBER: largest_root_b_tree_page_number
+        The next thing to check in the header is the largest root b-tree page number. We will check further
+        down if pointer map pages are being used by seeing if this field is set to a non-zero value. Here
+        we are going to see if it changed. If it did change, the only cases of concern are transitions between
+        zero and a non-zero value. According to the SQLite documentation, the auto-vacuuming mode has to be set
+        (enabled) before any tables are created in the schema. Once a table has been created, it cannot be
+        turned off. However, the mode can be changed between full (1) and incremental (2).
+
+ """
+
+ if DATABASE_HEADER_VERSIONED_FIELDS.LARGEST_ROOT_B_TREE_PAGE_NUMBER in database_header_differences:
+ change = database_header_differences[
+ DATABASE_HEADER_VERSIONED_FIELDS.LARGEST_ROOT_B_TREE_PAGE_NUMBER]
+ previous_largest_root_b_tree_page_number = change[0]
+ new_largest_root_b_tree_page_number = change[1]
+
+ # Check if auto-vacuuming was turned off
+ if previous_largest_root_b_tree_page_number and not new_largest_root_b_tree_page_number:
+ log_message = "The previous largest root b-tree page number: {} existed where the new one does not " \
+ "meaning that auto-vacuuming was turned off which cannot occur in version: {} on " \
+ "updated pages: {}."
+ log_message = log_message.format(previous_largest_root_b_tree_page_number, self.version_number,
+ self.updated_page_numbers)
+ self._logger.error(log_message)
+ raise WalCommitRecordParsingError(log_message)
+
+ # Check if auto-vacuuming was turned on
+ elif not previous_largest_root_b_tree_page_number and new_largest_root_b_tree_page_number:
+ log_message = "The previous largest root b-tree page number did not exist where the new one is: {} " \
+ "meaning that auto-vacuuming was turned on which cannot occur in version: {} on " \
+ "updated pages: {}."
+ log_message = log_message.format(previous_largest_root_b_tree_page_number, self.version_number,
+ self.updated_page_numbers)
+ self._logger.error(log_message)
+ raise WalCommitRecordParsingError(log_message)
+
+ """
+
+ Note: Since an exception is being thrown here, we do not delete the entry from the dictionary.
+
+ """
+
+ """
+
+ At this point we know that auto-vacuuming was on and has remained on and only the largest root
+ b tree page number changed. We had five use cases to be concerned about here:
+ 1.) Auto-Vacuuming was on initially and then turned off:
+ This use case was handled above and an exception is currently thrown.
+ 2.) Auto-Vacuuming was off initially and then turned on:
+ This use case was handled above and an exception is currently thrown.
+ 3.) Auto-Vacuuming was never on:
+ In this case there would be a zero in both headers meaning there would not be a change
+ from the previous version and this portion of the code would not be executing.
+ 4.) Auto-Vacuuming was turned on and the largest root b tree page number did not change:
+ In this case both headers would have the same non-zero value meaning there would not be a change
+ from the previous version and this portion of the code would not be executing.
+ 5.) Auto-Vacuuming was turned on and the largest root b tree page number changed:
+                Here we only need to remove the change from the database header differences so it does not cause
+                an exception later on. Other areas of the code
+ will use the modified largest root b-tree page number to handle pointer map pages.
+
+ """
+
+ # Set the modified largest root b-tree page number
+ self.modified_largest_root_b_tree_page_number = new_largest_root_b_tree_page_number
+
+ # Delete the entry from the dictionary
+ del database_header_differences[DATABASE_HEADER_VERSIONED_FIELDS.LARGEST_ROOT_B_TREE_PAGE_NUMBER]
+
+ """
+
+        8.) SCHEMA_COOKIE: schema_cookie
+        Next we check the schema cookie. This field is incremented if a change to the database schema
+        occurs. This means that at least one of the master schema pages had to change and be in this
+        version's pages. This could be the root page or any of its b-tree pages (if any). Keep in mind
+        that the schema cookie being incremented does not mean the root page b-tree content has to change;
+        rather, a leaf page of the root page's b-tree could change. Later on in this process, the schema cookie
+        will be checked against the master schema pages to make sure at least one of the pages was in this
+        version, otherwise an exception is thrown since this is not expected.
+
+        Note: If the schema cookie is updated, then the master schema must have been updated, so this is checked
+              as well.
+
+ """
+
+ if DATABASE_HEADER_VERSIONED_FIELDS.SCHEMA_COOKIE in database_header_differences:
+
+ # Get the schema cookie difference
+ schema_cookie_difference = database_header_differences[
+ DATABASE_HEADER_VERSIONED_FIELDS.SCHEMA_COOKIE]
+
+            # Check the schema cookie difference against its previous value to make sure it is not less
+ if schema_cookie_difference[0] > schema_cookie_difference[1]:
+ log_message = "The schema cookie was modified but the previous value: {} is greater than the new " \
+ "value: {} which cannot occur in version: {} on updated pages: {}."
+ log_message = log_message.format(schema_cookie_difference[0], schema_cookie_difference[1],
+ self.version_number, self.updated_page_numbers)
+ self._logger.error(log_message)
+ raise WalCommitRecordParsingError(log_message)
+
+            # Update the schema cookie modified variable to signify this field was modified
+ self.schema_cookie_modified = True
+
+ if not self.master_schema_modified:
+ log_message = "The schema cookie was modified from {} to: {} indicating the master schema was " \
+ "modified but was found not to have been in version: {} on updated pages: {}."
+ log_message = log_message.format(schema_cookie_difference[0], schema_cookie_difference[1],
+ self.version_number, self.updated_page_numbers)
+ self._logger.error(log_message)
+ raise WalCommitRecordParsingError(log_message)
+
+ # Delete the entry from the dictionary
+ del database_header_differences[DATABASE_HEADER_VERSIONED_FIELDS.SCHEMA_COOKIE]
+
+ elif self.master_schema_modified:
+ log_message = "The schema cookie was not modified indicating the master schema was not modified " \
+ "as well but was found to have been in version: {} on updated pages: {}."
+ log_message = log_message.format(self.version_number, self.updated_page_numbers)
+ self._logger.error(log_message)
+ raise WalCommitRecordParsingError(log_message)
+
+ """
+
+ The next two fields to check are the:
+ 9.) SCHEMA_FORMAT_NUMBER: schema_format_number
+ 10.) DATABASE_TEXT_ENCODING: database_text_encoding
+
+        These should only appear where the master schema was originally empty and then had entries added to it. In
+        this case both of these numbers should originally have been zero. When changed, the schema format number will
+        be within the VALID_SCHEMA_FORMATS and the database text encoding will be within the
+        DATABASE_TEXT_ENCODINGS. However, we do not need to check against these here since that is done when parsing
+        the database header itself.
+
+        When these are specified, we validate the following use cases:
+        1.) Both fields exist in the database header differences.
+        2.) Both of their values were originally 0.
+        3.) The database size in pages was originally 1.
+
+ """
+
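+        # Note: Per the SQLite file format documentation, the schema format number is expected to be 1 through 4
+        #       once set, and the text encoding 1 (UTF-8), 2 (UTF-16le) or 3 (UTF-16be); VALID_SCHEMA_FORMATS and
+        #       DATABASE_TEXT_ENCODINGS are assumed to carry those values.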
+ # Check that the schema format number was not modified without the database text encoding
+ if DATABASE_HEADER_VERSIONED_FIELDS.SCHEMA_FORMAT_NUMBER in database_header_differences \
+ and DATABASE_HEADER_VERSIONED_FIELDS.DATABASE_TEXT_ENCODING not in database_header_differences:
+ log_message = "The database header schema format number: {} was found in the database header " \
+ "differences but the database text encoding was not for version: {}."
+ log_message = log_message.format(database_header_differences[
+ DATABASE_HEADER_VERSIONED_FIELDS.SCHEMA_FORMAT_NUMBER],
+ self.version_number)
+ self._logger.error(log_message)
+ raise WalCommitRecordParsingError(log_message)
+
+ # Check that the database text encoding was not modified without the schema format number
+ elif DATABASE_HEADER_VERSIONED_FIELDS.DATABASE_TEXT_ENCODING in database_header_differences \
+ and DATABASE_HEADER_VERSIONED_FIELDS.SCHEMA_FORMAT_NUMBER not in database_header_differences:
+ log_message = "The database header database text encoding: {} was found in the database header " \
+ "differences but the schema format number was not for version: {}."
+ log_message = log_message.format(database_header_differences[
+ DATABASE_HEADER_VERSIONED_FIELDS.DATABASE_TEXT_ENCODING],
+ self.version_number)
+ self._logger.error(log_message)
+ raise WalCommitRecordParsingError(log_message)
+
+        # Check if both the schema format number and the database text encoding were modified
+ elif DATABASE_HEADER_VERSIONED_FIELDS.SCHEMA_FORMAT_NUMBER in database_header_differences \
+ and DATABASE_HEADER_VERSIONED_FIELDS.DATABASE_TEXT_ENCODING in database_header_differences:
+
+ # Get the schema format number difference
+ schema_format_number_difference = database_header_differences[
+ DATABASE_HEADER_VERSIONED_FIELDS.SCHEMA_FORMAT_NUMBER]
+
+ # Check that the schema format number was previously 0
+ if schema_format_number_difference[0] != 0:
+ log_message = "The previous database header schema format number: {} is not equal to 0 as expected " \
+ "and has a new database header schema format number: {} for version: {}."
+ log_message = log_message.format(schema_format_number_difference[0], schema_format_number_difference[1],
+ self.version_number)
+ self._logger.error(log_message)
+ raise WalCommitRecordParsingError(log_message)
+
+ # Get the database text encoding difference
+ database_text_encoding_difference = database_header_differences[
+ DATABASE_HEADER_VERSIONED_FIELDS.DATABASE_TEXT_ENCODING]
+
+ # Check that the database text encoding was previously 0
+ if database_text_encoding_difference[0] != 0:
+ log_message = "The previous database header database text encoding: {} is not equal to 0 as expected " \
+ "and has a new database header database text encoding: {} for version: {}."
+ log_message = log_message.format(database_text_encoding_difference[0],
+ database_text_encoding_difference[1], self.version_number)
+ self._logger.error(log_message)
+ raise WalCommitRecordParsingError(log_message)
+
+ """
+
+ Make sure the database size in pages was previously 1.
+
+ Note: This is pulled from the original database header differences dictionary since it has already been
+ removed from the local copy.
+
+ """
+
+ if DATABASE_HEADER_VERSIONED_FIELDS.DATABASE_SIZE_IN_PAGES not in self.database_header_differences:
+ log_message = "The schema format number was changed from: {} to: {} and database text encoding was " \
+ "changed from: {} to: {} when the database size in pages was not updated and " \
+ "stayed the same size of: {} when it should have initially been 1 and changed to a " \
+ "greater number in version: {} on updated pages: {}."
+ log_message = log_message.format(schema_format_number_difference[0], schema_format_number_difference[1],
+ database_text_encoding_difference[0],
+ database_text_encoding_difference[1],
+ self.database_size_in_pages,
+ self.version_number, self.updated_page_numbers)
+ self._logger.error(log_message)
+ raise WalCommitRecordParsingError(log_message)
+
+ # Get the database size in pages difference
+ database_size_in_pages_difference = self.database_header_differences[
+ DATABASE_HEADER_VERSIONED_FIELDS.DATABASE_SIZE_IN_PAGES]
+
+ # Check the database size in pages was previously 1
+ if database_size_in_pages_difference[0] != 1:
+ log_message = "The schema format number was changed from: {} to: {} and database text encoding was " \
+                              "changed from: {} to: {} when the database size in pages was updated from: {} to: {} " \
+ "when it should have initially been 1 in version: {} on updated pages: {}."
+ log_message = log_message.format(schema_format_number_difference[0], schema_format_number_difference[1],
+ database_text_encoding_difference[0],
+ database_text_encoding_difference[1],
+ database_size_in_pages_difference[0],
+ database_size_in_pages_difference[1],
+ self.version_number, self.updated_page_numbers)
+ self._logger.error(log_message)
+ raise WalCommitRecordParsingError(log_message)
+
+ # Update the class variables to signify these fields were incremented
+ self.schema_format_number_modified = True
+ self.database_text_encoding_modified = True
+
+ """
+
+            Since the database text encoding has not been set yet, we set it in the WAL file handle through the
+            database_text_encoding property of the superclass. Since nothing should be reading from the database
+            file yet (nothing has been written to it), we do not have to worry about setting the database text
+            encoding in the database itself.
+
+ Note: Once the database text encoding is set, it can no longer be changed.
+
+ """
+
+ database_text_encoding = database_text_encoding_difference[1]
+
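+            # Per the SQLite database header specification, the text encoding field holds 1 for UTF-8, 2 for
+            # UTF-16le and 3 for UTF-16be; the *_DATABASE_TEXT_ENCODING constants compared below are assumed to
+            # carry those values.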
+ if database_text_encoding == UTF_8_DATABASE_TEXT_ENCODING:
+ self.database_text_encoding = UTF_8
+ elif database_text_encoding == UTF_16LE_DATABASE_TEXT_ENCODING:
+ self.database_text_encoding = UTF_16LE
+ elif database_text_encoding == UTF_16BE_DATABASE_TEXT_ENCODING:
+ self.database_text_encoding = UTF_16BE
+ elif database_text_encoding:
+ log_message = "The database text encoding: {} is not recognized as a valid database text encoding."
+ log_message = log_message.format(database_text_encoding)
+ self._logger.error(log_message)
+ raise WalCommitRecordParsingError(log_message)
+
+ # Delete the entries from the dictionary
+ del database_header_differences[DATABASE_HEADER_VERSIONED_FIELDS.SCHEMA_FORMAT_NUMBER]
+ del database_header_differences[DATABASE_HEADER_VERSIONED_FIELDS.DATABASE_TEXT_ENCODING]
+
+ """
+
+ 11.) USER_VERSION: user_version:
+
+ The user version is not used by SQLite and is a user-defined version for developers to be able to track their
+ own versions of a SQLite database file for instances where the schema may be modified constantly, etc.
+
+        Here we only check for this and report it by setting the flag. Afterwards, we remove it from the database
+        header differences dictionary since it cannot be used to glean any information about the database file while
+        parsing.
+
+ """
+
+ if DATABASE_HEADER_VERSIONED_FIELDS.USER_VERSION in database_header_differences:
+
+ # Set the user version modified flag
+ self.user_version_modified = True
+
+ # Delete the entry from the dictionary
+ del database_header_differences[DATABASE_HEADER_VERSIONED_FIELDS.USER_VERSION]
+
+ """
+
+        Make sure there are no additional differences that are not accounted for. If there are, an exception is
+        thrown in order to flag the unexpected use case.
+
+ """
+
+ # Throw an exception if any database header differences still exist
+ if database_header_differences:
+ log_message = "Database header differences still exist after checking the last database header against " \
+ "this current commit record version: {} on updated pages: {}. The main set of differences " \
+ "was: {} with remaining differences: {}."
+ log_message = log_message.format(self.version_number, self.updated_page_numbers,
+ self.database_header_differences,
+ database_header_differences)
+ self._logger.error(log_message)
+ raise WalCommitRecordParsingError(log_message)
diff --git a/sqlite_dissect/file/wal/frame.py b/sqlite_dissect/file/wal/frame.py
new file mode 100644
index 0000000..77acbdd
--- /dev/null
+++ b/sqlite_dissect/file/wal/frame.py
@@ -0,0 +1,106 @@
+from binascii import hexlify
+from logging import getLogger
+from re import sub
+from sqlite_dissect.constants import FILE_TYPE
+from sqlite_dissect.constants import LOGGER_NAME
+from sqlite_dissect.constants import MASTER_PAGE_HEX_ID
+from sqlite_dissect.constants import SQLITE_DATABASE_HEADER_LENGTH
+from sqlite_dissect.constants import WAL_FRAME_HEADER_LENGTH
+from sqlite_dissect.constants import WAL_HEADER_LENGTH
+from sqlite_dissect.exception import WalParsingError
+from sqlite_dissect.file.wal.header import WriteAheadLogFrameHeader
+
+"""
+
+frame.py
+
+This script holds the objects used for parsing the WAL frame.
+
+Note: The WriteAheadLogFrame class is not responsible for parsing the page data itself. It is meant to give
+      information on the WAL frame and offsets of the page data, but in order to parse the page data, the set of
+      all page changes for the commit record this frame belongs to is needed. Therefore the commit record class
+ (WriteAheadLogCommitRecord) will be responsible for parsing pages.
+
+ There was some discussion about the page being stored back in the WriteAheadLogFrame once parsed but it was
+ decided that this made little to no difference and should just be retrieved from the commit record.
+
+ As a side note, there are some basic things parsed from the page such as the page type. This is only for
+ debugging and logging purposes.
+
+This script holds the following object(s):
+WriteAheadLogFrame(object)
+
+"""
+
+
+class WriteAheadLogFrame(object):
+
+ def __init__(self, file_handle, frame_index, commit_record_number):
+
+ logger = getLogger(LOGGER_NAME)
+
+ if file_handle.file_type != FILE_TYPE.WAL:
+ log_message = "The wal frame file handle file type is not {} as expected but is {} for frame index: {} " \
+ "commit record number: {}."
+ log_message = log_message.format(FILE_TYPE.WAL, file_handle.file_type, frame_index, commit_record_number)
+ logger.error(log_message)
+ raise ValueError(log_message)
+
+ self.frame_index = frame_index
+ self.frame_number = self.frame_index + 1
+ self.commit_record_number = commit_record_number
+
+ self.offset = self._get_write_ahead_log_frame_offset(self.frame_index, file_handle.header.page_size)
+ self.frame_size = WAL_FRAME_HEADER_LENGTH + file_handle.header.page_size
+
+ wal_frame = file_handle.read_data(self.offset, self.frame_size)
+ self.header = WriteAheadLogFrameHeader(wal_frame[:WAL_FRAME_HEADER_LENGTH])
+ self.commit_frame = True if self.header.page_size_after_commit else False
+ page_content = wal_frame[WAL_FRAME_HEADER_LENGTH:]
+
+ if len(page_content) != file_handle.header.page_size:
+ log_message = "Page content was found to be: {} when expected to be: {} as declared in the wal file " \
+ "header for frame index: {} commit record number: {}."
+ log_message = log_message.format(len(page_content), file_handle.header.page_size,
+ frame_index, commit_record_number)
+ logger.error(log_message)
+ raise WalParsingError(log_message)
+
+ self.contains_sqlite_database_header = False
+ self.page_hex_type = page_content[0:1]
+
+ if self.page_hex_type == MASTER_PAGE_HEX_ID:
+ self.page_hex_type = page_content[SQLITE_DATABASE_HEADER_LENGTH:SQLITE_DATABASE_HEADER_LENGTH + 1]
+ self.contains_sqlite_database_header = True
+
+ def __repr__(self):
+ return self.__str__().encode("hex")
+
+ def __str__(self):
+ return sub("\t", "", sub("\n", " ", self.stringify()))
+
+ def stringify(self, padding=""):
+ string = padding + "Frame Index: {}\n" \
+ + padding + "Frame Number: {}\n" \
+ + padding + "Commit Record Number: {}\n" \
+ + padding + "Offset: {}\n" \
+ + padding + "Frame Size: {}\n" \
+ + padding + "Commit Frame: {}\n" \
+ + padding + "Header:\n{}\n"\
+ + padding + "Contains SQLite Database Header: {}\n" \
+ + padding + "Page Hex Type (Hex): {}"
+ string = string.format(self.frame_index,
+ self.frame_number,
+ self.commit_record_number,
+ self.offset,
+ self.frame_size,
+ self.commit_frame,
+ self.header.stringify(padding + "\t"),
+ self.contains_sqlite_database_header,
+ hexlify(self.page_hex_type))
+ return string
+
+ @staticmethod
+ def _get_write_ahead_log_frame_offset(index, page_size):
+ wal_frame_size = WAL_FRAME_HEADER_LENGTH + page_size
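+        # For example, with a 4096 byte page size, frame index 0 starts at offset 32 (the WAL header length) and
+        # frame index 1 starts at 32 + (24 + 4096) = 4152, since each frame is a 24 byte frame header followed by
+        # one page of content.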
+ return WAL_HEADER_LENGTH + index * wal_frame_size
diff --git a/sqlite_dissect/file/wal/header.py b/sqlite_dissect/file/wal/header.py
new file mode 100644
index 0000000..7acdd34
--- /dev/null
+++ b/sqlite_dissect/file/wal/header.py
@@ -0,0 +1,141 @@
+from logging import getLogger
+from re import sub
+from struct import unpack
+from warnings import warn
+from sqlite_dissect.constants import LOGGER_NAME
+from sqlite_dissect.constants import WAL_FILE_FORMAT_VERSION
+from sqlite_dissect.constants import WAL_FRAME_HEADER_LENGTH
+from sqlite_dissect.constants import WAL_HEADER_LENGTH
+from sqlite_dissect.constants import WAL_MAGIC_NUMBER_BIG_ENDIAN
+from sqlite_dissect.constants import WAL_MAGIC_NUMBER_LITTLE_ENDIAN
+from sqlite_dissect.exception import HeaderParsingError
+from sqlite_dissect.file.header import SQLiteHeader
+from sqlite_dissect.utilities import get_md5_hash
+
+"""
+
+header.py
+
+This script holds the header objects used for parsing the header of the WAL file and WAL frames.
+
+This script holds the following object(s):
+WriteAheadLogHeader(SQLiteHeader)
+WriteAheadLogFrameHeader(object)
+
+"""
+
+
+class WriteAheadLogHeader(SQLiteHeader):
+
+ def __init__(self, wal_header_byte_array):
+
+ super(WriteAheadLogHeader, self).__init__()
+
+ logger = getLogger(LOGGER_NAME)
+
+ if len(wal_header_byte_array) != WAL_HEADER_LENGTH:
+ log_message = "The wal header byte array of size: {} is not the expected size of: {}."
+ log_message = log_message.format(len(wal_header_byte_array), WAL_HEADER_LENGTH)
+ logger.error(log_message)
+ raise ValueError(log_message)
+
+ self.magic_number = unpack(b">I", wal_header_byte_array[0:4])[0]
+
+ """
+
+ Note: The magic number specifies either big endian or little endian encoding for checksums.
+
+ """
+
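+        # Note: Per the SQLite file format documentation, the accepted magic numbers are 0x377f0682 (little endian
+        #       checksums) and 0x377f0683 (big endian checksums), which the WAL_MAGIC_NUMBER_LITTLE_ENDIAN and
+        #       WAL_MAGIC_NUMBER_BIG_ENDIAN constants are assumed to represent.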
+ if self.magic_number not in [WAL_MAGIC_NUMBER_BIG_ENDIAN, WAL_MAGIC_NUMBER_LITTLE_ENDIAN]:
+            log_message = "The magic number: {} is not valid.".format(self.magic_number)
+ logger.error(log_message)
+ raise HeaderParsingError(log_message)
+
+ self.file_format_version = unpack(b">I", wal_header_byte_array[4:8])[0]
+
+ if self.file_format_version != WAL_FILE_FORMAT_VERSION:
+ log_message = "An unsupported file format version was found: {} instead of the expected value: {}."
+ log_message = log_message.format(self.file_format_version, WAL_FILE_FORMAT_VERSION)
+ logger.error(log_message)
+ raise HeaderParsingError(log_message)
+
+ self.page_size = unpack(b">I", wal_header_byte_array[8:12])[0]
+ self.checkpoint_sequence_number = unpack(b">I", wal_header_byte_array[12:16])[0]
+
+ if self.checkpoint_sequence_number != 0:
+ log_message = "Checkpoint sequence number is {} instead of 0 and may cause inconsistencies in wal parsing."
+ log_message = log_message.format(self.checkpoint_sequence_number)
+ logger.warn(log_message)
+ warn(log_message, RuntimeWarning)
+
+ self.salt_1 = unpack(b">I", wal_header_byte_array[16:20])[0]
+ self.salt_2 = unpack(b">I", wal_header_byte_array[20:24])[0]
+ self.checksum_1 = unpack(b">I", wal_header_byte_array[24:28])[0]
+ self.checksum_2 = unpack(b">I", wal_header_byte_array[28:32])[0]
+
+ self.md5_hex_digest = get_md5_hash(wal_header_byte_array)
+
+ def stringify(self, padding=""):
+ string = padding + "Magic Number: {}\n" \
+ + padding + "File Format Version: {}\n" \
+ + padding + "Page Size: {}\n" \
+ + padding + "Checkpoint Sequence Number: {}\n" \
+ + padding + "Salt 1: {}\n" \
+ + padding + "Salt 2: {}\n" \
+ + padding + "Checksum 1: {}\n" \
+ + padding + "Checksum 2: {}\n" \
+ + padding + "MD5 Hex Digest: {}"
+ return string.format(self.magic_number,
+ self.file_format_version,
+ self.page_size,
+ self.checkpoint_sequence_number,
+ self.salt_1,
+ self.salt_2,
+ self.checksum_1,
+ self.checksum_2,
+ self.md5_hex_digest)
+
+
+class WriteAheadLogFrameHeader(object):
+
+ def __init__(self, wal_frame_header_byte_array):
+
+ logger = getLogger(LOGGER_NAME)
+
+ if len(wal_frame_header_byte_array) != WAL_FRAME_HEADER_LENGTH:
+ log_message = "The wal frame header byte array of size: {} is not the expected size of: {}."
+ log_message = log_message.format(len(wal_frame_header_byte_array), WAL_FRAME_HEADER_LENGTH)
+ logger.error(log_message)
+ raise ValueError(log_message)
+
+ self.page_number = unpack(b">I", wal_frame_header_byte_array[0:4])[0]
+ self.page_size_after_commit = unpack(b">I", wal_frame_header_byte_array[4:8])[0]
+ self.salt_1 = unpack(b">I", wal_frame_header_byte_array[8:12])[0]
+ self.salt_2 = unpack(b">I", wal_frame_header_byte_array[12:16])[0]
+ self.checksum_1 = unpack(b">I", wal_frame_header_byte_array[16:20])[0]
+ self.checksum_2 = unpack(b">I", wal_frame_header_byte_array[20:24])[0]
+
+ self.md5_hex_digest = get_md5_hash(wal_frame_header_byte_array)
+
+ def __repr__(self):
+ return self.__str__().encode("hex")
+
+ def __str__(self):
+ return sub("\t", "", sub("\n", " ", self.stringify()))
+
+ def stringify(self, padding=""):
+ string = padding + "Page Number: {}\n" \
+ + padding + "Page Size After Commit: {}\n" \
+ + padding + "Salt 1: {}\n" \
+ + padding + "Salt 2: {}\n" \
+ + padding + "Checksum 1: {}\n" \
+ + padding + "Checksum 2: {}\n" \
+ + padding + "MD5 Hex Digest: {}"
+ return string.format(self.page_number,
+ self.page_size_after_commit,
+ self.salt_1,
+ self.salt_2,
+ self.checksum_1,
+ self.checksum_2,
+ self.md5_hex_digest)
diff --git a/sqlite_dissect/file/wal/utilities.py b/sqlite_dissect/file/wal/utilities.py
new file mode 100644
index 0000000..559faad
--- /dev/null
+++ b/sqlite_dissect/file/wal/utilities.py
@@ -0,0 +1,48 @@
+from logging import getLogger
+from sqlite_dissect.constants import LOGGER_NAME
+from sqlite_dissect.file.database.header import DatabaseHeader
+
+"""
+
+utilities.py
+
+This script holds utility functions for dealing with WAL specific objects, such as comparing database headers,
+rather than more general utility methods.
+
+This script holds the following function(s):
+compare_database_headers(previous_database_header, database_header)
+
+"""
+
+
+def compare_database_headers(previous_database_header, database_header):
+
+ logger = getLogger(LOGGER_NAME)
+
+ if not isinstance(previous_database_header, DatabaseHeader):
+ log_message = "The previous database header is not a Database Header but has a type of: {}."
+ log_message = log_message.format(type(previous_database_header))
+ logger.error(log_message)
+ raise ValueError(log_message)
+
+ if not isinstance(database_header, DatabaseHeader):
+ log_message = "The database header is not a Database Header but has a type of: {}."
+ log_message = log_message.format(type(database_header))
+ logger.error(log_message)
+ raise ValueError(log_message)
+
+ """
+
+    Since the two objects are of the same type, we are not worried about possible differences in what properties
+    the objects have.
+
+ """
+
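+    # The returned dictionary maps each differing attribute name to a (previous value, new value) tuple,
+    # for example {"file_change_counter": (12, 13)} when only the file change counter changed.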
+ database_header_changes = {}
+ for key in previous_database_header.__dict__.keys():
+ previous_value = getattr(previous_database_header, key)
+ value = getattr(database_header, key)
+ if previous_value != value:
+ database_header_changes[key] = (previous_value, value)
+
+ return database_header_changes
diff --git a/sqlite_dissect/file/wal/wal.py b/sqlite_dissect/file/wal/wal.py
new file mode 100644
index 0000000..80a0158
--- /dev/null
+++ b/sqlite_dissect/file/wal/wal.py
@@ -0,0 +1,240 @@
+from logging import getLogger
+from re import sub
+from warnings import warn
+from sqlite_dissect.constants import FILE_TYPE
+from sqlite_dissect.constants import LOGGER_NAME
+from sqlite_dissect.constants import WAL_FRAME_HEADER_LENGTH
+from sqlite_dissect.constants import WAL_HEADER_LENGTH
+from sqlite_dissect.exception import WalParsingError
+from sqlite_dissect.file.file_handle import FileHandle
+from sqlite_dissect.file.wal.frame import WriteAheadLogFrame
+
+"""
+
+wal.py
+
+This script holds the WAL objects used for parsing the WAL file.
+
+This script holds the following object(s):
+WriteAheadLog(object)
+
+"""
+
+
+class WriteAheadLog(object):
+
+ def __init__(self, file_identifier, store_in_memory=False, file_size=None, strict_format_checking=True):
+
+ """
+
+ Constructor.
+
+ :param file_identifier: str or file The full file path to the file to be opened or the file object.
+        :param store_in_memory: boolean Tells this class to store its particular version information in memory or not.
+ :param file_size: int Optional parameter to supply the file size.
+ :param strict_format_checking: boolean Specifies if the application should exit if structural validations fail.
+
+ """
+
+ self.file_handle = FileHandle(FILE_TYPE.WAL, file_identifier, file_size=file_size)
+ self.store_in_memory = store_in_memory
+ self.strict_format_checking = strict_format_checking
+
+ logger = getLogger(LOGGER_NAME)
+
+ frame_size = (WAL_FRAME_HEADER_LENGTH + self.file_handle.header.page_size)
+
+ self.number_of_frames = (self.file_handle.file_size - WAL_HEADER_LENGTH) / frame_size
+
+ valid_frame_array = []
+ invalid_frame_array = []
+ commit_record_number = 1
+
+ """
+
+        Since we have the possibility of WAL files executing checkpoints and overwriting themselves, we can have
+        invalid frames trailing the valid frames. The calculations above will always hold since the frames are
+        always the same size and will therefore always fully overwrite one another. Therefore, we should never
+        come across a situation where a WAL file has partially overwritten WAL frames in it (assuming the file
+        itself is not damaged).
+
+        In order to keep track of the invalid frames, we record the starting and ending frame indices of the
+        frames that correlate to a particular salt 1 value. Salt 1 values are incremented on checkpoint
+        operations. Therefore, we can determine the order in which the invalid frames were stored into the file
+        by looking at the checkpoint number and correlating the offset of the salt 1 value from the salt 1 value
+        in the WAL file header.
+
+        When we find invalid frames, we will set the commit record number to None for now until further implemented.
+
+        Below we initialize a dictionary mapping each salt 1 value to a tuple where the first and second values
+        are the first and last invalid frame indices found for that salt 1 value. Due to the way WAL files
+        overwrite and commit, we should always have at least one frame in this use case, and the last frame found
+        (even if it is the only frame) should always be a commit frame (i.e. where the database size in pages
+        after commit is set).
+
+ Also, if there are any entries in the invalid frame indices when a valid frame is found, an exception is raised
+ since this should never occur.
+
+ """
+
+ # Initialize the dictionary
+ self.invalid_frame_indices = {}
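+        # For example: {previous_salt_1_value: (first_invalid_frame_index, last_invalid_frame_index)} with one
+        # entry per checkpointed-over salt 1 value found in the file.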
+
+ for frame_index in range(self.number_of_frames):
+
+ frame = WriteAheadLogFrame(self.file_handle, frame_index, commit_record_number)
+
+ # Check if the salt 1 values were different (invalid frame)
+ if frame.header.salt_1 != self.file_handle.header.salt_1:
+
+ log_message = "Frame index: {} after commit record number: {} has salt 1 of {} when expected to " \
+ "be: {} and is an invalid frame."
+ log_message = log_message.format(frame_index, commit_record_number - 1, frame.header.salt_1,
+ self.file_handle.header.salt_1)
+ logger.debug(log_message)
+
+ # Check if this salt value was already put into the invalid frame indices dictionary
+ if frame.header.salt_1 in self.invalid_frame_indices:
+
+ # Get the previous indices
+ indices = self.invalid_frame_indices[frame.header.salt_1]
+
+ # Check to make sure this frame index is the next one in the array
+ if indices[1] + 1 != frame_index:
+ log_message = "Frame index: {} with salt 1 of {} when expected to be: {} after commit " \
+ "record number: {} has a different frame index than the expected: {}."
+ log_message = log_message.format(frame_index, frame.header.salt_1,
+ self.file_handle.header.salt_1, commit_record_number - 1,
+ indices[1] + 1)
+ logger.error(log_message)
+ raise WalParsingError(log_message)
+
+                    # Add the updated indices for the salt 1 value into the invalid frame indices dictionary
+ self.invalid_frame_indices[frame.header.salt_1] = (indices[0], frame_index)
+
+ # The salt value was not already put into the invalid frame indices dictionary
+ else:
+
+ # Add the indices for the salt value into the invalid frame indices dictionary
+ self.invalid_frame_indices[frame.header.salt_1] = (frame_index, frame_index)
+
+ # Update the commit record number to None (see above documentation and script header documentation)
+ frame.commit_record_number = None
+
+ # Append the frame to the invalid frame array
+ invalid_frame_array.append(frame)
+
+ # Check if the salt 2 values were different if the salt 1 values were the same (error)
+ elif frame.header.salt_2 != self.file_handle.header.salt_2:
+
+ log_message = "Frame index: {} after commit record number: {} has salt 2 of {} when expected to " \
+ "be: {} where the salt 1 values matched."
+                log_message = log_message.format(frame_index, commit_record_number - 1, frame.header.salt_2,
+                                                 self.file_handle.header.salt_2)
+ logger.error(log_message)
+ raise WalParsingError(log_message)
+
+ # The frame is a valid frame
+ else:
+
+ # Make sure there are no entries in the invalid frame indices or else there was an error
+ if self.invalid_frame_indices:
+ log_message = "Frame index: {} in commit record number: {} follows invalid frames."
+ log_message = log_message.format(frame_index, commit_record_number)
+ logger.error(log_message)
+ raise WalParsingError(log_message)
+
+ # Append the frame to the valid frame array and increment the commit record number for a commit frame
+ valid_frame_array.append(frame)
+ if frame.commit_frame:
+ commit_record_number += 1
+
+ self.frames = dict(map(lambda x: [x.frame_index, x], valid_frame_array))
+ self.invalid_frames = dict(map(lambda x: [x.frame_index, x], invalid_frame_array))
+
+ # Check if we had invalid frames
+ if self.invalid_frames:
+
+ # Print debug log messages on the WAL frame details
+            log_message = "The number of frames found in the wal file is: {} with {} valid frames between frame " \
+                          "indices {} and {} and {} invalid frames between frame indices {} and {}."
+ log_message = log_message.format(self.number_of_frames, len(self.frames), min(self.frames.keys()),
+ max(self.frames.keys()), len(self.invalid_frames),
+ min(self.invalid_frames.keys()), max(self.invalid_frames.keys()))
+ logger.debug(log_message)
+
+ log_message = "The invalid frame indices pertaining to salt 1 values are: {}."
+ log_message = log_message.format(self.invalid_frame_indices)
+ logger.debug(log_message)
+
+ """
+
+ Below we output a warning and a log message warning that implementation for invalid frames is not
+ handled or parsed yet.
+
+ """
+
+            log_message = "The wal file contains {} invalid frames. Invalid frames are currently skipped and not " \
+                          "parsed, which may cause possible carved data to be missed until this is implemented."
+ log_message = log_message.format(len(self.invalid_frames))
+ logger.warn(log_message)
+ warn(log_message, RuntimeWarning)
+
+ self.last_frame_commit_record = None
+ last_wal_frame_commit_record_index = max(self.frames.keys())
+ while last_wal_frame_commit_record_index >= 0:
+
+ """
+
+            Starting from the end of the file and working backwards, we find the last commit record in the file
+            to determine at which point the data was committed to the database file. As soon as we find that
+            frame, we break from the while loop.
+
+ """
+
+ if self.frames[last_wal_frame_commit_record_index].header.page_size_after_commit != 0:
+ self.last_frame_commit_record = self.frames[last_wal_frame_commit_record_index]
+ break
+ else:
+ last_wal_frame_commit_record_index -= 1
+
+ if last_wal_frame_commit_record_index != len(self.frames) - 1:
+
+ """
+
+            If the last WAL frame commit record index is not the index of the last frame, that means there was at
+            least one frame in the WAL file beyond the last committed record. This use case has not been
+            encountered yet and a NotImplementedError will be raised here until the use case is handled.
+
+ """
+
+            log_message = "The last wal frame commit record index: {} was not the last frame of the {} frames."
+ log_message = log_message.format(last_wal_frame_commit_record_index, len(self.frames))
+ logger.error(log_message)
+ raise NotImplementedError(log_message)
+
+ def __repr__(self):
+ return self.__str__().encode("hex")
+
+ def __str__(self):
+ return sub("\t", "", sub("\n", " ", self.stringify()))
+
+ def stringify(self, padding="", print_frames=True):
+ string = padding + "File Handle:\n{}"
+ string = string.format(self.file_handle.stringify(padding + "\t"))
+ string += "\n" \
+ + padding + "Number of Frames: {}\n" \
+ + padding + "Number of Valid Frames: {}\n" \
+ + padding + "Number of Invalid Frames: {}\n" \
+ + padding + "Invalid Frames Indices: {}\n" \
+ + padding + "Last Frame Commit Record Number: {}"
+ string = string.format(self.number_of_frames,
+ len(self.frames),
+ len(self.invalid_frames),
+ self.invalid_frame_indices,
+ self.last_frame_commit_record.frame_index + 1)
+ if print_frames:
+ for frame in self.frames.itervalues():
+ string += "\n" + padding + "Frame:\n{}".format(frame.stringify(padding + "\t"))
+ if print_frames and self.invalid_frames:
+ for invalid_frame in self.invalid_frames.itervalues():
+ string += "\n" + padding + "Invalid Frame:\n{}".format(invalid_frame.stringify(padding + "\t"))
+ return string
diff --git a/sqlite_dissect/file/wal_index/README.md b/sqlite_dissect/file/wal_index/README.md
new file mode 100644
index 0000000..7668f16
--- /dev/null
+++ b/sqlite_dissect/file/wal_index/README.md
@@ -0,0 +1,59 @@
+
+# sqlite_dissect.file.wal_index
+
+This package will control parsing and access to the sqlite wal index files.
+
+- header.py
+- wal_index.py
+
+TODO items for the "wal_index" package:
+
+- [ ] Finish UML class diagrams.
+
+
+
+### header.py
+This script holds the header objects used for parsing the header of the wal index file.
+
+This script holds the following object(s):
+- WriteAheadLogIndexHeader(SQLiteHeader)
+- WriteAheadLogIndexSubHeader(SQLiteHeader)
+- WriteAheadLogIndexCheckpointInfo(object)
+
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+- [ ] Finish try/except exception handling for struct.error and ord in classes.
+- [ ] Implement big endian parsing (if needed).
+- [ ] Create arrays for salt and checksum values rather than separate variables? They are arrays in the sqlite c code.
+ ##### WriteAheadLogIndexHeader Class:
+ - [ ] Check the two sub headers against each other to ensure they are equal.
+ - [ ] Document and handle exceptions that may be raised from creating subcomponents better.
+ ##### WriteAheadLogIndexCheckpointInfo Class:
+ - [ ] Handle the use case of 0xffffffff which is defined as READMARK_NOT_USED.
+ - [ ] Handle the use case of the first reader mark always being 0. (Check this)
+
+
+
+### wal_index.py
+This script holds the class to parse the wal index file.
+
+This script holds the following object(s):
+- WriteAheadLogIndex(object)
+
+
+TODO:
+- [ ] Documentation improvements.
+- [ ] Check variables against None and Type constraints, possibly by using descriptors and/or decorators.
+- [ ] Finish try/except exception handling for struct.error in classes.
+- [ ] Implement big endian parsing (if needed).
+ ##### WriteAheadLogIndex Class:
+ - [ ] This class was a test of parsing a (single page) wal index and needs to be fully implemented.
+ - [ ] Should this be incorporated with the version/version history somehow?
+ - [ ] Update to support a file object.
+ - [ ] Constants for static integers.
+ - [ ] Use cases for implementation of retrieving unallocated space for carving?
+ - [ ] Check logging statements for correctness.
+ - [ ] Account for the database text encoding in the file handle.
+ - [ ] The file_size arg may not be needed since it is in the file handle and may be removed
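+
+A minimal usage sketch for this package follows; the wal index (shm) file path and the direct byte read are
+illustrative assumptions rather than an established interface of this package:
+
+    from sqlite_dissect.constants import WAL_INDEX_HEADER_LENGTH
+    from sqlite_dissect.file.wal_index.header import WriteAheadLogIndexHeader
+
+    # Read the wal index header bytes from the beginning of the shm file and parse them.
+    with open("example.db-shm", "rb") as wal_index_file:
+        wal_index_header = WriteAheadLogIndexHeader(wal_index_file.read(WAL_INDEX_HEADER_LENGTH))
+
+    print(wal_index_header.stringify())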
diff --git a/sqlite_dissect/file/wal_index/__init__.py b/sqlite_dissect/file/wal_index/__init__.py
new file mode 100644
index 0000000..124f7d1
--- /dev/null
+++ b/sqlite_dissect/file/wal_index/__init__.py
@@ -0,0 +1,10 @@
+
+"""
+
+__init__.py
+
+This init script will initialize any needed logic for this package.
+
+This package will control parsing and access to the sqlite wal index files.
+
+"""
diff --git a/sqlite_dissect/file/wal_index/header.py b/sqlite_dissect/file/wal_index/header.py
new file mode 100644
index 0000000..a7e3eee
--- /dev/null
+++ b/sqlite_dissect/file/wal_index/header.py
@@ -0,0 +1,252 @@
+from binascii import hexlify
+from logging import getLogger
+from re import sub
+from struct import unpack
+from sqlite_dissect.constants import ENDIANNESS
+from sqlite_dissect.constants import LOGGER_NAME
+from sqlite_dissect.constants import WAL_INDEX_CHECKPOINT_INFO_LENGTH
+from sqlite_dissect.constants import WAL_INDEX_FILE_FORMAT_VERSION
+from sqlite_dissect.constants import WAL_INDEX_HEADER_LENGTH
+from sqlite_dissect.constants import WAL_INDEX_LOCK_RESERVED_LENGTH
+from sqlite_dissect.constants import WAL_INDEX_NUMBER_OF_SUB_HEADERS
+from sqlite_dissect.constants import WAL_INDEX_NUMBER_OF_FRAMES_BACKFILLED_IN_DATABASE_LENGTH
+from sqlite_dissect.constants import WAL_INDEX_READER_MARK_LENGTH
+from sqlite_dissect.constants import WAL_INDEX_READER_MARK_SIZE
+from sqlite_dissect.constants import WAL_INDEX_SUB_HEADER_LENGTH
+from sqlite_dissect.exception import HeaderParsingError
+from sqlite_dissect.file.header import SQLiteHeader
+from sqlite_dissect.utilities import get_md5_hash
+
+"""
+
+header.py
+
+This script holds the header objects used for parsing the header of the wal index file.
+
+This script holds the following object(s):
+WriteAheadLogIndexHeader(SQLiteHeader)
+WriteAheadLogIndexSubHeader(SQLiteHeader)
+WriteAheadLogIndexCheckpointInfo(object)
+
+"""
+
+
+class WriteAheadLogIndexHeader(SQLiteHeader):
+
+ def __init__(self, wal_index_header_byte_array):
+
+ super(WriteAheadLogIndexHeader, self).__init__()
+
+ logger = getLogger(LOGGER_NAME)
+
+ if len(wal_index_header_byte_array) != WAL_INDEX_HEADER_LENGTH:
+ log_message = "The wal index header byte array of size: {} is not the expected size of: {}."
+ log_message = log_message.format(len(wal_index_header_byte_array), WAL_INDEX_HEADER_LENGTH)
+ logger.error(log_message)
+ raise ValueError(log_message)
+
+ """
+
+ Note: The sub header will always be followed by an exact copy of itself in the WAL index file header.
+ Therefore, there will always be two (WAL_INDEX_NUMBER_OF_SUB_HEADERS) headers. Instead of having two
+              separate sub header variables, it was decided to use an array for the two since it is implemented
+              similarly in the sqlite c code.
+
+ """
+
+ self.sub_headers = []
+
+ for sub_header_index in range(WAL_INDEX_NUMBER_OF_SUB_HEADERS):
+ start_offset = sub_header_index * WAL_INDEX_SUB_HEADER_LENGTH
+ end_offset = start_offset + WAL_INDEX_SUB_HEADER_LENGTH
+ self.sub_headers.append(WriteAheadLogIndexSubHeader(sub_header_index,
+ wal_index_header_byte_array[start_offset:end_offset]))
+
+ """
+
+ Note: Since both of the sub headers are the same, they should each have the same endianness as well as page
+              size, and therefore it does not matter which one we retrieve them from.
+
+ """
+
+ # Set variables for this class for page size and endianness
+ self.page_size = self.sub_headers[0].page_size
+ self.endianness = self.sub_headers[0].endianness
+
+ checkpoint_start_offset = WAL_INDEX_NUMBER_OF_SUB_HEADERS * WAL_INDEX_SUB_HEADER_LENGTH
+ checkpoint_end_offset = checkpoint_start_offset + WAL_INDEX_CHECKPOINT_INFO_LENGTH
+ wal_index_checkpoint_info_byte_array = wal_index_header_byte_array[checkpoint_start_offset:
+ checkpoint_end_offset]
+ self.checkpoint_info = WriteAheadLogIndexCheckpointInfo(wal_index_checkpoint_info_byte_array, self.endianness)
+
+ lock_reserved_start_offset = checkpoint_start_offset + WAL_INDEX_CHECKPOINT_INFO_LENGTH
+ lock_reserved_end_offset = lock_reserved_start_offset + WAL_INDEX_LOCK_RESERVED_LENGTH
+ self.lock_reserved = wal_index_header_byte_array[lock_reserved_start_offset:lock_reserved_end_offset]
+
+ self.md5_hex_digest = get_md5_hash(wal_index_header_byte_array)
+
+ def stringify(self, padding=""):
+ string = padding + "Page Size: {}\n" \
+ + padding + "MD5 Hex Digest: {}"
+ string = string.format(self.page_size,
+ self.md5_hex_digest)
+ for sub_header_index in range(len(self.sub_headers)):
+ string += "\n" + padding + "Sub Header:\n{}"
+ string = string.format(self.sub_headers[sub_header_index].stringify(padding + "\t"))
+ string += "\n" + padding + "Checkpoint Info:\n{}".format(self.checkpoint_info.stringify(padding + "\t"))
+ string += "\n" + padding + "Lock Reserved (Hex): {}".format(hexlify(self.lock_reserved))
+ return string
+
+
+class WriteAheadLogIndexSubHeader(SQLiteHeader):
+
+ def __init__(self, index, wal_index_sub_header_byte_array):
+
+ super(WriteAheadLogIndexSubHeader, self).__init__()
+
+ logger = getLogger(LOGGER_NAME)
+
+        if index < 0 or index >= WAL_INDEX_NUMBER_OF_SUB_HEADERS:
+ log_message = "Invalid wal index sub header index: {}.".format(index)
+ logger.error(log_message)
+ raise ValueError(log_message)
+
+ self.index = index
+
+ if len(wal_index_sub_header_byte_array) != WAL_INDEX_SUB_HEADER_LENGTH:
+ log_message = "The wal index sub header byte array of size: {} is not the expected size of: {}."
+ log_message = log_message.format(len(wal_index_sub_header_byte_array), WAL_INDEX_SUB_HEADER_LENGTH)
+ logger.error(log_message)
+ raise ValueError(log_message)
+
+ self.endianness = ENDIANNESS.LITTLE_ENDIAN
+
+        # Retrieve the file format version in little endian
+        self.file_format_version = unpack(b"<I", wal_index_sub_header_byte_array[0:4])[0]
+
+        if self.file_format_version != WAL_INDEX_FILE_FORMAT_VERSION:
+
+            # Retry parsing the file format version in big endian before declaring the header invalid
+            self.file_format_version = unpack(b">I", wal_index_sub_header_byte_array[0:4])[0]
+
+            if self.file_format_version != WAL_INDEX_FILE_FORMAT_VERSION:
+                log_message = "The file format version: {} is not valid.".format(self.file_format_version)
+                logger.error(log_message)
+                raise HeaderParsingError(log_message)
+
+            self.endianness = ENDIANNESS.BIG_ENDIAN
+
+            log_message = "The wal index file is in big endian which is currently not supported."
+            logger.error(log_message)
+            raise NotImplementedError(log_message)
+
+    if isinstance(b_tree_root_page, TableLeafPage):
+        string += "\n" + padding + "B-Tree Table Leaf Page -> {}: page version {} at offset {} with {} cells"
+ string = string.format(b_tree_root_page.number, version_interface.get_page_version(b_tree_root_page.number),
+ b_tree_root_page.offset, len(b_tree_root_page.cells))
+ elif isinstance(b_tree_root_page, IndexLeafPage):
+ string += "\n" + padding + "B-Tree Index Leaf Page -> {}: page version {} at offset {} with {} cells"
+ string = string.format(b_tree_root_page.number, version_interface.get_page_version(b_tree_root_page.number),
+ b_tree_root_page.offset, len(b_tree_root_page.cells))
+ elif isinstance(b_tree_root_page, TableInteriorPage):
+ string += "\n" + padding + "B-Tree Table Interior Page -> {}: page version {} at offset {} with {} cells"
+ string = string.format(b_tree_root_page.number, version_interface.get_page_version(b_tree_root_page.number),
+ b_tree_root_page.offset, len(b_tree_root_page.cells))
+ string += stringify_b_tree(version_interface, b_tree_root_page.right_most_page, padding + "\t")
+ for b_tree_interior_cell in b_tree_root_page.cells:
+ string += stringify_b_tree(version_interface, b_tree_interior_cell.left_child_page, padding + "\t")
+ elif isinstance(b_tree_root_page, IndexInteriorPage):
+ string += "\n" + padding + "B-Tree Index Interior Page -> {}: page version {} at offset {} with {} cells"
+ string = string.format(b_tree_root_page.number, version_interface.get_page_version(b_tree_root_page.number),
+ b_tree_root_page.offset, len(b_tree_root_page.cells))
+ string += stringify_b_tree(version_interface, b_tree_root_page.right_most_page, padding + "\t")
+ for b_tree_interior_cell in b_tree_root_page.cells:
+ string += stringify_b_tree(version_interface, b_tree_interior_cell.left_child_page, padding + "\t")
+ else:
+ log_message = "The b-tree root page is not a b-tree root page type but instead: {} in version: {}."
+ log_message = log_message.format(b_tree_root_page.page_type, version_interface.number)
+ getLogger(LOGGER_NAME).error(log_message)
+ raise ValueError(log_message)
+
+ if not isinstance(b_tree_root_page, TableInteriorPage):
+ for cell in b_tree_root_page.cells:
+ if cell.has_overflow:
+ overflow_padding = padding
+ overflow_page = cell.overflow_pages[cell.overflow_page_number]
+ overflow_padding += "\t"
+ string += "\n" + overflow_padding + "Overflow Page -> {}: page version {} at offset {}"
+ string = string.format(overflow_page.number, version_interface.get_page_version(overflow_page.number),
+ overflow_page.offset)
+ while overflow_page.next_overflow_page_number:
+ overflow_page = cell.overflow_pages[overflow_page.next_overflow_page_number]
+ overflow_padding += "\t"
+ string += "\n" + overflow_padding + "Overflow Page -> {}: page version {} at offset {}"
+ string = string.format(overflow_page.number,
+ version_interface.get_page_version(overflow_page.number),
+ overflow_page.offset)
+
+ return string
+
+
+def stringify_cell_record(cell, database_text_encoding, page_type):
+ if page_type == PAGE_TYPE.B_TREE_TABLE_LEAF:
+
+ column_values = []
+ for record_column in cell.payload.record_columns:
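+            # Per the SQLite record format, serial types >= 13 that are odd denote TEXT values (while serial
+            # types >= 12 that are even denote BLOBs), so odd serial types >= 13 are treated as having text
+            # affinity here.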
+ text_affinity = True if record_column.serial_type >= 13 and record_column.serial_type % 2 == 1 else False
+ value = record_column.value
+ if record_column.value:
+ if text_affinity:
+ column_values.append(value.decode(database_text_encoding, "replace").encode(UTF_8))
+ else:
+ column_values.append(str(value))
+ else:
+ column_values.append("NULL")
+ content = "(" + ", ".join(column_values) + ")"
+ return "#{}: {}".format(cell.row_id, content)
+
+ elif page_type == PAGE_TYPE.B_TREE_INDEX_LEAF:
+
+ column_values = []
+ for record_column in cell.payload.record_columns:
+ text_affinity = True if record_column.serial_type >= 13 and record_column.serial_type % 2 == 1 else False
+ value = record_column.value
+ if record_column.value:
+ if text_affinity:
+ column_values.append(value.decode(database_text_encoding, "replace").encode(UTF_8))
+ else:
+ column_values.append(str(value))
+ else:
+ column_values.append("NULL")
+ content = "(" + ", ".join(column_values) + ")"
+ return content
+
+ else:
+ log_message = "Invalid page type specified for stringify cell record: {}. Page type should " \
+ "be either {} or {}."
+ log_message = log_message.format(page_type, PAGE_TYPE.B_TREE_TABLE_LEAF, PAGE_TYPE.B_TREE_INDEX_LEAF)
+ getLogger(LOGGER_NAME).error(log_message)
+ raise ValueError(log_message)
+
+
+def stringify_cell_records(cells, database_text_encoding, page_type):
+ cell_records = set()
+ for cell in cells:
+ cell_records.add(stringify_cell_record(cell, database_text_encoding, page_type))
+ return cell_records
+
+
+def stringify_master_schema_version(version):
+
+ string = ""
+
+ for master_schema_entry in version.master_schema.master_schema_entries:
+
+ entry_string = "Version: {} Added Master Schema Entry: Root Page Number: {} Type: {} Name: {} " \
+ "Table Name: {} SQL: {}.\n"
+ entry_string = entry_string.format(version.version_number, master_schema_entry.root_page_number,
+ master_schema_entry.row_type, master_schema_entry.name,
+ master_schema_entry.table_name, master_schema_entry.sql)
+ string += entry_string
+
+ return string
+
+
+def stringify_master_schema_versions(version_history):
+
+ string = ""
+
+ master_schema_entries = {}
+
+ for version_number, version in version_history.versions.iteritems():
+
+ if version.master_schema_modified:
+
+ modified_master_schema_entries = dict(map(lambda x: [x.md5_hash_identifier, x],
+ version.master_schema.master_schema_entries))
+
+ for md5_hash_identifier, master_schema_entry in modified_master_schema_entries.iteritems():
+
+ if md5_hash_identifier not in master_schema_entries:
+
+ added_string = "Version: {} Added Master Schema Entry: Root Page Number: {} Type: {} Name: {} " \
+ "Table Name: {} SQL: {}.\n"
+ added_string = added_string.format(version_number, master_schema_entry.root_page_number,
+ master_schema_entry.row_type, master_schema_entry.name,
+ master_schema_entry.table_name, master_schema_entry.sql)
+ string += added_string
+
+ master_schema_entries[md5_hash_identifier] = master_schema_entry
+
+ elif master_schema_entry.root_page_number != master_schema_entries[
+ md5_hash_identifier].root_page_number:
+
+ previous_root_page_number = master_schema_entries[md5_hash_identifier].root_page_number
+
+ updated_string = "Version: {} Updated Master Schema Entry: Root Page Number From: {} To: {} " \
+ "Type: {} Name: {} Table Name: {} SQL: {}.\n"
+ updated_string = updated_string.format(version_number, previous_root_page_number,
+ master_schema_entry.root_page_number,
+ master_schema_entry.row_type, master_schema_entry.name,
+ master_schema_entry.table_name, master_schema_entry.sql)
+ string += updated_string
+
+ master_schema_entries[md5_hash_identifier] = master_schema_entry
+
+ for md5_hash_identifier, master_schema_entry in master_schema_entries.iteritems():
+
+ if md5_hash_identifier not in modified_master_schema_entries:
+
+ removed_string = "Version: {} Removed Master Schema Entry: Root Page Number: {} Type: {} " \
+ "Name: {} Table Name: {} SQL: {}.\n"
+ removed_string = removed_string.format(version_number, master_schema_entry.root_page_number,
+ master_schema_entry.row_type, master_schema_entry.name,
+ master_schema_entry.table_name, master_schema_entry.sql)
+ string += removed_string
+
+ return string
+
+
+def stringify_page_history(version_history, padding=""):
+ string = ""
+ for version_number in version_history.versions:
+ string += "\n" if string else ""
+ string += stringify_version_pages(version_history.versions[version_number], padding)
+ return string
+
+
+def stringify_page_information(version, padding=""):
+ string = padding + "Page Breakdown:"
+ for page_type, page_array in get_page_breakdown(version.pages).iteritems():
+ page_array_length = len(page_array)
+ string += "\n" + padding + "\t" + "{}: {} Page Numbers: {}"
+ string = string.format(page_type, page_array_length, page_array)
+ string += "\n" + padding + "Page Structure:\n{}".format(stringify_page_structure(version, padding + "\t"))
+ if version.pointer_map_pages:
+ string += "\n" + padding + "Pointer Map Entry Breakdown across {} Pages:".format(version.database_size_in_pages)
+ for pointer_map_entry_breakdown in get_pointer_map_entries_breakdown(version):
+ string += "\n" + padding + "\t" + "Pointer Map Page {}: Page {} -> {} ({}) had Pointer Page Type (Hex) {}"
+ string = string.format(pointer_map_entry_breakdown[0], pointer_map_entry_breakdown[1],
+ pointer_map_entry_breakdown[2], pointer_map_entry_breakdown[3],
+ pointer_map_entry_breakdown[4])
+ return string
+
+
+def stringify_page_structure(version, padding=""):
+
+ string = padding + "{} Pages of {} bytes".format(version.database_size_in_pages, version.page_size)
+
+ string += "\n" + padding + "Database Root Page:"
+ string += stringify_b_tree(version, version.root_page, padding + "\t")
+
+ pointer_map_pages = version.pointer_map_pages
+ if pointer_map_pages:
+ for pointer_map_page in pointer_map_pages:
+ string += "\n" + padding + "Pointer Map Page -> {}".format(pointer_map_page.number)
+
+ freelist_trunk_page = version.first_freelist_trunk_page
+ if freelist_trunk_page:
+ string += "\n" + padding + "Freelist Trunk Page -> {}".format(freelist_trunk_page.number)
+ freelist_padding = padding + "\t"
+ for freelist_leaf_page in freelist_trunk_page.freelist_leaf_pages:
+ string += "\n" + freelist_padding + "Freelist Leaf Page -> {}".format(freelist_leaf_page.number)
+ while freelist_trunk_page.next_freelist_trunk_page:
+ freelist_trunk_page = freelist_trunk_page.next_freelist_trunk_page
+ string += "\n" + freelist_padding + "Freelist Trunk Page -> {}".format(freelist_trunk_page.number)
+ freelist_padding += "\t"
+ for freelist_leaf_page in freelist_trunk_page.freelist_leaf_pages:
+ string += "\n" + freelist_padding + "Freelist Leaf Page -> {}".format(freelist_leaf_page.number)
+
+ if version.master_schema:
+ string += "\n" + padding + "Master Schema Root Pages:"
+ for master_schema_root_page_number in version.master_schema.master_schema_b_tree_root_page_numbers:
+ master_schema_root_page = version.get_b_tree_root_page(master_schema_root_page_number)
+ string += stringify_b_tree(version, master_schema_root_page, padding + "\t")
+
+ return string
+
+
+def stringify_unallocated_space(version, padding="", include_empty_space=True):
+ string = ""
+ calculated_total_fragmented_bytes = 0
+ for page_number, page in version.pages.iteritems():
+
+ unallocated_content = page.unallocated_content
+ if len(unallocated_content):
+ if include_empty_space or has_content(unallocated_content):
+ string += "\n" if string else ""
+ string += padding + "Page #{}: {} Page Unallocated Space Start Offset: {} " \
+ "End Offset: {} Size: {} Hex: [{}]"
+ string = string.format(page_number, page.page_type, page.unallocated_space_start_offset,
+ page.unallocated_space_end_offset, page.unallocated_space_length,
+ hexlify(page.unallocated_content))
+
+ if isinstance(page, BTreePage):
+ for freeblock in page.freeblocks:
+ freeblock_content = freeblock.content
+ if len(freeblock_content) and has_content(freeblock_content):
+ string += "\n" if string else ""
+ string += padding + "Page #{}: {} Page Freeblock #{}: Unallocated Space Start Offset: {} " \
+ "End Offset: {} Size: {} Hex: [{}]"
+ string = string.format(page_number, page.page_type, freeblock.index, freeblock.start_offset,
+ freeblock.end_offset, freeblock.content_length,
+ hexlify(freeblock_content))
+
+ for fragment in page.fragments:
+ fragment_content = fragment.content
+ if fragment_content and has_content(fragment_content):
+ string += "\n" if string else ""
+ string += padding + "Page #{}: {} Page Fragment #{}: Unallocated Space Start Offset: {} " \
+ "End Offset: {} Size: {} Hex: [{}]"
+ string = string.format(page_number, page.page_type, fragment.index, fragment.start_offset,
+ fragment.end_offset, fragment.byte_size, hexlify(fragment_content))
+ calculated_total_fragmented_bytes += page.header.number_of_fragmented_free_bytes
+
+ string += "\n" if string else ""
+ string += padding + "Calculated Total Fragmented Bytes: {}".format(calculated_total_fragmented_bytes)
+ return string
+
+
+def stringify_version_pages(version, padding=""):
+ string = padding + "Version {} with {} of {} Pages: {}".format(version.version_number,
+ len(version.updated_page_numbers),
+ version.database_size_in_pages,
+ version.updated_page_numbers)
+
+ page_versions = {}
+ for page_number, page_version_number in version.page_version_index.iteritems():
+ if page_version_number in page_versions:
+ page_versions[page_version_number] = page_versions[page_version_number] + ", " + str(page_number)
+ else:
+ page_versions[page_version_number] = str(page_number)
+
+ for version_number in reversed(range(version.version_number + 1)):
+ page_version_string = "\n" + padding + "\t" + "Version: {} has Pages: {}"
+ if version_number in page_versions:
+ string += page_version_string.format(version_number, page_versions[version_number])
+ else:
+ string += page_version_string.format(version_number, str())
+ return string
diff --git a/sqlite_dissect/utilities.py b/sqlite_dissect/utilities.py
new file mode 100644
index 0000000..5a8f2ca
--- /dev/null
+++ b/sqlite_dissect/utilities.py
@@ -0,0 +1,248 @@
+from binascii import hexlify
+from hashlib import md5
+from logging import getLogger
+from re import compile
+from struct import pack
+from struct import unpack
+from sqlite_dissect.constants import ALL_ZEROS_REGEX
+from sqlite_dissect.constants import LOGGER_NAME
+from sqlite_dissect.constants import OVERFLOW_HEADER_LENGTH
+from sqlite_dissect.constants import BLOB_SIGNATURE_IDENTIFIER
+from sqlite_dissect.constants import STORAGE_CLASS
+from sqlite_dissect.constants import TEXT_SIGNATURE_IDENTIFIER
+from sqlite_dissect.exception import InvalidVarIntError
+
+"""
+
+utilities.py
+
+This script holds general utility functions for reference by the sqlite carving library.
+
+This script holds the following function(s):
+calculate_expected_overflow(overflow_byte_size, page_size)
+decode_varint(byte_array, offset)
+encode_varint(value)
+get_class_instance(class_name)
+get_md5_hash(string)
+get_record_content(serial_type, record_body, offset=0)
+get_serial_type_signature(serial_type)
+get_storage_class(serial_type)
+has_content(byte_array)
+
+"""
+
+
+def calculate_expected_overflow(overflow_byte_size, page_size):
+
+ overflow_pages = 0
+ last_overflow_page_content_size = overflow_byte_size
+
+ if overflow_byte_size > 0:
+ while overflow_byte_size > 0:
+ overflow_pages += 1
+ last_overflow_page_content_size = overflow_byte_size
+ overflow_byte_size = overflow_byte_size - page_size + OVERFLOW_HEADER_LENGTH
+
+ return overflow_pages, last_overflow_page_content_size
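+
+# Illustrative usage of calculate_expected_overflow (a minimal sketch, assuming 1024 byte pages and the
+# standard 4 byte next overflow page pointer for OVERFLOW_HEADER_LENGTH, so each overflow page holds
+# 1020 content bytes):
+#
+#   calculate_expected_overflow(2500, 1024)    # -> (3, 460)   ie. 1020 + 1020 + 460 content bytes
+#   calculate_expected_overflow(0, 1024)       # -> (0, 0)     ie. no overflow pages needed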
+
+
+def decode_varint(byte_array, offset=0):
+
+ unsigned_integer_value = 0
+ varint_relative_offset = 0
+
+ for x in xrange(1, 10):
+
+ varint_byte = ord(byte_array[offset + varint_relative_offset:offset + varint_relative_offset + 1])
+ varint_relative_offset += 1
+
+ if x == 9:
+ unsigned_integer_value <<= 1
+ unsigned_integer_value |= varint_byte
+ else:
+ msb_set = varint_byte & 0x80
+ varint_byte &= 0x7f
+ unsigned_integer_value |= varint_byte
+ if msb_set == 0:
+ break
+ else:
+ unsigned_integer_value <<= 7
+
+ signed_integer_value = unsigned_integer_value
+ if signed_integer_value & 0x80000000 << 32:
+ signed_integer_value -= 0x10000000000000000
+
+ return signed_integer_value, varint_relative_offset
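+
+# Illustrative usage of decode_varint (a minimal sketch): a single byte below 0x80 decodes to itself,
+# while the two byte sequence 0x81 0x01 decodes to 129. The second value returned is the number of
+# bytes consumed by the varint.
+#
+#   decode_varint("\x7f")        # -> (127, 1)
+#   decode_varint("\x81\x01")    # -> (129, 2)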
+
+
+def encode_varint(value):
+
+ max_allowed = 0x7fffffffffffffff
+ min_allowed = (max_allowed + 1) - 0x10000000000000000
+ if value > max_allowed or value < min_allowed:
+ log_message = "The value: {} is not able to be cast into a 64 bit signed integer for encoding."
+ log_message = log_message.format(value)
+ getLogger(LOGGER_NAME).error(log_message)
+ raise InvalidVarIntError(log_message)
+
+ byte_array = bytearray()
+
+ value += 1 << 64 if value < 0 else 0
+
+ if value & 0xff000000 << 32:
+
+ byte = value & 0xff
+ byte_array.insert(0, pack("B", byte))
+ value >>= 8
+
+ for _ in xrange(8):
+ byte_array.insert(0, pack("B", (value & 0x7f) | 0x80))
+ value >>= 7
+
+ else:
+
+ while value:
+ byte_array.insert(0, pack("B", (value & 0x7f) | 0x80))
+ value >>= 7
+
+ if len(byte_array) >= 9:
+ log_message = "The value: {} produced a varint with a byte array of length: {} beyond the 9 bytes " \
+ "allowed for a varint."
+ log_message = log_message.format(value, len(byte_array))
+ getLogger(LOGGER_NAME).error(log_message)
+ raise InvalidVarIntError(log_message)
+
+ byte_array[-1] &= 0x7f
+
+ return byte_array
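+
+# Illustrative usage of encode_varint (a minimal sketch): values are encoded big-endian in 7 bit groups
+# with the most significant bit cleared on the final byte, mirroring decode_varint above.
+#
+#   encode_varint(127)    # -> bytearray("\x7f")
+#   encode_varint(129)    # -> bytearray("\x81\x01")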
+
+
+def get_class_instance(class_name):
+ if class_name.find(".") != -1:
+ path_array = class_name.split(".")
+ module = ".".join(path_array[:-1])
+ instance = __import__(module)
+ for section in path_array[1:]:
+ instance = getattr(instance, section)
+ return instance
+ else:
+ log_message = "Class name: {} did not specify needed modules in order to initialize correctly."
+ log_message = log_message.format(log_message)
+ getLogger(LOGGER_NAME).error(log_message)
+ raise ValueError(log_message)
+
+
+def get_md5_hash(string):
+ md5_hash = md5()
+ md5_hash.update(string)
+ return md5_hash.hexdigest().upper()
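+
+# Illustrative usage of get_md5_hash (a minimal sketch): the digest is returned as an upper case
+# hexadecimal string.
+#
+#   get_md5_hash("abc")    # -> "900150983CD24FB0D6963F7D28E17F72"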
+
+
+def get_record_content(serial_type, record_body, offset=0):
+
+ # NULL
+ if serial_type == 0:
+ content_size = 0
+ value = None
+
+ # 8-bit twos-complement integer
+ elif serial_type == 1:
+ content_size = 1
+ value = unpack(b">b", record_body[offset:offset + content_size])[0]
+
+ # Big-endian 16-bit twos-complement integer
+ elif serial_type == 2:
+ content_size = 2
+ value = unpack(b">h", record_body[offset:offset + content_size])[0]
+
+ # Big-endian 24-bit twos-complement integer
+ elif serial_type == 3:
+ content_size = 3
+ value_byte_array = '\0' + record_body[offset:offset + content_size]
+ value = unpack(b">I", value_byte_array)[0]
+ if value & 0x800000:
+ value -= 0x1000000
+
+ # Big-endian 32-bit twos-complement integer
+ elif serial_type == 4:
+ content_size = 4
+ value = unpack(b">i", record_body[offset:offset + content_size])[0]
+
+ # Big-endian 48-bit twos-complement integer
+ elif serial_type == 5:
+ content_size = 6
+ value_byte_array = '\0' + '\0' + record_body[offset:offset + content_size]
+ value = unpack(b">Q", value_byte_array)[0]
+ if value & 0x800000000000:
+ value -= 0x1000000000000
+
+ # Big-endian 64-bit twos-complement integer
+ elif serial_type == 6:
+ content_size = 8
+ value = unpack(b">q", record_body[offset:offset + content_size])[0]
+
+ # Big-endian IEEE 754-2008 64-bit floating point number
+ elif serial_type == 7:
+ content_size = 8
+ value = unpack(b">d", record_body[offset:offset + content_size])[0]
+
+ # Integer constant 0 (schema format == 4)
+ elif serial_type == 8:
+ content_size = 0
+ value = 0
+
+ # Integer constant 1 (schema format == 4)
+ elif serial_type == 9:
+ content_size = 0
+ value = 1
+
+ # Serial types 10 and 11 are reserved for internal use and should not be found in sqlite files
+ elif serial_type in [10, 11]:
+ log_message = "Reserved serial type: {} found at offset: {} in record body: {}."
+ log_message = log_message.format(serial_type, offset, hexlify(record_body))
+ getLogger(LOGGER_NAME).error(log_message)
+ raise ValueError(log_message)
+
+ # A BLOB that is (N-12)/2 bytes in length
+ elif serial_type >= 12 and serial_type % 2 == 0:
+ content_size = (serial_type - 12) / 2
+ value = record_body[offset:offset + content_size]
+
+ # A string in the database encoding and is (N-13)/2 bytes in length. The nul terminator is omitted
+ elif serial_type >= 13 and serial_type % 2 == 1:
+ content_size = (serial_type - 13) / 2
+ value = record_body[offset:offset + content_size]
+
+ else:
+ log_message = "Invalid serial type: {} at offset: {} in record body: {}."
+ log_message = log_message.format(serial_type, offset, hexlify(record_body))
+ getLogger(LOGGER_NAME).error(log_message)
+ raise ValueError(log_message)
+
+ return content_size, value
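+
+# Illustrative usage of get_record_content (a minimal sketch): the serial type determines both the
+# content size and how the record body bytes are decoded.
+#
+#   get_record_content(0, "")         # -> (0, None)    NULL
+#   get_record_content(1, "\x7f")     # -> (1, 127)     8-bit signed integer
+#   get_record_content(21, "text")    # -> (4, "text")  string of (21 - 13) / 2 = 4 bytes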
+
+
+def get_serial_type_signature(serial_type):
+ if serial_type >= 12:
+ if serial_type % 2 == 0:
+ return BLOB_SIGNATURE_IDENTIFIER
+ elif serial_type % 2 == 1:
+ return TEXT_SIGNATURE_IDENTIFIER
+ return serial_type
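+
+# Illustrative usage of get_serial_type_signature (a minimal sketch): serial types below 12 are
+# returned unchanged while the variable length blob and text serial types collapse to their
+# signature identifiers.
+#
+#   get_serial_type_signature(6)     # -> 6
+#   get_serial_type_signature(12)    # -> BLOB_SIGNATURE_IDENTIFIER
+#   get_serial_type_signature(13)    # -> TEXT_SIGNATURE_IDENTIFIER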
+
+
+def get_storage_class(serial_type):
+ if serial_type == 0:
+ return STORAGE_CLASS.NULL
+ if serial_type in [1, 2, 3, 4, 5, 6, 8, 9]:
+ return STORAGE_CLASS.INTEGER
+ if serial_type == 7:
+ return STORAGE_CLASS.REAL
+ if serial_type >= 12 and serial_type % 2 == 0:
+ return STORAGE_CLASS.BLOB
+ if serial_type >= 13 and serial_type % 2 == 1:
+ return STORAGE_CLASS.TEXT
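+
+# Illustrative usage of get_storage_class (a minimal sketch): serial types map to the SQLite storage
+# classes defined in STORAGE_CLASS.
+#
+#   get_storage_class(0)     # -> STORAGE_CLASS.NULL
+#   get_storage_class(7)     # -> STORAGE_CLASS.REAL
+#   get_storage_class(12)    # -> STORAGE_CLASS.BLOB
+#   get_storage_class(13)    # -> STORAGE_CLASS.TEXT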
+
+
+def has_content(byte_array):
+ pattern = compile(ALL_ZEROS_REGEX)
+ if pattern.match(hexlify(byte_array)):
+ return False
+ return True
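+
+# Illustrative usage of has_content (a minimal sketch, assuming ALL_ZEROS_REGEX matches a hexadecimal
+# string made up entirely of zeros):
+#
+#   has_content("\x00\x00\x00")    # -> False  (all zero padding)
+#   has_content("\x00\x41\x00")    # -> True   (at least one non-zero byte)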
diff --git a/sqlite_dissect/version_history.py b/sqlite_dissect/version_history.py
new file mode 100644
index 0000000..4d3336f
--- /dev/null
+++ b/sqlite_dissect/version_history.py
@@ -0,0 +1,813 @@
+from logging import getLogger
+from re import sub
+from warnings import warn
+from sqlite_dissect.carving.carver import SignatureCarver
+from sqlite_dissect.constants import BASE_VERSION_NUMBER
+from sqlite_dissect.constants import CELL_SOURCE
+from sqlite_dissect.constants import COMMIT_RECORD_BASE_VERSION_NUMBER
+from sqlite_dissect.constants import LOGGER_NAME
+from sqlite_dissect.constants import MASTER_SCHEMA_ROW_TYPE
+from sqlite_dissect.constants import PAGE_TYPE
+from sqlite_dissect.exception import VersionParsingError
+from sqlite_dissect.exception import WalCommitRecordParsingError
+from sqlite_dissect.exception import WalFrameParsingError
+from sqlite_dissect.file.database.page import BTreePage
+from sqlite_dissect.file.database.utilities import aggregate_leaf_cells
+from sqlite_dissect.file.database.utilities import get_pages_from_b_tree_page
+from sqlite_dissect.file.schema.master import VirtualTableRow
+from sqlite_dissect.file.wal.commit_record import WriteAheadLogCommitRecord
+from sqlite_dissect.file.version_parser import VersionParser
+
+"""
+
+version_history.py
+
+This script holds the superclass objects used for parsing the database and write ahead log in a sequence of versions
+throughout all of the commit records in the write ahead log.
+
+This script holds the following object(s):
+VersionHistory(object)
+VersionHistoryParser(VersionParser) (with VersionHistoryParserIterator(object) as an inner class)
+Commit(object)
+
+"""
+
+
+class VersionHistory(object):
+
+ """
+
+
+
+ This class represents the SQL database and WAL commit records as a sequence of versions. This way the changes
+ from commit record to commit record can be viewed and worked with and each version has information in them that
+ lends to them being carved easier. Here version 0 (BASE_VERSION_NUMBER) is used to always represent the main
+ database and then 1 to N versions following the base version represent the commit records up to N. To note,
+ the final commit record, N, has the possibility of being half written and not committed depending if the
+ committed page size is set in one of the frames in the commit record or not.
+
+ """
+
+ def __init__(self, database, write_ahead_log=None):
+
+ logger = getLogger(LOGGER_NAME)
+
+ # Set the database and write ahead log
+ self._database = database
+ self._write_ahead_log = write_ahead_log
+
+ """
+
+ Initialize the versions in the form of:
+ versions[VERSION_NUMBER] = database where VERSION_NUMBER = BASE_VERSION_NUMBER (0)
+ versions[VERSION_NUMBER] = commit_record_VERSION_NUMBER where VERSION_NUMBER is 1 to N for N commit records.
+
+ """
+
+ self.versions = {BASE_VERSION_NUMBER: self._database}
+
+ if self._write_ahead_log:
+
+ # Set the database text encoding to the write ahead log file if it was set in the database file
+ if self._database.database_text_encoding:
+ self._write_ahead_log.file_handle.database_text_encoding = self._database.database_text_encoding
+
+ # Set the last database header and master schema to refer to
+ last_database_header = self._database.database_header
+ last_master_schema = self._database.master_schema
+
+ # These two dictionaries will be updated and sent into every commit record
+ page_version_index = self._database.page_version_index
+ page_frame_index = {}
+
+ # Setup variables for frame association with commit records
+ frames = []
+ commit_record_number = COMMIT_RECORD_BASE_VERSION_NUMBER
+
+ # Iterate through all of the frames in the write ahead log
+ for frame_index in range(len(self._write_ahead_log.frames)):
+
+ # Set the frame
+ frame = self._write_ahead_log.frames[frame_index]
+
+ # Make sure the frame index matches the frame
+ if frame_index != frame.frame_index:
+ log_message = "Current frame index: {} did not match the expected frame index: {} while parsing " \
+ "frames for commit record version: {}."
+ log_message = log_message.format(frame_index, frame.frame_index, commit_record_number)
+ logger.error(log_message)
+ raise WalFrameParsingError(log_message)
+
+ # Add the frame to the frames array
+ frames.append(frame)
+
+ # Make sure the frame belongs to the commit record we are currently working on creating
+ if frame.commit_record_number != commit_record_number:
+ log_message = "Current frame commit record number: {} did not match the expected commit record " \
+ "number : {}."
+ log_message = log_message.format(frame.commit_record_number, commit_record_number)
+ logger.error(log_message)
+ raise WalFrameParsingError(log_message)
+
+ """
+
+ According to SQLite documentation, the frame with the page size after commit field in the header set
+ is the commit frame and therefore all frames before this one (up to the previous one) are considered
+ the commit record. No frames will appear beyond this frame with additional information in this commit
+ record.
+
+ """
+
+ # Check if this frame is a commit frame
+ if frame.commit_frame:
+
+ # Create the commit record since we now have all the frames for this commit record
+ commit_record = WriteAheadLogCommitRecord(commit_record_number, self._database,
+ self._write_ahead_log, frames, page_frame_index,
+ page_version_index, last_database_header,
+ last_master_schema,
+ store_in_memory=write_ahead_log.store_in_memory,
+ strict_format_checking=write_ahead_log.strict_format_checking)
+
+ if commit_record.database_header_modified:
+ last_database_header = commit_record.database_header
+
+ if not last_database_header:
+ log_message = "Database header was detected as modified for commit record version: {} " \
+ "but no database header was found."
+ log_message = log_message.format(commit_record_number)
+ logger.error(log_message)
+ raise WalCommitRecordParsingError(log_message)
+
+ if commit_record.master_schema_modified:
+ last_master_schema = commit_record.master_schema
+
+ if not last_master_schema:
+ log_message = "Master schema was detected as modified for commit record version: {} " \
+ "but no master schema was found."
+ log_message = log_message.format(commit_record_number)
+ logger.error(log_message)
+ raise WalCommitRecordParsingError(log_message)
+
+ # Set the page version and page frame dictionaries variables for the next commit record
+ page_frame_index = commit_record.page_frame_index
+ page_version_index = commit_record.page_version_index
+
+ self.versions[commit_record_number] = commit_record
+
+ # Increment the commit record number and clear the frames array (reset to an empty array).
+ commit_record_number += 1
+ frames = []
+
+ # Check if there are remaining frames which indicates the last commit record was not committed
+ if len(frames) > 0:
+
+ # Create the commit record
+ commit_record = WriteAheadLogCommitRecord(commit_record_number, self._database, self._write_ahead_log,
+ frames, page_frame_index, page_version_index,
+ last_database_header, last_master_schema,
+ store_in_memory=write_ahead_log.store_in_memory,
+ strict_format_checking=write_ahead_log.strict_format_checking)
+
+ """
+
+ Note: We do not need to worry about setting the last database header or last master schema here. We
+ also do not need to worry about setting the page frame index or page version index.
+
+ """
+
+ self.versions[commit_record_number] = commit_record
+
+ """
+
+ Since we have not seen use cases where the write ahead log file has had additional frames beyond the
+ last frame that was a commit frame, we throw a warning here since this use case could result in
+ adverse logic.
+
+ """
+
+ log_message = "Version (commit record): {} has additional frames beyond the last commit frame found " \
+ "in the write ahead log and erroneous use cases may occur when parsing."
+ log_message = log_message.format(commit_record_number)
+ logger.warn(log_message)
+ warn(log_message, RuntimeWarning)
+
+ # Set the number of versions
+ self.number_of_versions = len(self.versions)
+
+ def __repr__(self):
+ return self.__str__().encode("hex")
+
+ def __str__(self):
+ return sub("\t", "", sub("\n", " ", self.stringify()))
+
+ def stringify(self, padding="", print_versions=True):
+ string = "File Type: {}"
+ string = string.format(self.number_of_versions)
+ if print_versions:
+ for version in self.versions:
+ string += "\n" + padding + "Page:\n{}".format(version.stringify(padding + "\t"))
+ return string
+
+
+class VersionHistoryParser(VersionParser):
+
+ def __init__(self, version_history, master_schema_entry,
+ version_number=None, ending_version_number=None, signature=None, carve_freelist_pages=False):
+
+ """
+
+
+
+ Note: The updated cells currently only apply to table leaf pages (table master schema entries that are not
+ "without rowid" tables). Therefore no index pages will have updated cells. This is because updates
+ are currently determined from the row id, which is only available in the b-tree table pages.
+ However, it is important to note that even if the row id is the same and the cell is determined
+ to have been updated by this process, there is still a chance it was not actually updated and this
+ assumption is in error. Additional checking may need to be done against the file offsets and/or
+ primary keys. (Although file offsets, as well as page numbers, can change on updates or vacuuming.)
+ More investigation needs to be done into table pages as well as how to determine updates for index pages.
+
+ Note: If there are duplicate entries found in consecutive versions (i.e. entries that did not change), those
+ will be left out and only reported in the first version they are found ("added"). The first version
+ to be parsed, whether that be the base version or one of the commit records, will have all entries
+ considered "added" and no deleted or updated entries.
+
+ :param version_history:
+ :param master_schema_entry:
+ :param version_number:
+ :param ending_version_number:
+
+ :return:
+
+ :raise:
+
+ """
+
+ # Call to the super class
+ super(VersionHistoryParser, self).__init__(version_history, master_schema_entry,
+ version_number, ending_version_number)
+
+ logger = getLogger(LOGGER_NAME)
+
+ self._versions = version_history.versions
+
+ log_message = "Creating version history parser for master schema entry with name: {} table name: {} " \
+ "row type: {} and sql: {} for version number: {} and ending version number: {}."
+ log_message = log_message.format(self.name, self.table_name, self.row_type, self.sql,
+ self.parser_starting_version_number, self.parser_ending_version_number)
+ logger.debug(log_message)
+
+ self._virtual_table = isinstance(master_schema_entry, VirtualTableRow)
+
+ if signature:
+
+ if signature.name != self.name:
+ log_message = "Invalid signature name: {} for version history parser on master schema entry name: {}."
+ log_message = log_message.format(signature.name, self.name)
+ logger.error(log_message)
+ raise ValueError(log_message)
+
+ if signature.row_type != self.row_type:
+ log_message = "Invalid signature row type: {} for signature name: {} for version history parser on " \
+ "master schema entry name: {} and row type: {}."
+ log_message = log_message.format(signature.row_type, signature.name, self.name, self.row_type)
+ logger.error(log_message)
+ raise ValueError(log_message)
+
+ if signature.row_type != MASTER_SCHEMA_ROW_TYPE.TABLE:
+ log_message = "Not carving version history parser for master schema entry with name: {} table " \
+ "name: {} row type: {} and sql: {} for version number: {} and ending version number: " \
+ "{} since the row type is not a {} type but: {}."
+ log_message = log_message.format(self.name, self.table_name, self.row_type, self.sql,
+ self.parser_starting_version_number, self.parser_ending_version_number,
+ MASTER_SCHEMA_ROW_TYPE.TABLE, signature.row_type)
+ logger.warn(log_message)
+ warn(log_message, RuntimeWarning)
+
+ # Set the signature
+ self.signature = signature
+
+ self.carve_freelist_pages = carve_freelist_pages
+
+ if self.carve_freelist_pages and not self.signature:
+ log_message = "Carve freelist pages set with no signature defined. A signatures is needed in order to " \
+ "carve freelist pages for master schema entry with name: {} table name: {} row type: {} " \
+ "and sql: {} for version number: {} and ending version number: {}."
+ log_message = log_message.format(self.name, self.table_name, self.row_type, self.sql,
+ self.parser_starting_version_number, self.parser_ending_version_number)
+ logger.error(log_message)
+ raise ValueError(log_message)
+
+ def __iter__(self):
+ if self.row_type not in [MASTER_SCHEMA_ROW_TYPE.TABLE, MASTER_SCHEMA_ROW_TYPE.INDEX]:
+ # Return an empty iterator
+ return iter([])
+ elif self._virtual_table:
+
+ """
+
+ In the case that this is a virtual table, we check to see if the root page is 0. Additional use case
+ handling for virtual tables needs to be investigated. For now, if a virtual table exists with a
+ root page of 0 we do not iterate through it and instead return an empty iterator since we do not
+ have anything to iterate. We do throw a warning here (again) for informative purposes.
+
+ Note: If any root page number in the root page number version index is not 0, an exception is raised.
+
+ """
+
+ # Check to make sure all root page numbers are 0 as should be with virtual tables.
+ if not all(root_page_number == 0 for root_page_number in self.root_page_number_version_index.values()):
+ log_message = "Virtual table found with root page version index: {} where all root page numbers " \
+ "are not equal to 0 in version history parser for master schema entry with " \
+ "name: {} table name: {} row type: {} and sql: {} for version number: {} " \
+ "and ending version number: {}."
+ log_message = log_message.format(self.root_page_number_version_index,
+ self.name, self.table_name, self.row_type, self.sql,
+ self.parser_starting_version_number,
+ self.parser_ending_version_number)
+ getLogger(LOGGER_NAME).error(log_message)
+ raise ValueError(log_message)
+
+ log_message = "Virtual table found with root page 0 for in version history parser for master schema " \
+ "entry with name: {} table name: {} row type: {} and sql: {} for version number: {} " \
+ "and ending version number: {}. An iterator will not be returned since there " \
+ "is no content."
+ log_message = log_message.format(self.name, self.table_name, self.row_type, self.sql,
+ self.parser_starting_version_number, self.parser_ending_version_number)
+ getLogger(LOGGER_NAME).warn(log_message)
+ warn(log_message, RuntimeWarning)
+
+ # Return an empty iterator
+ return iter([])
+
+ elif self.parser_starting_version_number is not None and self.parser_ending_version_number is not None:
+ return self.VersionParserIterator(self.name, self._versions, self.page_type,
+ self.parser_starting_version_number, self.parser_ending_version_number,
+ self.root_page_number_version_index,
+ self.signature, self.carve_freelist_pages)
+ else:
+ # Return an empty iterator
+ return iter([])
+
+ def stringify(self, padding="", print_cells=True):
+ string = ""
+ for commit in self:
+ string += "\n" + padding + "Commit:\n{}".format(commit.stringify(padding + "\t", print_cells))
+ return super(VersionHistoryParser, self).stringify(padding) + string
+
+ class VersionParserIterator(object):
+
+ """
+
+
+
+ Note: See VersionHistoryParser class documentation regarding entries returned from this iterator
+ (specifically on updates).
+
+ """
+
+ def __init__(self, name, versions, page_type, parser_starting_version_number, parser_ending_version_number,
+ root_page_number_version_index, signature=None, carve_freelist_pages=False):
+ self._name = name
+ self._versions = versions
+ self._page_type = page_type
+ self._parser_starting_version_number = parser_starting_version_number
+ self._parser_ending_version_number = parser_ending_version_number
+ self._root_page_number_version_index = root_page_number_version_index
+
+ # Set the signature
+ self._signature = signature
+
+ self._carve_freelist_pages = carve_freelist_pages
+
+ # Initialize the current cells
+ self._current_cells = {}
+
+ # Initialize the carved cell md5 hex digests
+ self._carved_cell_md5_hex_digests = []
+
+ # Initialize the current b-tree page numbers
+ self._current_b_tree_page_numbers = []
+
+ self._current_version_number = self._parser_starting_version_number
+
+ def __iter__(self):
+ return self
+
+ def __repr__(self):
+ return self.__str__().encode("hex")
+
+ def __str__(self):
+ return sub("\t", "", sub("\n", " ", self.stringify()))
+
+ def stringify(self, padding="", print_cells=True):
+ string = padding + "Page Type: {}\n" \
+ + padding + "Parser Starting Version Number: {}\n" \
+ + padding + "Parser Ending Version Number: {}\n" \
+ + padding + "Root Page Number Version Index: {}\n" \
+ + padding + "Current Version Number: {}\n" \
+ + padding + "Current B-Tree Page Numbers: {}\n" \
+ + padding + "Carve Freelist Pages: {}"
+ string = string.format(self._page_type,
+ self._parser_starting_version_number,
+ self._parser_ending_version_number,
+ self._root_page_number_version_index,
+ self._current_version_number,
+ self._current_b_tree_page_numbers,
+ self._carve_freelist_pages)
+ if print_cells:
+ for current_cell in self._current_cells.itervalues():
+ string += "\n" + padding + "Cell:\n{}".format(current_cell.stringify(padding + "\t"))
+ return string
+
+ def next(self):
+
+ if self._current_version_number <= self._parser_ending_version_number:
+
+ version = self._versions[self._current_version_number]
+ root_page_number = self._root_page_number_version_index[self._current_version_number]
+
+ # Create the commit object
+ commit = Commit(self._name, version.file_type, self._current_version_number,
+ version.database_text_encoding, self._page_type, root_page_number, self._current_b_tree_page_numbers)
+
+ b_tree_updated = False
+
+ # Check if this is the first version to be investigated
+ if self._current_version_number == self._parser_starting_version_number:
+ b_tree_updated = True
+
+ # Check if the root page number changed
+ elif root_page_number != self._root_page_number_version_index[self._current_version_number - 1]:
+ b_tree_updated = True
+
+ # Check if any of the pages changed (other than the root page specifically here)
+ elif [page_number for page_number in self._current_b_tree_page_numbers
+ if page_number in version.updated_b_tree_page_numbers]:
+ b_tree_updated = True
+
+ # Parse the b-tree page structure if it was updated
+ if b_tree_updated:
+
+ # Get the root page and root page numbers from the first version
+ root_page = version.get_b_tree_root_page(root_page_number)
+ b_tree_pages = get_pages_from_b_tree_page(root_page)
+ self._current_b_tree_page_numbers = [b_tree_page.number for b_tree_page in b_tree_pages]
+
+ # Update the b-tree page numbers in the commit record
+ commit.b_tree_page_numbers = self._current_b_tree_page_numbers
+
+ updated_b_tree_page_numbers = [page_number for page_number in self._current_b_tree_page_numbers
+ if page_number in version.updated_b_tree_page_numbers]
+
+ # Set the updated b-tree page numbers in the commit object
+ commit.updated_b_tree_page_numbers = updated_b_tree_page_numbers
+
+ """
+
+ Below we aggregate the cells together. This function returns the total of cells and then
+ a dictionary of cells indexed by their cell md5 hex digest to record. Here, we do not
+ want to ignore any entries since we want to be able to obtain those that were added along
+ with cells that were deleted and/or updated. Therefore, the total should match the length
+ of the cells returned.
+
+ """
+
+ total, cells = aggregate_leaf_cells(root_page)
+
+ if total != len(cells):
+ log_message = "The total aggregated leaf cells: {} does not match the length of the " \
+ "cells parsed: {} for version: {} of page type: {} iterating between versions " \
+ "{} and {} over b-tree page numbers: {} with updated b-tree pages: {}."
+ log_message = log_message.format(total, len(cells), self._current_version_number,
+ self._page_type, self._parser_starting_version_number,
+ self._parser_ending_version_number,
+ self._current_b_tree_page_numbers,
+ updated_b_tree_page_numbers)
+ getLogger(LOGGER_NAME).error(log_message)
+ raise VersionParsingError(log_message)
+
+ """
+
+ Go through the cells and determine which cells have been added, deleted, and/or updated.
+
+ """
+
+ # Copy the newly found cells to a new dictionary
+ added_cells = dict.copy(cells)
+
+ # Initialize the deleted cells
+ deleted_cells = {}
+
+ # Iterate through the current cells
+ for current_cell_md5, current_cell in self._current_cells.iteritems():
+
+ # Remove the cell from the added cells if it was already pre-existing
+ if current_cell_md5 in added_cells:
+ del added_cells[current_cell_md5]
+
+ # The cell was in the previously current cells but now deleted
+ else:
+ deleted_cells[current_cell_md5] = current_cell
+
+ # Set the current cells to this versions cells
+ self._current_cells = cells
+
+ """
+
+ At this point we have the following two dictionaries:
+ added_cells: All of the cells that were found to be new in this version for this table/index.
+ deleted_cells: All of the cells that were found to be deleted in this version for this table/index.
+
+ The current cells are set back to the cells for future version iterations to compare against. This
+ is set to the whole dictionary of cells and not the added cells since pre-existing cells can carry
+ over into consecutive versions.
+
+ """
+
+ if self._page_type == PAGE_TYPE.B_TREE_TABLE_LEAF:
+
+ # Organize an added cells dictionary keyed off of the row id
+ added_cells_by_row_id = {added_cell.row_id: added_cell for added_cell in added_cells.values()}
+
+ # Get the row ids of the cells that were updated by checking against the deleted cells
+ updated_cell_row_ids = [deleted_cell.row_id for deleted_cell in deleted_cells.values()
+ if deleted_cell.row_id in added_cells_by_row_id]
+
+ # Get the cells that might possibly have been updated by comparing the row ids
+ updated_cells = {updated_cell.md5_hex_digest: updated_cell
+ for updated_cell in added_cells.values()
+ if updated_cell.row_id in updated_cell_row_ids}
+
+ # Update the deleted cells to remove any possibly updated cells just determined
+ deleted_cells = {deleted_cell.md5_hex_digest: deleted_cell
+ for deleted_cell in deleted_cells.values()
+ if deleted_cell.row_id not in updated_cell_row_ids}
+
+ # Before we can set the added cells, we need to remove the updated cells detected above
+ added_cells = {added_cell.md5_hex_digest: added_cell
+ for added_cell in added_cells.values()
+ if added_cell.md5_hex_digest not in updated_cells}
+
+ # Set the added, updated, and deleted cells
+ commit.added_cells = added_cells
+ commit.updated_cells = updated_cells
+ commit.deleted_cells = deleted_cells
+
+ """
+
+ Right now we only carve if the signature is specified and only from pages that were updated in
+ this particular b-tree in this version.
+
+ Note: Once index page carving is implemented this section will need to be updated to correctly
+ address it.
+
+ """
+
+ if self._signature:
+ log_message = "Carving table master schema entry name: {} for page type: {} for version: " \
+ "{} with root page: {} between versions {} and {} over b-tree page " \
+ "numbers: {} with updated b-tree pages: {}."
+ log_message = log_message.format(self._signature.name, self._page_type,
+ self._current_version_number,
+ root_page_number, self._parser_starting_version_number,
+ self._parser_ending_version_number,
+ self._current_b_tree_page_numbers,
+ updated_b_tree_page_numbers)
+ getLogger(LOGGER_NAME).debug(log_message)
+
+ # Initialize the carved cells
+ carved_cells = []
+
+ b_tree_pages_by_number = {b_tree_page.number: b_tree_page for b_tree_page in b_tree_pages}
+
+ for updated_b_tree_page_number in updated_b_tree_page_numbers:
+
+ page = b_tree_pages_by_number[updated_b_tree_page_number]
+
+ # For carving freeblocks make sure the page is a b-tree page and not overflow
+ if isinstance(page, BTreePage):
+ carvings = SignatureCarver.carve_freeblocks(version, CELL_SOURCE.B_TREE,
+ page.freeblocks, self._signature)
+ carved_cells.extend(carvings)
+
+ # Carve unallocated space
+ carvings = SignatureCarver.carve_unallocated_space(version, CELL_SOURCE.B_TREE,
+ updated_b_tree_page_number,
+ page.unallocated_space_start_offset,
+ page.unallocated_space,
+ self._signature)
+ carved_cells.extend(carvings)
+
+ # Remove all carved cells that may be duplicates from previous version carvings
+ carved_cells = {carved_cell.md5_hex_digest: carved_cell for carved_cell in carved_cells
+ if carved_cell.md5_hex_digest not in self._carved_cell_md5_hex_digests}
+
+ # Update the carved cells in the commit object
+ commit.carved_cells.update(carved_cells)
+
+ # Update the carved cell md5 hex digests
+ self._carved_cell_md5_hex_digests.extend([cell_md5_hex_digest
+ for cell_md5_hex_digest in carved_cells.keys()])
+
+ elif self._page_type == PAGE_TYPE.B_TREE_INDEX_LEAF:
+
+ # Set the added cells
+ commit.added_cells = added_cells
+
+ # As noted above, we will not define updates for index cells yet so just set the deleted cells
+ commit.deleted_cells = deleted_cells
+
+ else:
+ log_message = "Invalid page type: {} found for version: {} iterating between versions {} " \
+ "and {} over b-tree page numbers: {} with updated b-tree pages: {}."
+ log_message = log_message.format(self._page_type, self._current_version_number,
+ self._parser_starting_version_number,
+ self._parser_ending_version_number,
+ self._current_b_tree_page_numbers,
+ updated_b_tree_page_numbers)
+ getLogger(LOGGER_NAME).error(log_message)
+ raise VersionParsingError(log_message)
+
+ """
+
+ Note: The outer class checks on if the signature is defined in relation to the carving of freelist
+ pages being set and handles it accordingly. Here we can assume that the signature is defined
+ if we are carving freelist pages.
+
+ """
+
+ # See if we are also carving the freelist pages
+ if self._carve_freelist_pages:
+
+ freelist_pages_updated = False
+
+ # Check if this is the first version to be investigated
+ if self._current_version_number == self._parser_starting_version_number:
+ freelist_pages_updated = True
+
+ # Check if the freelist pages were modified in this version
+ elif version.freelist_pages_modified:
+ freelist_pages_updated = True
+
+ # Carve the freelist pages if any were updated
+ if freelist_pages_updated:
+
+ """
+
+ Note: We only have to worry about carving the B_TREE_TABLE_LEAF pages right now since this is
+ the only page type really supported in carving so far. The super class already prints the
+ needed warnings that carving will not occur if it is a B_TREE_INDEX_LEAF page.
+
+ Note: As also stated above the signature by this point will be set.
+
+ """
+
+ if self._page_type == PAGE_TYPE.B_TREE_TABLE_LEAF:
+
+ # Populate the updated freelist pages into a dictionary keyed by page number
+ updated_freelist_pages = {}
+ freelist_trunk_page = version.first_freelist_trunk_page
+ while freelist_trunk_page:
+ if freelist_trunk_page.number in version.updated_page_numbers:
+ updated_freelist_pages[freelist_trunk_page.number] = freelist_trunk_page
+ for freelist_leaf_page in freelist_trunk_page.freelist_leaf_pages:
+ if freelist_leaf_page.number in version.updated_page_numbers:
+ updated_freelist_pages[freelist_leaf_page.number] = freelist_leaf_page
+ freelist_trunk_page = freelist_trunk_page.next_freelist_trunk_page
+
+ # Update the commit object
+ commit.freelist_pages_carved = True
+ commit.updated_freelist_page_numbers = updated_freelist_pages.keys()
+
+ log_message = "Carving freelist pages for table master schema entry name: {} " \
+ "for page type: {} for version: {} with root page: {} between versions {} " \
+ "and {} over updated freelist pages: {}."
+ log_message = log_message.format(self._signature.name, self._page_type,
+ self._current_version_number,
+ root_page_number, self._parser_starting_version_number,
+ self._parser_ending_version_number,
+ updated_freelist_pages.keys())
+ getLogger(LOGGER_NAME).debug(log_message)
+
+ # Initialize the carved cells
+ carved_cells = []
+
+ for freelist_page_number, freelist_page in updated_freelist_pages.iteritems():
+
+ # Carve unallocated space
+ carvings = SignatureCarver.carve_unallocated_space(version, CELL_SOURCE.FREELIST,
+ freelist_page_number,
+ freelist_page.unallocated_space_start_offset,
+ freelist_page.unallocated_space,
+ self._signature)
+
+ carved_cells.extend(carvings)
+
+ # Remove all carved cells that may be duplicates from previous version carvings
+ carved_cells = {carved_cell.md5_hex_digest: carved_cell for carved_cell in carved_cells
+ if carved_cell.md5_hex_digest not in self._carved_cell_md5_hex_digests}
+
+ # Update the carved cells in the commit object
+ commit.carved_cells.update(carved_cells)
+
+ # Update the carved cell md5 hex digests
+ self._carved_cell_md5_hex_digests.extend([cell_md5_hex_digest
+ for cell_md5_hex_digest in carved_cells.keys()])
+
+ # Increment the current version number
+ self._current_version_number += 1
+
+ # Return the commit object
+ return commit
+
+ else:
+ raise StopIteration()
+
+
+class Commit(object):
+
+ def __init__(self, name, file_type, version_number, database_text_encoding, page_type, root_page_number,
+ b_tree_page_numbers, updated_b_tree_page_numbers=None, freelist_pages_carved=False,
+ updated_freelist_page_numbers=None):
+
+ """
+
+
+
+ Note: This may not be updated in the case where carved cells were found, but found to be duplicates of a
+ previous commit and therefore removed.
+
+ :param name:
+ :param file_type:
+ :param version_number:
+ :param database_text_encoding:
+ :param page_type:
+ :param root_page_number:
+ :param b_tree_page_numbers:
+ :param updated_b_tree_page_numbers:
+ :param freelist_pages_carved:
+ :param updated_freelist_page_numbers:
+
+ :return:
+
+ """
+
+ self.name = name
+ self.file_type = file_type
+ self.version_number = version_number
+ self.database_text_encoding = database_text_encoding
+ self.page_type = page_type
+ self.root_page_number = root_page_number
+
+ self.b_tree_page_numbers = b_tree_page_numbers
+
+ self.updated_b_tree_page_numbers = updated_b_tree_page_numbers
+ self.freelist_pages_carved = freelist_pages_carved
+ self.updated_freelist_page_numbers = updated_freelist_page_numbers
+ self.added_cells = {}
+ self.deleted_cells = {}
+ self.updated_cells = {}
+ self.carved_cells = {}
+
+ def __repr__(self):
+ return self.__str__().encode("hex")
+
+ def __str__(self):
+ return sub("\t", "", sub("\n", " ", self.stringify()))
+
+ def stringify(self, padding="", print_cells=True):
+ string = padding + "Version Number: {}\n" \
+ + padding + "Database Text Encoding: {}\n" \
+ + padding + "Page Type: {}\n" \
+ + padding + "Root Page Number: {}\n" \
+ + padding + "B-Tree Page Numbers: {}\n" \
+ + padding + "Updated: {}\n" \
+ + padding + "Updated B-Tree Page Numbers: {}\n" \
+ + padding + "Freelist Pages Carved: {}\n" \
+ + padding + "Updated Freelist Page Numbers: {}\n"
+ string = string.format(self.version_number,
+ self.database_text_encoding,
+ self.page_type,
+ self.root_page_number,
+ self.b_tree_page_numbers,
+ self.updated,
+ self.updated_b_tree_page_numbers,
+ self.freelist_pages_carved,
+ self.updated_freelist_page_numbers)
+ if print_cells:
+ for added_cell in self.added_cells.itervalues():
+ string += "\n" + padding + "Added Cell:\n{}".format(added_cell.stringify(padding + "\t"))
+ for deleted_cell in self.deleted_cells.itervalues():
+ string += "\n" + padding + "Deleted Cell:\n{}".format(deleted_cell.stringify(padding + "\t"))
+ for updated_cell in self.updated_cells.itervalues():
+ string += "\n" + padding + "Updated Cell:\n{}".format(updated_cell.stringify(padding + "\t"))
+ for carved_cell in self.carved_cells.itervalues():
+ string += "\n" + padding + "Carved Cell:\n{}".format(carved_cell.stringify(padding + "\t"))
+ return string
+
+ @property
+ def updated(self):
+ return bool(self.added_cells or self.deleted_cells or self.updated_cells or self.carved_cells)