From 48b319c38808197d0e6ec32425ce02b290649299 Mon Sep 17 00:00:00 2001 From: Andrew Northall Date: Fri, 7 Feb 2025 03:59:44 +0000 Subject: [PATCH] refactor: move data files to public s3 bucket --- README.md | 26 +- data/azure-ocr/json/ACA 1991.pdf.json | 3 - data/azure-ocr/json/ACA 1992.pdf.json | 3 - data/azure-ocr/json/ACA 1993.pdf.json | 3 - data/azure-ocr/json/ACA 1994-1995.pdf.json | 3 - data/azure-ocr/json/ACA 1996-1998.pdf.json | 3 - data/azure-ocr/json/ACA 1999-2001.pdf.json | 3 - data/azure-ocr/json/ACA 2002-2003.pdf.json | 3 - data/azure-ocr/json/ACA 2004-2005.pdf.json | 3 - data/azure-ocr/json/ACA 2006.pdf.json | 3 - data/azure-ocr/json/ACA 2007-2008.pdf.json | 3 - data/azure-ocr/json/ACA 2009-2010.pdf.json | 3 - data/azure-ocr/json/ACA 2011-2012.pdf.json | 3 - data/azure-ocr/json/ACA 2013-2014.pdf.json | 3 - .../ACA 2015-2016 (50th Anniversary).pdf.json | 3 - data/azure-ocr/json/ACA 2017-2018.pdf.json | 3 - data/azure-ocr/json/ACA 2019-2020.pdf.json | 3 - data/azure-ocr/json/ACA 2021-2022.pdf.json | 3 - data/azure-ocr/txt/ACA 1991.pdf.txt | 3 - data/azure-ocr/txt/ACA 1992.pdf.txt | 3 - data/azure-ocr/txt/ACA 1993.pdf.txt | 3 - data/azure-ocr/txt/ACA 1994-1995.pdf.txt | 3 - data/azure-ocr/txt/ACA 1996-1998.pdf.txt | 3 - data/azure-ocr/txt/ACA 1999-2001.pdf.txt | 3 - data/azure-ocr/txt/ACA 2002-2003.pdf.txt | 3 - data/azure-ocr/txt/ACA 2004-2005.pdf.txt | 3 - data/azure-ocr/txt/ACA 2006.pdf.txt | 3 - data/azure-ocr/txt/ACA 2007-2008.pdf.txt | 3 - data/azure-ocr/txt/ACA 2009-2010.pdf.txt | 3 - data/azure-ocr/txt/ACA 2011-2012.pdf.txt | 3 - data/azure-ocr/txt/ACA 2013-2014.pdf.txt | 3 - .../ACA 2015-2016 (50th Anniversary).pdf.txt | 3 - data/azure-ocr/txt/ACA 2017-2018.pdf.txt | 3 - data/azure-ocr/txt/ACA 2019-2020.pdf.txt | 3 - data/azure-ocr/txt/ACA 2021-2022.pdf.txt | 3 - data/openai-formatter/.gitignore | 4 - data/openai-formatter/fix-titles.py | 30 - data/openai-formatter/process.py | 592 ------------------ data/openai-formatter/prompt.txt | 3 - data/pdf/ACA 1967-1970.pdf | Bin 132 -> 0 bytes data/pdf/ACA 1971.pdf | Bin 132 -> 0 bytes data/pdf/ACA 1972.pdf | Bin 132 -> 0 bytes data/pdf/ACA 1973.pdf | Bin 132 -> 0 bytes data/pdf/ACA 1974.pdf | Bin 132 -> 0 bytes data/pdf/ACA 1975.pdf | Bin 132 -> 0 bytes data/pdf/ACA 1976-1979.pdf | Bin 133 -> 0 bytes data/pdf/ACA 1980-1981.pdf | Bin 132 -> 0 bytes data/pdf/ACA 1982.pdf | Bin 133 -> 0 bytes data/pdf/ACA 1983-1984.pdf | Bin 132 -> 0 bytes data/pdf/ACA 1983.pdf | Bin 132 -> 0 bytes data/pdf/ACA 1984-1985.pdf | Bin 132 -> 0 bytes data/pdf/ACA 1986.pdf | Bin 133 -> 0 bytes data/pdf/ACA 1987.pdf | Bin 133 -> 0 bytes data/pdf/ACA 1988.pdf | Bin 133 -> 0 bytes data/pdf/ACA 1989.pdf | Bin 132 -> 0 bytes data/pdf/ACA 1990.pdf | Bin 132 -> 0 bytes data/pdf/ACA 1991.pdf | Bin 132 -> 0 bytes data/pdf/ACA 1992.pdf | Bin 132 -> 0 bytes data/pdf/ACA 1993.pdf | Bin 133 -> 0 bytes data/pdf/ACA 1994-1995.pdf | Bin 132 -> 0 bytes data/pdf/ACA 1996-1998.pdf | Bin 132 -> 0 bytes data/pdf/ACA 1999-2001.pdf | Bin 132 -> 0 bytes data/pdf/ACA 2002-2003.pdf | Bin 131 -> 0 bytes data/pdf/ACA 2004-2005.pdf | Bin 132 -> 0 bytes data/pdf/ACA 2006.pdf | Bin 132 -> 0 bytes data/pdf/ACA 2007-2008.pdf | Bin 133 -> 0 bytes data/pdf/ACA 2009-2010.pdf | Bin 132 -> 0 bytes data/pdf/ACA 2011-2012.pdf | Bin 132 -> 0 bytes data/pdf/ACA 2013-2014.pdf | Bin 132 -> 0 bytes data/pdf/ACA 2015-2016 (50th Anniversary).pdf | Bin 133 -> 0 bytes data/pdf/ACA 2017-2018.pdf | Bin 132 -> 0 bytes data/pdf/ACA 2019-2020.pdf | Bin 132 -> 0 bytes data/pdf/ACA 2021-2022.pdf | Bin 133 -> 0 bytes data/pdf/ACA Before 1900.pdf | Bin 132 -> 0 bytes data/processed/csv/.gitattributes | 8 - data/processed/csv/1971.csv | 3 - data/processed/csv/1972.csv | 3 - data/processed/csv/1973.csv | 3 - data/processed/csv/1974.csv | 3 - data/processed/csv/1975.csv | 3 - data/processed/csv/1976-1979.csv | 3 - data/processed/csv/1980-1981.csv | 3 - data/processed/csv/1981.xlsx | 3 - data/processed/csv/template.csv | 3 - data/processed/json/ACA 1982.json | 3 - data/processed/json/ACA 1983-1984.json | 3 - data/processed/json/ACA 1983.json | 3 - data/processed/json/ACA 1984-1985.json | 3 - data/processed/json/ACA 1986.json | 3 - data/processed/json/ACA 1987.json | 3 - data/processed/json/ACA 1988.json | 3 - data/processed/json/ACA 1989.json | 3 - data/processed/json/ACA 1990.json | 3 - data/processed/json/ACA 1991.json | 3 - data/processed/json/ACA 1992.json | 3 - data/processed/json/ACA 1993.json | 3 - data/processed/json/ACA 1994-1995.json | 3 - data/processed/json/ACA 1996-1998.json | 3 - data/processed/json/ACA 2019-2020.json | 3 - data/processed/json/ACA 2021-2022.json | 3 - data/processed/txt-split/ACA 1982.txt | 3 - data/processed/txt-split/ACA 1983-1984.txt | 3 - data/processed/txt-split/ACA 1983.txt | 3 - data/processed/txt-split/ACA 1984-1985.txt | 3 - data/processed/txt-split/ACA 1986.txt | 3 - data/processed/txt-split/ACA 1987.txt | 3 - data/processed/txt-split/ACA 1988.txt | 3 - data/processed/txt-split/ACA 1989.txt | 3 - data/processed/txt-split/ACA 1990.txt | 3 - data/processed/txt-split/ACA 1991.txt | 3 - data/processed/txt-split/ACA 1992.txt | 3 - data/processed/txt-split/ACA 1993.txt | 3 - data/processed/txt-split/ACA 1994-1995.txt | 3 - data/processed/txt-split/ACA 1996-1998.txt | 3 - data/processed/txt-split/ACA 2019-2020.txt | 3 - data/processed/txt-split/ACA 2021-2022.txt | 3 - data/processed/txt-split/README.md | 8 - data/processed/txt/ACA 1967-1970.txt | 3 - data/processed/txt/ACA 1971.txt | 3 - data/processed/txt/ACA 1972.txt | 3 - data/processed/txt/ACA 1973.txt | 3 - data/processed/txt/ACA 1974.txt | 3 - data/processed/txt/ACA 1975.txt | 3 - data/processed/txt/ACA 1976-1979.txt | 3 - data/processed/txt/ACA 1980-1981.txt | 3 - data/processed/txt/ACA 1982.txt | 3 - data/processed/txt/ACA 1983-1984.txt | 3 - data/processed/txt/ACA 1983.txt | 3 - data/processed/txt/ACA 1984-1985.txt | 3 - data/processed/txt/ACA 1986.txt | 3 - data/processed/txt/ACA 1987.txt | 3 - data/processed/txt/ACA 1988.txt | 3 - data/processed/txt/ACA 1989.txt | 3 - data/processed/txt/ACA 1990.txt | 3 - data/processed/txt/ACA 1991.txt | 3 - data/processed/txt/ACA 1992.txt | 3 - data/processed/txt/ACA 1993.txt | 3 - data/processed/txt/ACA 1994-1995.txt | 3 - data/processed/txt/ACA 1996-1998.txt | 3 - data/processed/txt/ACA 1999-2001.txt | 3 - data/processed/txt/ACA 2002-2003.txt | 3 - data/processed/txt/ACA 2004-2005.txt | 3 - data/processed/txt/ACA 2006.txt | 3 - data/processed/txt/ACA 2007-2008.txt | 3 - data/processed/txt/ACA 2009-2010.txt | 3 - data/processed/txt/ACA 2011-2012.txt | 3 - data/processed/txt/ACA 2013-2014.txt | 3 - data/processed/txt/ACA 2017-2018.txt | 3 - data/processed/txt/ACA 2019-2020.txt | 3 - data/processed/txt/ACA 50th Anniversary.txt | 3 - data/processed/txt/ACA Before 1900.txt | 3 - data/textract-layout/csv/ACA 1991.csv | 3 - data/textract-layout/csv/ACA 1992.csv | 3 - data/textract-layout/csv/ACA 1993.csv | 3 - data/textract-layout/csv/ACA 1994-1995.csv | 3 - data/textract-layout/csv/ACA 1996-1998.csv | 3 - data/textract-layout/csv/ACA 1999-2001.csv | 3 - data/textract-layout/csv/ACA 2002-2003.csv | 3 - data/textract-layout/csv/ACA 2004-2005.csv | 3 - data/textract-layout/csv/ACA 2006.csv | 3 - data/textract-layout/csv/ACA 2007-2008.csv | 3 - data/textract-layout/csv/ACA 2009-2010.csv | 3 - data/textract-layout/csv/ACA 2011-2012.csv | 3 - data/textract-layout/csv/ACA 2013-2014.csv | 3 - .../csv/ACA 2015-2016 (50th Anniversary).csv | 3 - data/textract-layout/csv/ACA 2017-2018.csv | 3 - data/textract-layout/csv/ACA 2019-2020.csv | 3 - data/textract-layout/csv/ACA 2021-2022.csv | 3 - data/textract-layout/json/ACA 1991.json | 3 - data/textract-layout/json/ACA 1992.json | 3 - data/textract-layout/json/ACA 1993.json | 3 - data/textract-layout/json/ACA 1994-1995.json | 3 - data/textract-layout/json/ACA 1996-1998.json | 3 - data/textract-layout/json/ACA 1999-2001.json | 3 - data/textract-layout/json/ACA 2002-2003.json | 3 - data/textract-layout/json/ACA 2004-2005.json | 3 - data/textract-layout/json/ACA 2006.json | 3 - data/textract-layout/json/ACA 2007-2008.json | 3 - data/textract-layout/json/ACA 2009-2010.json | 3 - data/textract-layout/json/ACA 2011-2012.json | 3 - data/textract-layout/json/ACA 2013-2014.json | 3 - .../ACA 2015-2016 (50th Anniversary).json | 3 - data/textract-layout/json/ACA 2017-2018.json | 3 - data/textract-layout/json/ACA 2019-2020.json | 3 - data/textract-layout/json/ACA 2021-2022.json | 3 - data/textract-layout/txt/ACA 1991.txt | 3 - data/textract-layout/txt/ACA 1992.txt | 3 - data/textract-layout/txt/ACA 1993.txt | 3 - data/textract-layout/txt/ACA 1994-1995.txt | 3 - data/textract-layout/txt/ACA 1996-1998.txt | 3 - data/textract-layout/txt/ACA 1999-2001.txt | 3 - data/textract-layout/txt/ACA 2002-2003.txt | 3 - data/textract-layout/txt/ACA 2004-2005.txt | 3 - data/textract-layout/txt/ACA 2006.txt | 3 - data/textract-layout/txt/ACA 2007-2008.txt | 3 - data/textract-layout/txt/ACA 2009-2010.txt | 3 - data/textract-layout/txt/ACA 2011-2012.txt | 3 - data/textract-layout/txt/ACA 2013-2014.txt | 3 - .../txt/ACA 2015-2016 (50th Anniversary).txt | 3 - data/textract-layout/txt/ACA 2017-2018.txt | 3 - data/textract-layout/txt/ACA 2019-2020.txt | 3 - data/textract-layout/txt/ACA 2021-2022.txt | 3 - data/textract-layout/txt/make_txt.py | 50 -- data/textract-ocr/json/ACA 1967-1970.json | 3 - data/textract-ocr/json/ACA 1971.json | 3 - data/textract-ocr/json/ACA 1972.json | 3 - data/textract-ocr/json/ACA 1973.json | 3 - data/textract-ocr/json/ACA 1974.json | 3 - data/textract-ocr/json/ACA 1975.json | 3 - data/textract-ocr/json/ACA 1976-1979.json | 3 - data/textract-ocr/json/ACA 1980-1981.json | 3 - data/textract-ocr/json/ACA 1982.json | 3 - data/textract-ocr/json/ACA 1983-1984.json | 3 - data/textract-ocr/json/ACA 1983.json | 3 - data/textract-ocr/json/ACA 1984-1985.json | 3 - data/textract-ocr/json/ACA 1986.json | 3 - data/textract-ocr/json/ACA 1987.json | 3 - data/textract-ocr/json/ACA 1988.json | 3 - data/textract-ocr/json/ACA 1989.json | 3 - data/textract-ocr/json/ACA 1990.json | 3 - data/textract-ocr/json/ACA 1991.json | 3 - data/textract-ocr/json/ACA 1992.json | 3 - data/textract-ocr/json/ACA 1993.json | 3 - data/textract-ocr/json/ACA 1994-1995.json | 3 - data/textract-ocr/json/ACA 1996-1998.json | 3 - data/textract-ocr/json/ACA 1999-2001.json | 3 - data/textract-ocr/json/ACA 2002-2003.json | 3 - data/textract-ocr/json/ACA 2004-2005.json | 3 - data/textract-ocr/json/ACA 2006.json | 3 - data/textract-ocr/json/ACA 2007-2008.json | 3 - data/textract-ocr/json/ACA 2009-2010.json | 3 - data/textract-ocr/json/ACA 2011-2012.json | 3 - data/textract-ocr/json/ACA 2013-2014.json | 3 - data/textract-ocr/json/ACA 2017-2018.json | 3 - data/textract-ocr/json/ACA 2019-2020.json | 3 - .../json/ACA 50th Anniversary.json | 3 - data/textract-ocr/json/ACA Before 1900.json | 3 - data/textract-ocr/sort_into_columns.py | 215 ------- data/textract-ocr/txt/ACA 1967-1970.txt | 3 - data/textract-ocr/txt/ACA 1971.txt | 3 - data/textract-ocr/txt/ACA 1972.txt | 3 - data/textract-ocr/txt/ACA 1973.txt | 3 - data/textract-ocr/txt/ACA 1974.txt | 3 - data/textract-ocr/txt/ACA 1975.txt | 3 - data/textract-ocr/txt/ACA 1976-1979.txt | 3 - data/textract-ocr/txt/ACA 1980-1981.txt | 3 - data/textract-ocr/txt/ACA 1982.txt | 3 - data/textract-ocr/txt/ACA 1983-1984.txt | 3 - data/textract-ocr/txt/ACA 1983.txt | 3 - data/textract-ocr/txt/ACA 1984-1985.txt | 3 - data/textract-ocr/txt/ACA 1986.txt | 3 - data/textract-ocr/txt/ACA 1987.txt | 3 - data/textract-ocr/txt/ACA 1988.txt | 3 - data/textract-ocr/txt/ACA 1989.txt | 3 - data/textract-ocr/txt/ACA 1990.txt | 3 - data/textract-ocr/txt/ACA 1991.txt | 3 - data/textract-ocr/txt/ACA 1992.txt | 3 - data/textract-ocr/txt/ACA 1993.txt | 3 - data/textract-ocr/txt/ACA 1994-1995.txt | 3 - data/textract-ocr/txt/ACA 1996-1998.txt | 3 - data/textract-ocr/txt/ACA 1999-2001.txt | 3 - data/textract-ocr/txt/ACA 2002-2003.txt | 3 - data/textract-ocr/txt/ACA 2004-2005.txt | 3 - data/textract-ocr/txt/ACA 2006.txt | 3 - data/textract-ocr/txt/ACA 2007-2008.txt | 3 - data/textract-ocr/txt/ACA 2009-2010.txt | 3 - data/textract-ocr/txt/ACA 2011-2012.txt | 3 - data/textract-ocr/txt/ACA 2013-2014.txt | 3 - data/textract-ocr/txt/ACA 2017-2018.txt | 3 - data/textract-ocr/txt/ACA 2019-2020.txt | 3 - .../textract-ocr/txt/ACA 50th Anniversary.txt | 3 - data/textract-ocr/txt/ACA Before 1900.txt | 3 - 272 files changed, 9 insertions(+), 1611 deletions(-) delete mode 100644 data/azure-ocr/json/ACA 1991.pdf.json delete mode 100644 data/azure-ocr/json/ACA 1992.pdf.json delete mode 100644 data/azure-ocr/json/ACA 1993.pdf.json delete mode 100644 data/azure-ocr/json/ACA 1994-1995.pdf.json delete mode 100644 data/azure-ocr/json/ACA 1996-1998.pdf.json delete mode 100644 data/azure-ocr/json/ACA 1999-2001.pdf.json delete mode 100644 data/azure-ocr/json/ACA 2002-2003.pdf.json delete mode 100644 data/azure-ocr/json/ACA 2004-2005.pdf.json delete mode 100644 data/azure-ocr/json/ACA 2006.pdf.json delete mode 100644 data/azure-ocr/json/ACA 2007-2008.pdf.json delete mode 100644 data/azure-ocr/json/ACA 2009-2010.pdf.json delete mode 100644 data/azure-ocr/json/ACA 2011-2012.pdf.json delete mode 100644 data/azure-ocr/json/ACA 2013-2014.pdf.json delete mode 100644 data/azure-ocr/json/ACA 2015-2016 (50th Anniversary).pdf.json delete mode 100644 data/azure-ocr/json/ACA 2017-2018.pdf.json delete mode 100644 data/azure-ocr/json/ACA 2019-2020.pdf.json delete mode 100644 data/azure-ocr/json/ACA 2021-2022.pdf.json delete mode 100644 data/azure-ocr/txt/ACA 1991.pdf.txt delete mode 100644 data/azure-ocr/txt/ACA 1992.pdf.txt delete mode 100644 data/azure-ocr/txt/ACA 1993.pdf.txt delete mode 100644 data/azure-ocr/txt/ACA 1994-1995.pdf.txt delete mode 100644 data/azure-ocr/txt/ACA 1996-1998.pdf.txt delete mode 100644 data/azure-ocr/txt/ACA 1999-2001.pdf.txt delete mode 100644 data/azure-ocr/txt/ACA 2002-2003.pdf.txt delete mode 100644 data/azure-ocr/txt/ACA 2004-2005.pdf.txt delete mode 100644 data/azure-ocr/txt/ACA 2006.pdf.txt delete mode 100644 data/azure-ocr/txt/ACA 2007-2008.pdf.txt delete mode 100644 data/azure-ocr/txt/ACA 2009-2010.pdf.txt delete mode 100644 data/azure-ocr/txt/ACA 2011-2012.pdf.txt delete mode 100644 data/azure-ocr/txt/ACA 2013-2014.pdf.txt delete mode 100644 data/azure-ocr/txt/ACA 2015-2016 (50th Anniversary).pdf.txt delete mode 100644 data/azure-ocr/txt/ACA 2017-2018.pdf.txt delete mode 100644 data/azure-ocr/txt/ACA 2019-2020.pdf.txt delete mode 100644 data/azure-ocr/txt/ACA 2021-2022.pdf.txt delete mode 100644 data/openai-formatter/.gitignore delete mode 100644 data/openai-formatter/fix-titles.py delete mode 100755 data/openai-formatter/process.py delete mode 100644 data/openai-formatter/prompt.txt delete mode 100644 data/pdf/ACA 1967-1970.pdf delete mode 100644 data/pdf/ACA 1971.pdf delete mode 100644 data/pdf/ACA 1972.pdf delete mode 100644 data/pdf/ACA 1973.pdf delete mode 100644 data/pdf/ACA 1974.pdf delete mode 100644 data/pdf/ACA 1975.pdf delete mode 100644 data/pdf/ACA 1976-1979.pdf delete mode 100644 data/pdf/ACA 1980-1981.pdf delete mode 100644 data/pdf/ACA 1982.pdf delete mode 100644 data/pdf/ACA 1983-1984.pdf delete mode 100644 data/pdf/ACA 1983.pdf delete mode 100644 data/pdf/ACA 1984-1985.pdf delete mode 100644 data/pdf/ACA 1986.pdf delete mode 100644 data/pdf/ACA 1987.pdf delete mode 100644 data/pdf/ACA 1988.pdf delete mode 100644 data/pdf/ACA 1989.pdf delete mode 100644 data/pdf/ACA 1990.pdf delete mode 100644 data/pdf/ACA 1991.pdf delete mode 100644 data/pdf/ACA 1992.pdf delete mode 100644 data/pdf/ACA 1993.pdf delete mode 100644 data/pdf/ACA 1994-1995.pdf delete mode 100644 data/pdf/ACA 1996-1998.pdf delete mode 100644 data/pdf/ACA 1999-2001.pdf delete mode 100644 data/pdf/ACA 2002-2003.pdf delete mode 100644 data/pdf/ACA 2004-2005.pdf delete mode 100644 data/pdf/ACA 2006.pdf delete mode 100644 data/pdf/ACA 2007-2008.pdf delete mode 100644 data/pdf/ACA 2009-2010.pdf delete mode 100644 data/pdf/ACA 2011-2012.pdf delete mode 100644 data/pdf/ACA 2013-2014.pdf delete mode 100644 data/pdf/ACA 2015-2016 (50th Anniversary).pdf delete mode 100644 data/pdf/ACA 2017-2018.pdf delete mode 100644 data/pdf/ACA 2019-2020.pdf delete mode 100644 data/pdf/ACA 2021-2022.pdf delete mode 100644 data/pdf/ACA Before 1900.pdf delete mode 100644 data/processed/csv/.gitattributes delete mode 100644 data/processed/csv/1971.csv delete mode 100644 data/processed/csv/1972.csv delete mode 100644 data/processed/csv/1973.csv delete mode 100644 data/processed/csv/1974.csv delete mode 100644 data/processed/csv/1975.csv delete mode 100644 data/processed/csv/1976-1979.csv delete mode 100644 data/processed/csv/1980-1981.csv delete mode 100644 data/processed/csv/1981.xlsx delete mode 100644 data/processed/csv/template.csv delete mode 100644 data/processed/json/ACA 1982.json delete mode 100644 data/processed/json/ACA 1983-1984.json delete mode 100644 data/processed/json/ACA 1983.json delete mode 100644 data/processed/json/ACA 1984-1985.json delete mode 100644 data/processed/json/ACA 1986.json delete mode 100644 data/processed/json/ACA 1987.json delete mode 100644 data/processed/json/ACA 1988.json delete mode 100644 data/processed/json/ACA 1989.json delete mode 100644 data/processed/json/ACA 1990.json delete mode 100644 data/processed/json/ACA 1991.json delete mode 100644 data/processed/json/ACA 1992.json delete mode 100644 data/processed/json/ACA 1993.json delete mode 100644 data/processed/json/ACA 1994-1995.json delete mode 100644 data/processed/json/ACA 1996-1998.json delete mode 100644 data/processed/json/ACA 2019-2020.json delete mode 100644 data/processed/json/ACA 2021-2022.json delete mode 100644 data/processed/txt-split/ACA 1982.txt delete mode 100644 data/processed/txt-split/ACA 1983-1984.txt delete mode 100644 data/processed/txt-split/ACA 1983.txt delete mode 100644 data/processed/txt-split/ACA 1984-1985.txt delete mode 100644 data/processed/txt-split/ACA 1986.txt delete mode 100644 data/processed/txt-split/ACA 1987.txt delete mode 100644 data/processed/txt-split/ACA 1988.txt delete mode 100644 data/processed/txt-split/ACA 1989.txt delete mode 100644 data/processed/txt-split/ACA 1990.txt delete mode 100644 data/processed/txt-split/ACA 1991.txt delete mode 100644 data/processed/txt-split/ACA 1992.txt delete mode 100644 data/processed/txt-split/ACA 1993.txt delete mode 100644 data/processed/txt-split/ACA 1994-1995.txt delete mode 100644 data/processed/txt-split/ACA 1996-1998.txt delete mode 100644 data/processed/txt-split/ACA 2019-2020.txt delete mode 100644 data/processed/txt-split/ACA 2021-2022.txt delete mode 100644 data/processed/txt-split/README.md delete mode 100644 data/processed/txt/ACA 1967-1970.txt delete mode 100644 data/processed/txt/ACA 1971.txt delete mode 100644 data/processed/txt/ACA 1972.txt delete mode 100644 data/processed/txt/ACA 1973.txt delete mode 100644 data/processed/txt/ACA 1974.txt delete mode 100644 data/processed/txt/ACA 1975.txt delete mode 100644 data/processed/txt/ACA 1976-1979.txt delete mode 100644 data/processed/txt/ACA 1980-1981.txt delete mode 100644 data/processed/txt/ACA 1982.txt delete mode 100644 data/processed/txt/ACA 1983-1984.txt delete mode 100644 data/processed/txt/ACA 1983.txt delete mode 100644 data/processed/txt/ACA 1984-1985.txt delete mode 100644 data/processed/txt/ACA 1986.txt delete mode 100644 data/processed/txt/ACA 1987.txt delete mode 100644 data/processed/txt/ACA 1988.txt delete mode 100644 data/processed/txt/ACA 1989.txt delete mode 100644 data/processed/txt/ACA 1990.txt delete mode 100644 data/processed/txt/ACA 1991.txt delete mode 100644 data/processed/txt/ACA 1992.txt delete mode 100644 data/processed/txt/ACA 1993.txt delete mode 100644 data/processed/txt/ACA 1994-1995.txt delete mode 100644 data/processed/txt/ACA 1996-1998.txt delete mode 100644 data/processed/txt/ACA 1999-2001.txt delete mode 100644 data/processed/txt/ACA 2002-2003.txt delete mode 100644 data/processed/txt/ACA 2004-2005.txt delete mode 100644 data/processed/txt/ACA 2006.txt delete mode 100644 data/processed/txt/ACA 2007-2008.txt delete mode 100644 data/processed/txt/ACA 2009-2010.txt delete mode 100644 data/processed/txt/ACA 2011-2012.txt delete mode 100644 data/processed/txt/ACA 2013-2014.txt delete mode 100644 data/processed/txt/ACA 2017-2018.txt delete mode 100644 data/processed/txt/ACA 2019-2020.txt delete mode 100644 data/processed/txt/ACA 50th Anniversary.txt delete mode 100644 data/processed/txt/ACA Before 1900.txt delete mode 100644 data/textract-layout/csv/ACA 1991.csv delete mode 100644 data/textract-layout/csv/ACA 1992.csv delete mode 100644 data/textract-layout/csv/ACA 1993.csv delete mode 100644 data/textract-layout/csv/ACA 1994-1995.csv delete mode 100644 data/textract-layout/csv/ACA 1996-1998.csv delete mode 100644 data/textract-layout/csv/ACA 1999-2001.csv delete mode 100644 data/textract-layout/csv/ACA 2002-2003.csv delete mode 100644 data/textract-layout/csv/ACA 2004-2005.csv delete mode 100644 data/textract-layout/csv/ACA 2006.csv delete mode 100644 data/textract-layout/csv/ACA 2007-2008.csv delete mode 100644 data/textract-layout/csv/ACA 2009-2010.csv delete mode 100644 data/textract-layout/csv/ACA 2011-2012.csv delete mode 100644 data/textract-layout/csv/ACA 2013-2014.csv delete mode 100644 data/textract-layout/csv/ACA 2015-2016 (50th Anniversary).csv delete mode 100644 data/textract-layout/csv/ACA 2017-2018.csv delete mode 100644 data/textract-layout/csv/ACA 2019-2020.csv delete mode 100644 data/textract-layout/csv/ACA 2021-2022.csv delete mode 100644 data/textract-layout/json/ACA 1991.json delete mode 100644 data/textract-layout/json/ACA 1992.json delete mode 100644 data/textract-layout/json/ACA 1993.json delete mode 100644 data/textract-layout/json/ACA 1994-1995.json delete mode 100644 data/textract-layout/json/ACA 1996-1998.json delete mode 100644 data/textract-layout/json/ACA 1999-2001.json delete mode 100644 data/textract-layout/json/ACA 2002-2003.json delete mode 100644 data/textract-layout/json/ACA 2004-2005.json delete mode 100644 data/textract-layout/json/ACA 2006.json delete mode 100644 data/textract-layout/json/ACA 2007-2008.json delete mode 100644 data/textract-layout/json/ACA 2009-2010.json delete mode 100644 data/textract-layout/json/ACA 2011-2012.json delete mode 100644 data/textract-layout/json/ACA 2013-2014.json delete mode 100644 data/textract-layout/json/ACA 2015-2016 (50th Anniversary).json delete mode 100644 data/textract-layout/json/ACA 2017-2018.json delete mode 100644 data/textract-layout/json/ACA 2019-2020.json delete mode 100644 data/textract-layout/json/ACA 2021-2022.json delete mode 100644 data/textract-layout/txt/ACA 1991.txt delete mode 100644 data/textract-layout/txt/ACA 1992.txt delete mode 100644 data/textract-layout/txt/ACA 1993.txt delete mode 100644 data/textract-layout/txt/ACA 1994-1995.txt delete mode 100644 data/textract-layout/txt/ACA 1996-1998.txt delete mode 100644 data/textract-layout/txt/ACA 1999-2001.txt delete mode 100644 data/textract-layout/txt/ACA 2002-2003.txt delete mode 100644 data/textract-layout/txt/ACA 2004-2005.txt delete mode 100644 data/textract-layout/txt/ACA 2006.txt delete mode 100644 data/textract-layout/txt/ACA 2007-2008.txt delete mode 100644 data/textract-layout/txt/ACA 2009-2010.txt delete mode 100644 data/textract-layout/txt/ACA 2011-2012.txt delete mode 100644 data/textract-layout/txt/ACA 2013-2014.txt delete mode 100644 data/textract-layout/txt/ACA 2015-2016 (50th Anniversary).txt delete mode 100644 data/textract-layout/txt/ACA 2017-2018.txt delete mode 100644 data/textract-layout/txt/ACA 2019-2020.txt delete mode 100644 data/textract-layout/txt/ACA 2021-2022.txt delete mode 100644 data/textract-layout/txt/make_txt.py delete mode 100644 data/textract-ocr/json/ACA 1967-1970.json delete mode 100644 data/textract-ocr/json/ACA 1971.json delete mode 100644 data/textract-ocr/json/ACA 1972.json delete mode 100644 data/textract-ocr/json/ACA 1973.json delete mode 100644 data/textract-ocr/json/ACA 1974.json delete mode 100644 data/textract-ocr/json/ACA 1975.json delete mode 100644 data/textract-ocr/json/ACA 1976-1979.json delete mode 100644 data/textract-ocr/json/ACA 1980-1981.json delete mode 100644 data/textract-ocr/json/ACA 1982.json delete mode 100644 data/textract-ocr/json/ACA 1983-1984.json delete mode 100644 data/textract-ocr/json/ACA 1983.json delete mode 100644 data/textract-ocr/json/ACA 1984-1985.json delete mode 100644 data/textract-ocr/json/ACA 1986.json delete mode 100644 data/textract-ocr/json/ACA 1987.json delete mode 100644 data/textract-ocr/json/ACA 1988.json delete mode 100644 data/textract-ocr/json/ACA 1989.json delete mode 100644 data/textract-ocr/json/ACA 1990.json delete mode 100644 data/textract-ocr/json/ACA 1991.json delete mode 100644 data/textract-ocr/json/ACA 1992.json delete mode 100644 data/textract-ocr/json/ACA 1993.json delete mode 100644 data/textract-ocr/json/ACA 1994-1995.json delete mode 100644 data/textract-ocr/json/ACA 1996-1998.json delete mode 100644 data/textract-ocr/json/ACA 1999-2001.json delete mode 100644 data/textract-ocr/json/ACA 2002-2003.json delete mode 100644 data/textract-ocr/json/ACA 2004-2005.json delete mode 100644 data/textract-ocr/json/ACA 2006.json delete mode 100644 data/textract-ocr/json/ACA 2007-2008.json delete mode 100644 data/textract-ocr/json/ACA 2009-2010.json delete mode 100644 data/textract-ocr/json/ACA 2011-2012.json delete mode 100644 data/textract-ocr/json/ACA 2013-2014.json delete mode 100644 data/textract-ocr/json/ACA 2017-2018.json delete mode 100644 data/textract-ocr/json/ACA 2019-2020.json delete mode 100644 data/textract-ocr/json/ACA 50th Anniversary.json delete mode 100644 data/textract-ocr/json/ACA Before 1900.json delete mode 100644 data/textract-ocr/sort_into_columns.py delete mode 100644 data/textract-ocr/txt/ACA 1967-1970.txt delete mode 100644 data/textract-ocr/txt/ACA 1971.txt delete mode 100644 data/textract-ocr/txt/ACA 1972.txt delete mode 100644 data/textract-ocr/txt/ACA 1973.txt delete mode 100644 data/textract-ocr/txt/ACA 1974.txt delete mode 100644 data/textract-ocr/txt/ACA 1975.txt delete mode 100644 data/textract-ocr/txt/ACA 1976-1979.txt delete mode 100644 data/textract-ocr/txt/ACA 1980-1981.txt delete mode 100644 data/textract-ocr/txt/ACA 1982.txt delete mode 100644 data/textract-ocr/txt/ACA 1983-1984.txt delete mode 100644 data/textract-ocr/txt/ACA 1983.txt delete mode 100644 data/textract-ocr/txt/ACA 1984-1985.txt delete mode 100644 data/textract-ocr/txt/ACA 1986.txt delete mode 100644 data/textract-ocr/txt/ACA 1987.txt delete mode 100644 data/textract-ocr/txt/ACA 1988.txt delete mode 100644 data/textract-ocr/txt/ACA 1989.txt delete mode 100644 data/textract-ocr/txt/ACA 1990.txt delete mode 100644 data/textract-ocr/txt/ACA 1991.txt delete mode 100644 data/textract-ocr/txt/ACA 1992.txt delete mode 100644 data/textract-ocr/txt/ACA 1993.txt delete mode 100644 data/textract-ocr/txt/ACA 1994-1995.txt delete mode 100644 data/textract-ocr/txt/ACA 1996-1998.txt delete mode 100644 data/textract-ocr/txt/ACA 1999-2001.txt delete mode 100644 data/textract-ocr/txt/ACA 2002-2003.txt delete mode 100644 data/textract-ocr/txt/ACA 2004-2005.txt delete mode 100644 data/textract-ocr/txt/ACA 2006.txt delete mode 100644 data/textract-ocr/txt/ACA 2007-2008.txt delete mode 100644 data/textract-ocr/txt/ACA 2009-2010.txt delete mode 100644 data/textract-ocr/txt/ACA 2011-2012.txt delete mode 100644 data/textract-ocr/txt/ACA 2013-2014.txt delete mode 100644 data/textract-ocr/txt/ACA 2017-2018.txt delete mode 100644 data/textract-ocr/txt/ACA 2019-2020.txt delete mode 100644 data/textract-ocr/txt/ACA 50th Anniversary.txt delete mode 100644 data/textract-ocr/txt/ACA Before 1900.txt diff --git a/README.md b/README.md index fd6bb60..def8355 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ # Caving Incident Report DB + This is a project which aims to digitise the archive of [National Speleological Society](https://caves.org/) *American Caving Accidents* caving incident reports, which cover most caving incidents that have happened in the @@ -13,6 +14,7 @@ to take a look. You may also wish to view the [about page](https://aca.caver.dev the website for more information about the project. ## Django application + This fairly straightforward application lives within the `reportdb/` and `etc/` folders, and is run using docker-compose (or Dokku in production). The applications allows a basic CRUD interface for incident reports, and has management commands (`import_json` and `import_csv`) to enable the mass @@ -23,29 +25,19 @@ the work of the AI formatter before marking an incident report as 'approved' and by the public. ## Data and processing -The `data/` directory contains the archive of ACA Journals in a number of formats with varied levels of -processing. - -The original PDFs of the journals are contained in `data/pdf/`. These PDF files were run -through Amazon AWS Textract, and processed with a simple script, to generate the text files within -`data/processed/txt`. These text files were then further processed by hand to generate the ones contained -within the `data/processed/txt-split/` directory, where non-incident report text has been removed and each -incident report separated by three dashes (`---`) within the text file to allow easier machine separation -of incidents. -The files from `data/processed/txt-split/` were then processed using the OpenAI API by means of the script -contained within the `data/openai-formatter/` directory. This script produces JSON arrays of each incident, -with relevant metadata (such as the cave name, date, incident report, cavers involved) separated. The results -from this are contained in the `data/json/` directory. - -These JSON files are the final stage of processing before the data is added to the Django web application, -which is then used by volunteers to check the work of the AI formatter before making the incident available -for all to view online. +The original data, including full ACA report PDF files, and the same in all sorts of different manually +and machine processed forms, are available in a public S3 bucket called `caving-incident-reports` in the +`eu-west-2` region. You can download the data from the bucket using the AWS CLI, or by any number of +graphical S3 clients. If you require any assistance in accessing the data, please join our +[Discord server](https://discord.gg/bUCYsmghVs) and ask for help. # Contributing + Contributions are welcome - both in terms of code, and volunteering to help edit incidents on the production Django app. For more information, please join [our Discord server](https://discord.gg/bUCYsmghVs). # Licence + This project is licensed under the GNU GPL v3.0. For more information see the LICENCE file. diff --git a/data/azure-ocr/json/ACA 1991.pdf.json b/data/azure-ocr/json/ACA 1991.pdf.json deleted file mode 100644 index 00f185b..0000000 --- a/data/azure-ocr/json/ACA 1991.pdf.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f14f1aeb472fdcaf30e5f42030c91eaac6835a86f8facfad09cf275f2d476bbb -size 8814310 diff --git a/data/azure-ocr/json/ACA 1992.pdf.json b/data/azure-ocr/json/ACA 1992.pdf.json deleted file mode 100644 index d892046..0000000 --- a/data/azure-ocr/json/ACA 1992.pdf.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2b86a189cec1a03ac3994d3b134f88c970843f6d7581f938106dd1e3d1357c8a -size 7144300 diff --git a/data/azure-ocr/json/ACA 1993.pdf.json b/data/azure-ocr/json/ACA 1993.pdf.json deleted file mode 100644 index 9bb67dd..0000000 --- a/data/azure-ocr/json/ACA 1993.pdf.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d7e786e8da993bc48488905a62e0431972c7abc13e6b592ff717544b6fe38a5f -size 9305115 diff --git a/data/azure-ocr/json/ACA 1994-1995.pdf.json b/data/azure-ocr/json/ACA 1994-1995.pdf.json deleted file mode 100644 index c7b3f49..0000000 --- a/data/azure-ocr/json/ACA 1994-1995.pdf.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:79026253803bb46da817907deb96ecf439782cf7d0ae3cf28f7cea5e3b65a05e -size 13302219 diff --git a/data/azure-ocr/json/ACA 1996-1998.pdf.json b/data/azure-ocr/json/ACA 1996-1998.pdf.json deleted file mode 100644 index 1f1527e..0000000 --- a/data/azure-ocr/json/ACA 1996-1998.pdf.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:34a2243482e6d86b32629096005cc1f378c24ea33cbd084864e580eaf008a067 -size 15921792 diff --git a/data/azure-ocr/json/ACA 1999-2001.pdf.json b/data/azure-ocr/json/ACA 1999-2001.pdf.json deleted file mode 100644 index 8a9afc0..0000000 --- a/data/azure-ocr/json/ACA 1999-2001.pdf.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7af44f0309dc2ac92198cda5fef779c0d607cc380968b9cd99c3f830aceb1be2 -size 14303681 diff --git a/data/azure-ocr/json/ACA 2002-2003.pdf.json b/data/azure-ocr/json/ACA 2002-2003.pdf.json deleted file mode 100644 index 4c0910c..0000000 --- a/data/azure-ocr/json/ACA 2002-2003.pdf.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6e9cd9e660bdd969e50dc69fca9fc7ad969eb8d0e05e9b39bb9682f70a4ce874 -size 11837460 diff --git a/data/azure-ocr/json/ACA 2004-2005.pdf.json b/data/azure-ocr/json/ACA 2004-2005.pdf.json deleted file mode 100644 index 5ad79fb..0000000 --- a/data/azure-ocr/json/ACA 2004-2005.pdf.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c6b4306b86574ab5b0c5c34dd6a786ff05032b44620f3ecb0be329c8b00762a2 -size 11787340 diff --git a/data/azure-ocr/json/ACA 2006.pdf.json b/data/azure-ocr/json/ACA 2006.pdf.json deleted file mode 100644 index ae4aaf8..0000000 --- a/data/azure-ocr/json/ACA 2006.pdf.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4c1dd931aa48fc512b42d65b13e5a48e451a99d095f296e18e51b358a5dea0de -size 4039566 diff --git a/data/azure-ocr/json/ACA 2007-2008.pdf.json b/data/azure-ocr/json/ACA 2007-2008.pdf.json deleted file mode 100644 index 421d70e..0000000 --- a/data/azure-ocr/json/ACA 2007-2008.pdf.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e3a8986f2648d0c9c206475e11ca066ef7a4859b8c3d3b27b8f1e6e7992bea4c -size 7690885 diff --git a/data/azure-ocr/json/ACA 2009-2010.pdf.json b/data/azure-ocr/json/ACA 2009-2010.pdf.json deleted file mode 100644 index 862d81d..0000000 --- a/data/azure-ocr/json/ACA 2009-2010.pdf.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6e885429f4986d27b8a113c10eab66b54192d7e5c855ca1cea9dbe03aee57d07 -size 5978271 diff --git a/data/azure-ocr/json/ACA 2011-2012.pdf.json b/data/azure-ocr/json/ACA 2011-2012.pdf.json deleted file mode 100644 index 37d60b4..0000000 --- a/data/azure-ocr/json/ACA 2011-2012.pdf.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e497c4bae296e8b15ee60cd3221555c9797f60cd0c407b70121dd6d4ed50f5ff -size 8794170 diff --git a/data/azure-ocr/json/ACA 2013-2014.pdf.json b/data/azure-ocr/json/ACA 2013-2014.pdf.json deleted file mode 100644 index 36d5272..0000000 --- a/data/azure-ocr/json/ACA 2013-2014.pdf.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6910a2213c2c5ecd0f90c6b629d8663a4ee64f43cb5f365013823eed2ad9e58e -size 10227812 diff --git a/data/azure-ocr/json/ACA 2015-2016 (50th Anniversary).pdf.json b/data/azure-ocr/json/ACA 2015-2016 (50th Anniversary).pdf.json deleted file mode 100644 index 55fbe56..0000000 --- a/data/azure-ocr/json/ACA 2015-2016 (50th Anniversary).pdf.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:683e695e5404f67a9525a5031a7c2faed0725dbe852a9c33ff881fb59b45079e -size 10858085 diff --git a/data/azure-ocr/json/ACA 2017-2018.pdf.json b/data/azure-ocr/json/ACA 2017-2018.pdf.json deleted file mode 100644 index 5f57f62..0000000 --- a/data/azure-ocr/json/ACA 2017-2018.pdf.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:592d3ce0e56807fee723a2e74f423f43007d8ecbcc09437721602e22d5bf60fa -size 8189069 diff --git a/data/azure-ocr/json/ACA 2019-2020.pdf.json b/data/azure-ocr/json/ACA 2019-2020.pdf.json deleted file mode 100644 index c7f3269..0000000 --- a/data/azure-ocr/json/ACA 2019-2020.pdf.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:82766febec484340ff13f01b7898ff367485c49d6ac3e0ef1093698d9c1e6ab1 -size 6421735 diff --git a/data/azure-ocr/json/ACA 2021-2022.pdf.json b/data/azure-ocr/json/ACA 2021-2022.pdf.json deleted file mode 100644 index 57bb8c2..0000000 --- a/data/azure-ocr/json/ACA 2021-2022.pdf.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:faf93f42c3c3087ff5884dc1568f98399d3c694744f142e0a4784b12335fcb0f -size 5630788 diff --git a/data/azure-ocr/txt/ACA 1991.pdf.txt b/data/azure-ocr/txt/ACA 1991.pdf.txt deleted file mode 100644 index b5bc16e..0000000 --- a/data/azure-ocr/txt/ACA 1991.pdf.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3c374c3f4cac3c9c335c7acf1f0969afaf6899ccd14745edb5ad127705d42542 -size 142010 diff --git a/data/azure-ocr/txt/ACA 1992.pdf.txt b/data/azure-ocr/txt/ACA 1992.pdf.txt deleted file mode 100644 index ae87044..0000000 --- a/data/azure-ocr/txt/ACA 1992.pdf.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d85fa5978ae3cc19ec6b6339242378e95d4a3f8acdebf4a747f0db40a14b28f0 -size 115029 diff --git a/data/azure-ocr/txt/ACA 1993.pdf.txt b/data/azure-ocr/txt/ACA 1993.pdf.txt deleted file mode 100644 index 9bfc7d8..0000000 --- a/data/azure-ocr/txt/ACA 1993.pdf.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a09ec9c319fcf43bc7b7306e402bfdf05cbc0710517c20835379ce6f24a09532 -size 145278 diff --git a/data/azure-ocr/txt/ACA 1994-1995.pdf.txt b/data/azure-ocr/txt/ACA 1994-1995.pdf.txt deleted file mode 100644 index c212652..0000000 --- a/data/azure-ocr/txt/ACA 1994-1995.pdf.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cdde2e7ecdaa1cc4d58611f860014f29fb347effb3bc7014ce2d6ddc0cab21f1 -size 225309 diff --git a/data/azure-ocr/txt/ACA 1996-1998.pdf.txt b/data/azure-ocr/txt/ACA 1996-1998.pdf.txt deleted file mode 100644 index 607a2d8..0000000 --- a/data/azure-ocr/txt/ACA 1996-1998.pdf.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ae8f09a5b4c9dec9dc581e99dc4715bdd1663c523da8e143ef23dcadc74f6b69 -size 249703 diff --git a/data/azure-ocr/txt/ACA 1999-2001.pdf.txt b/data/azure-ocr/txt/ACA 1999-2001.pdf.txt deleted file mode 100644 index e71421f..0000000 --- a/data/azure-ocr/txt/ACA 1999-2001.pdf.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:69a4d651c32c923b15c93d2b27e0df31f41de5343ed1121167a9ec80ab19e16c -size 241599 diff --git a/data/azure-ocr/txt/ACA 2002-2003.pdf.txt b/data/azure-ocr/txt/ACA 2002-2003.pdf.txt deleted file mode 100644 index 9c82240..0000000 --- a/data/azure-ocr/txt/ACA 2002-2003.pdf.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:622a5fa4a7d78d96e03397f97c975a9976f16aa4fab4843ae7e6ecca8b9ba918 -size 201489 diff --git a/data/azure-ocr/txt/ACA 2004-2005.pdf.txt b/data/azure-ocr/txt/ACA 2004-2005.pdf.txt deleted file mode 100644 index 66fefe3..0000000 --- a/data/azure-ocr/txt/ACA 2004-2005.pdf.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7cf3a23eb02a7a888901f901024ffdb5319688c7392160f8c108d3adb504b39e -size 201246 diff --git a/data/azure-ocr/txt/ACA 2006.pdf.txt b/data/azure-ocr/txt/ACA 2006.pdf.txt deleted file mode 100644 index ae756fb..0000000 --- a/data/azure-ocr/txt/ACA 2006.pdf.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:caf016bbeb60b6011774db6a18c55feafe800ed4c2952ae08017ecc42629c35e -size 68371 diff --git a/data/azure-ocr/txt/ACA 2007-2008.pdf.txt b/data/azure-ocr/txt/ACA 2007-2008.pdf.txt deleted file mode 100644 index 767c5ab..0000000 --- a/data/azure-ocr/txt/ACA 2007-2008.pdf.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9437e1c3d9fd8bf343dc48b1da6fac2d629e290174cc5b3dfcb23501a613415e -size 130394 diff --git a/data/azure-ocr/txt/ACA 2009-2010.pdf.txt b/data/azure-ocr/txt/ACA 2009-2010.pdf.txt deleted file mode 100644 index efd6b2c..0000000 --- a/data/azure-ocr/txt/ACA 2009-2010.pdf.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:971762c1fd0ed618f25798b966263e2aad22db9a3254fd0353d7e4c1bda718e6 -size 102715 diff --git a/data/azure-ocr/txt/ACA 2011-2012.pdf.txt b/data/azure-ocr/txt/ACA 2011-2012.pdf.txt deleted file mode 100644 index aeacbec..0000000 --- a/data/azure-ocr/txt/ACA 2011-2012.pdf.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3d1bedebf0956a42558abb6771aed6f86e947fe9622538feefe795bfad2553a5 -size 152872 diff --git a/data/azure-ocr/txt/ACA 2013-2014.pdf.txt b/data/azure-ocr/txt/ACA 2013-2014.pdf.txt deleted file mode 100644 index dd21af6..0000000 --- a/data/azure-ocr/txt/ACA 2013-2014.pdf.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4e6c96b1b9364a20143dd644a06252e672a60bf4ec886630a29b0f66e1b0269a -size 177274 diff --git a/data/azure-ocr/txt/ACA 2015-2016 (50th Anniversary).pdf.txt b/data/azure-ocr/txt/ACA 2015-2016 (50th Anniversary).pdf.txt deleted file mode 100644 index c26c188..0000000 --- a/data/azure-ocr/txt/ACA 2015-2016 (50th Anniversary).pdf.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:89a7368002bd3e26bbbaa567d82966fcb8dfdfe3f14399130057f930ada189e3 -size 182352 diff --git a/data/azure-ocr/txt/ACA 2017-2018.pdf.txt b/data/azure-ocr/txt/ACA 2017-2018.pdf.txt deleted file mode 100644 index 8b3330b..0000000 --- a/data/azure-ocr/txt/ACA 2017-2018.pdf.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c1e0c41a1c8faf0c0f3879796df6ae80b3d5460a7411081b17ad9d3000978364 -size 139184 diff --git a/data/azure-ocr/txt/ACA 2019-2020.pdf.txt b/data/azure-ocr/txt/ACA 2019-2020.pdf.txt deleted file mode 100644 index 6d23b82..0000000 --- a/data/azure-ocr/txt/ACA 2019-2020.pdf.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:023d0d718c04f46080926732a3de5a825a8e4387909d7b41270b9dc3f156dc53 -size 109381 diff --git a/data/azure-ocr/txt/ACA 2021-2022.pdf.txt b/data/azure-ocr/txt/ACA 2021-2022.pdf.txt deleted file mode 100644 index b76a2db..0000000 --- a/data/azure-ocr/txt/ACA 2021-2022.pdf.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a40d8a7f6ecddbfdbcca82ba6a789e32590bb7f0f594d8720650609fe62e1c09 -size 97582 diff --git a/data/openai-formatter/.gitignore b/data/openai-formatter/.gitignore deleted file mode 100644 index 84548e7..0000000 --- a/data/openai-formatter/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -output/ -batches/ -settings.py -*.log diff --git a/data/openai-formatter/fix-titles.py b/data/openai-formatter/fix-titles.py deleted file mode 100644 index 4410851..0000000 --- a/data/openai-formatter/fix-titles.py +++ /dev/null @@ -1,30 +0,0 @@ -"""Script to fix uppercase cave names in JSON results.""" - -import json - - -def process_json_objects(file_path, output_file_path): - # Read the JSON data from the input file - with open(file_path, 'r') as file: - data = json.load(file) - - # Iterate over each JSON object in the array - for obj in data: - if 'cave' in obj: - cave_value = obj['cave'] - # Check if all alphabetic characters are uppercase - if all(c.isupper() or not c.isalpha() for c in cave_value): - # Convert letters to title case - obj['cave'] = cave_value.title() - - # Write the modified JSON data to the output file - with open(output_file_path, 'w') as file: - json.dump(data, file, indent=4) - - -# Provide the input JSON file path and the output file path -input_file_path = 'ACA 1990.json' -output_file_path = 'ACA 1990.json' - -# Call the function with the provided file paths -process_json_objects(input_file_path, output_file_path) diff --git a/data/openai-formatter/process.py b/data/openai-formatter/process.py deleted file mode 100755 index 54b1588..0000000 --- a/data/openai-formatter/process.py +++ /dev/null @@ -1,592 +0,0 @@ -#!/usr/bin/env python3 -"""Process text data using the OpenAI API. - -Operations: -- batch: Run the chat completions API in batch mode. -- sync: Run the chat completions API synchronously. -- check: Open a results file and check the contents conform to the expected format. - -Usage: - -Running operations: -- python process.py --operation batch --name "ACA 1987" --file "processed/txt-split/ACA 1987.txt" -- python process.py --operation sync --name "ACA 1987" --file "processed/txt-split/ACA 1987.txt" -- python process.py --operation check --name "ACA 1987" --file "output/aca_1987.json" - -Collecting batch results: -- python process.py --operation collectbatch - -""" - -import argparse -from typing import Any - -import orjson -import os -import re -import time -from datetime import datetime, timedelta -from timeit import default_timer - -import humanize -from openai.types.batch import Batch -from openai.types.chat import ChatCompletion - -from openai import OpenAI -from openai.types.file_object import FileObject - -OPERATIONS = ["batch", "run", "check", "collectbatch"] -LOG_FILE = "openai.log" -PROMPT_FILE = "prompt.txt" -CHAR_LIMIT = 32768 - -MODEL = "gpt-4-turbo" - -MODEL_COSTS = { - "gpt-4-turbo": (0.00003, 0.00006), -} - -ALLOWED_MODELS = ["gpt-4-turbo", "gpt-3.5-turbo", "gpt3.5", "gpt-4"] -CHAT_COMPLETIONS_ENDPOINT = "/v1/chat/completions" - - -def log(msg: str, print_msg=False) -> None: - timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - msg = f"[{timestamp}] {msg}" - if print_msg: - print(msg) - - with open(LOG_FILE, "a") as f: - f.write(msg + "\n") - - -def fmt_delta(delta: timedelta) -> str: - return humanize.precisedelta( - delta, - minimum_unit="seconds", - format="%0.0f", - ) - - -def generate_output_file_name(name: str, *, ext: str = "json", temp: bool = False, batches: bool = False,) -> str: - prefix: str = normalise_file_name(name) - timestamp = time.strftime("%Y%m%d-%H%M%S") - - if batches: - folder: str = f"batches/{prefix}" - else: - folder = f"output/{prefix}" - - if not os.path.isdir(folder): - os.mkdir(folder) - - if temp: - folder = f"{folder}/tmp" - - if not os.path.isdir(folder): - os.mkdir(folder) - - return f"{folder}/{prefix}-{timestamp}.{ext}" - - -def normalise_file_name(file_name: str) -> str: - """Normalise a string to be used as a file name.""" - result: str = ( - file_name - .strip() - .replace(" ", "-") - .replace(",", "-") - .replace("/", "-") - .replace("_", "-") - .replace(":", "-") - ) - - if not re.match(r"^[\w\-.]+$", result): - raise ValueError(f"Invalid file name: {result}") - - return result - - -def get_cost(model: str, input_tokens: int, output_tokens: int) -> float: - """Calculate the cost of a request based on the number of input and output tokens.""" - if model not in MODEL_COSTS: - return 0.0 - - input_cost: int = MODEL_COSTS[model][0] - output_cost: int = MODEL_COSTS[model][1] - - return round( - (input_tokens * input_cost) + (output_tokens * output_cost), 2 - ) - - -def check_args(args: argparse.Namespace) -> None: - """Check the command line arguments.""" - if args.operation not in OPERATIONS: - log(f"Error: Invalid operation: {args.operation}", print_msg=True) - exit(1) - - if args.operation == "collectbatch": - return - - error: bool = False - if not args.name: - log("Error: No name specified.", print_msg=True) - error = True - - if not args.file: - log("Error: No file specified.", print_msg=True) - error = True - - if args.limit < 0: - log(f"Error: Invalid limit: {args.limit}", print_msg=True) - error = True - - try: - normalise_file_name(args.name) - except ValueError as e: - log(f"Error: Invalid name: {e}", print_msg=True) - error = True - - try: - with open(LOG_FILE, "a"): - pass - except OSError: - log(f"Error: Unable to write to log file: {LOG_FILE}", print_msg=True) - error = True - - if error: - exit(1) - - -def main() -> None: - parser = argparse.ArgumentParser(description="Process text data using the OpenAI API.") - parser.add_argument( - "-o", - "--operation", - type=str, - choices=OPERATIONS, - help="Operation to perform.", - ) - parser.add_argument( - "-f", - "--file", - type=argparse.FileType("r"), - help="Path to the file containing the incidents to process.", - ) - parser.add_argument( - "-p", - "--prompt", - type=argparse.FileType("r"), - help="Path to the file containing the prompt for the LLM.", - default=PROMPT_FILE, - ) - parser.add_argument( - "-n", - "--name", - type=str, - help="Name of the batch of incidents being processed.", - ) - parser.add_argument( - "-l", - "--limit", - type=int, - help="Maximum number of incidents to process.", - default=0, - ) - parser.add_argument( - "--model", - type=str, - help="Model to use for processing.", - default=MODEL, - choices=ALLOWED_MODELS, - ) - args = parser.parse_args() - - check_args(args) - - if not os.path.isdir("output"): - os.mkdir("output") - - if args.operation == "check": - raise NotImplemented - - client: OpenAI = get_openai_client() - - if args.operation == "batch" or args.operation == "collectbatch": - if not os.path.isdir("batches"): - os.mkdir("batches") - - if args.operation == "collectbatch": - run_collect_batch(client) - return - - if args.operation == "batch": - run_processing_batch(client, args) - return - - if args.operation == "run": - run_processing_sync(client, args) - return - - parser.print_help() - - -def get_openai_client() -> OpenAI: - """Get an OpenAI client.""" - api_key: str | None = os.getenv("OPENAI_API_KEY") - - if not api_key: - log("Error: OPENAI_API_KEY environment variable not set.", print_msg=True) - exit(1) - - try: - return OpenAI(api_key=api_key) - except Exception as e: - log(f"Error: Unable to create OpenAI client: {e}", print_msg=True) - exit(1) - - -def setup_data(args: argparse.Namespace) -> tuple[list[str], str]: - raw_data: str = args.file.read().strip() - - prompt: str = args.prompt.read().strip() - prompt = prompt.replace("{{ PUBLICATION_NAME }}", args.name) - prompt_length: int = len(prompt) - - log( - f"Data and prompt loaded.", - print_msg=True, - ) - - queue: list[str] = [] - data: list[str] = raw_data.split("------") - for item in data: - if len(item) + prompt_length > CHAR_LIMIT: - log(f"Item too long: {len(item)} > {CHAR_LIMIT}.", print_msg=True) - log(f"Item:\n{item}", print_msg=True) - exit(1) - queue.append(item) - - data_length: int = sum(len(item) for item in data) - log(f"Data loaded. {len(data)} items. Total length: {data_length}.", print_msg=True) - - return queue, prompt - - -def build_messages(report: str, prompt: str) -> list[dict[str, str]]: - return [ - {"role": "system", "content": prompt}, - {"role": "user", "content": report}, - ] - - -def get_openai_request_args(model: str, messages: list[dict[str, str]]) -> dict[str, Any]: - return { - "model": model, - "response_format": {"type": "json_object"}, - "seed": 1, - "messages": messages, - "temperature": 0, - } - - -def limit_queue(queue: list[str], limit: int) -> list[str]: - if limit > 0: - stop_point: int = min(limit, len(queue)) - queue = queue[:stop_point] - return queue - - -def collect_batch_results(client: OpenAI, batch: Batch) -> str | None: - """Collect the results of a completed batch.""" - if not os.path.isdir("batches/output"): - os.mkdir("batches/output") - - output_folder_name: str = ( - normalise_file_name(batch.metadata.get("name", batch.id)) - ) - - if os.path.isdir(f"batches/output/{output_folder_name}"): - return None - - os.mkdir(f"batches/output/{output_folder_name}") - - log(f"Collecting results for batch {batch.id}.", print_msg=True) - log(f"Output folder: {output_folder_name}", print_msg=True) - - batch_output = client.files.content(batch.output_file_id) - - with open(f"batches/output/{output_folder_name}/{output_folder_name}.raw.json", "wb") as f: - f.write(batch_output.content) - - if not batch_output: - log(f"Error: unable to retrieve batch output file.", print_msg=True) - return None - - output_file_name: str = f"batches/output/{output_folder_name}/{output_folder_name}.json" - incident_file_name: str = ( - f"batches/output/{output_folder_name}/{output_folder_name}.incidents.json" - ) - - results: list[dict[str, Any]] = [ - orjson.loads(line) - for line in batch_output.content.decode("utf-8").split("\n") - if line - ] - - responses: list[str] = [] - incidents: list[dict[str, Any]] = [] - for line in results: - response: str = line["response"]["body"]["choices"][0]["message"]["content"] - - try: - response_obj: dict[str, Any] = orjson.loads(response) - incidents.extend(response_obj.get("results", [])) - except orjson.JSONDecodeError: - error_file_name: str = f"batches/output/{output_folder_name}/errors.jsonl" - log(f"Error decoding response. Saved to {error_file_name}.", print_msg=True) - with open(error_file_name, "a") as f: - f.write(response + "\n") - continue - - response_json: str = orjson.dumps(response_obj, option=orjson.OPT_INDENT_2).decode("utf-8") - line["response"]["body"]["choices"][0]["message"]["content"] = response_obj - responses.append(response_json) - - response_dict: dict[str, Any] = { - "responses": results, - } - formatted_responses: bytes = orjson.dumps(response_dict, option=orjson.OPT_INDENT_2) - - with open(output_file_name, "wb") as f: - f.write(formatted_responses) - - with open(incident_file_name, "wb") as f: - f.write(orjson.dumps(incidents, option=orjson.OPT_INDENT_2)) - - log(f"Batch output written to {output_file_name}.", print_msg=True) - - return output_file_name - - -def run_collect_batch(client: OpenAI) -> None: - """Collect completed batch results.""" - log("Collecting batch results.", print_msg=True) - batches: list[Batch] = list(client.batches.list()) - - log(f"Retrieved {len(batches)} batches from OpenAI:", print_msg=True) - for num, batch in enumerate(batches, 1): - name: str = batch.metadata.get("name", batch.id) - log(f"{num}: {name} - {batch.status}", print_msg=True) - - results_collected: list[str] = [] - for batch in batches: - if batch.status == "completed": - if result := collect_batch_results(client, batch): - results_collected.append(result) - - log(f"Results collected for {len(results_collected)} batches.", print_msg=True) - - for result in results_collected: - log(f"Collected: {result}", print_msg=True) - - log("Batch results collection complete.", print_msg=True) - - -def run_processing_batch(client: OpenAI, args: argparse.Namespace) -> None: - """Run the chat completions API in batch mode.""" - log("Starting batch run with chat completions API.", print_msg=True) - - queue, prompt = setup_data(args) - report_count: int = len(queue) - - queue = limit_queue(queue, args.limit) - num_to_process: int = len(queue) - - log( - f"Starting to queue batch requests. {report_count} reports found. " - f"Processing {num_to_process} reports.", - print_msg=True, - ) - - jobs: list[bytes] = [] - - # Build jobs - log(f"Building jobs for {num_to_process} reports.", print_msg=True) - - timestamp: str = datetime.now().strftime("%Y%m%d-%H%M%S") - for num, report in enumerate(queue, 1): - log(f"Building job for report {num} of {num_to_process}.") - - messages: list[dict[str, str]] = build_messages(report, prompt) - batch_args: dict[str, Any] = { - "custom_id": f"{normalise_file_name(args.name)}-{timestamp}-{num}", - "method": "POST", - "url": CHAT_COMPLETIONS_ENDPOINT, - "body": get_openai_request_args(args.model, messages), - } - jobs.append(orjson.dumps(batch_args) + b"\n") - - # Write jobs to file - batch_file_name: str = generate_output_file_name(args.name, ext="jsonl", batches=True) - with open(batch_file_name, "wb") as f: - for job in jobs: - f.write(job) - - log(f"Batch file written to {batch_file_name}.", print_msg=True) - - # Upload batch file to OpenAI - log("Uploading batch file to OpenAI.", print_msg=True) - batch_input: FileObject = client.files.create( - file=open(batch_file_name, "rb"), - purpose="batch", # type: ignore - ) - - if not batch_input or batch_input.status != "processed": - log(f"Error: unable to upload batch file.", print_msg=True) - exit(1) - - batch_input_json: str = batch_input.model_dump_json(indent=2) - - log(f"Batch file uploaded.", print_msg=True) - log(f"{batch_input_json = }") - - batch_input_json_file_name: str = generate_output_file_name(args.name, ext="batchinput.json", batches=True,) - - with open(batch_input_json_file_name, "w") as f: - f.write(batch_input_json) - - log(f"Batch input details written to {batch_input_json_file_name}", print_msg=True) - - # Start batch run - log("Starting batch run.", print_msg=True) - - batch_output: Batch = client.batches.create( - input_file_id=batch_input.id, - endpoint="/v1/chat/completions", - completion_window="24h", - metadata={ - "name": f"{args.name} {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", - } - ) - - if not batch_output: - log("Error: unable to start batch run.", print_msg=True) - exit(1) - - batch_output_json: str = batch_output.model_dump_json(indent=2) - - log(f"Batch run started.", print_msg=True) - log(f"{batch_output_json = }") - - batch_output_json_file_name: str = generate_output_file_name(args.name, ext="batchoutput.json", batches=True,) - with open(batch_output_json_file_name, "w") as f: - f.write(batch_output_json) - - log(f"Batch output details written to {batch_output_json_file_name}", print_msg=True) - log("Batch run processing complete.", print_msg=True) - - -def run_processing_sync(client: OpenAI, args: argparse.Namespace) -> None: - """Run the chat completions API synchronously.""" - time_now: str = datetime.now().strftime("%H:%M:%S") - log(f"Starting run with chat completions API at {time_now}.", print_msg=True) - - queue, prompt = setup_data(args) - report_count: int = len(queue) - - queue = limit_queue(queue, args.limit) - num_to_process: int = len(queue) - - log( - f"Starting chat completion loop. {report_count} reports found. " - f"Processing {num_to_process} reports.", - print_msg=True, - ) - - total_duration: timedelta = timedelta() - total_tokens: int = 0 - input_tokens: int = 0 - output_tokens: int = 0 - request_count: int = 0 - results: list[ChatCompletion] = [] - - while queue: - request_count += 1 - report: str = queue.pop(0) - - log(f"Starting request {request_count} of {num_to_process}.", print_msg=True) - log(f"Request data:\n{report}") - - messages: list[dict[str, str]] = build_messages(report, prompt) - - response: ChatCompletion | None = None - reply: str = "" - - try: - request_start_time: float = default_timer() - response = client.chat.completions.create( - **get_openai_request_args(args.model, messages), - ) - request_duration: timedelta = timedelta(seconds=default_timer() - request_start_time) - - if not response or not response.choices: - raise Exception("No response received.") - - total_duration += request_duration - total_tokens += response.usage.total_tokens - input_tokens += response.usage.prompt_tokens - output_tokens += response.usage.completion_tokens - - log(f"{response = }") - - reply: str = response.choices[0].message.content - file_name: str = generate_output_file_name(args.name, temp=True) - - with open(file_name, "a") as f: - f.write(reply) - - parsed: dict[str, Any] = orjson.loads(reply) - - if "results" not in parsed or len(parsed["results"]) == 0: - raise Exception("No results in response.") - - for incident in parsed["results"]: - results.append(incident) - - except Exception as e: - log(f"Error processing request number {request_count}.", print_msg=True) - log(f"Exception: {e}", print_msg=True) - log(f"Request data:\n{report}", print_msg=True) - log(f"{response = }", print_msg=True) - log(f"Reply:\n{reply}", print_msg=True) - log(f"Results:\n{results}", print_msg=True) - continue - - log( - f"Output from request {request_count} written to {file_name}.", - print_msg=True, - ) - log(f"Request took {fmt_delta(request_duration)} to complete.", print_msg=True) - log(f"Total duration: {fmt_delta(total_duration)}.", print_msg=True) - log(f"Cumulative cost: ${get_cost(args.model, input_tokens, output_tokens)}", print_msg=True,) - - log(f"Run complete with {request_count} requests.", print_msg=True) - log(f"Run took {fmt_delta(total_duration)} to complete.", print_msg=True) - log( - f"Total tokens used: {total_tokens}. " - f"Input: {input_tokens}. Output: {output_tokens} ", - print_msg=True, - ) - log(f"Total cost: ${get_cost(args.model, input_tokens, output_tokens)}", print_msg=True,) - - file_name = generate_output_file_name(args.name) - with open(file_name, "wb") as f: - f.write(orjson.dumps(results, option=orjson.OPT_INDENT_2)) - - log(f"Results written to {file_name}.", print_msg=True) - - -if __name__ == "__main__": - main() diff --git a/data/openai-formatter/prompt.txt b/data/openai-formatter/prompt.txt deleted file mode 100644 index 138b594..0000000 --- a/data/openai-formatter/prompt.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c307017828dbdf7f9d22615684a2049102fc53615550d23fcbfce3a161c5e95e -size 12516 diff --git a/data/pdf/ACA 1967-1970.pdf b/data/pdf/ACA 1967-1970.pdf deleted file mode 100644 index 5b3dfe20a2dd68aeaa7dd3088b0e8eff91975af2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 132 zcmWN_$qmCG5CFhGRnS1pu{MXh0hSF&NwBimk5nJOMw+XBxsQ)*a~`}a_3ZQUB3^FW znU^x&ijP{-B}*n`i@JJBw0hDS4cS~EQ^Zgp;Jq6ph{=IP Pr%7=CRq4Pu4_f>IXeubP diff --git a/data/pdf/ACA 1971.pdf b/data/pdf/ACA 1971.pdf deleted file mode 100644 index 7cb2c2aaf6b15f51555029c5a34afd05a2635ae4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 132 zcmWN?OA^8$3;@u5Pr(H&;TNE{Nkb53)OG~B@bvmN?~?cE$4j+2&#`HJ@ALMkWBuPg z`^v{tk5gD*W;uFMqmd1`u>=s|XhCABhS++d1n3ObT9`0rCJO}7V>CGz3=GjE2QlqF O`DOI4C}$n|5SAZLJSUt0 diff --git a/data/pdf/ACA 1972.pdf b/data/pdf/ACA 1972.pdf deleted file mode 100644 index 2bd65d148ee17d1ac313e96145ee6f8dd7db5e84..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 132 zcmWN`OAf;z3;@wRr{Dq&e{6CaVoX75OH`69dirKH(wmFFvyYE#$Fa{x9_&E*1sK^MLa8BD5-+9ppSM-LGkdFMiOoJf4j!07eRU_~4Lg)L_B8 On#AniA|beFh{O*T=qAYk diff --git a/data/pdf/ACA 1974.pdf b/data/pdf/ACA 1974.pdf deleted file mode 100644 index 50120e6d887a5312ed75c00221170be49afce70f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 132 zcmWN?%MrpL5CG77s-OYGe__)N?1C_(k`c_o>h)dTRo=6YmuPF9^N{l1*X>c~{=a?l zPU9)(Sx8=b8=b{$$#^&@1@>4&AVgg_D}@Qkn5>Jd+7LuwtW?1Qt3-^LZ9*5pRj1M9 NZT4>w)_D`P-+ot-C`bSR diff --git a/data/pdf/ACA 1975.pdf b/data/pdf/ACA 1975.pdf deleted file mode 100644 index d6180904e913964f74088642860099f3584e4bcf..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 132 zcmWN?%MrpL5CG6SRnUN8f$Re5hOaQAl5sHytJimVSAFxCFWJ{R`B3UU*6q=F``$KYj#v9+zMjvfU*bp*8L82TT0l1PQrUF1I N3Lf7o$?TTUi68cuClLSu diff --git a/data/pdf/ACA 1976-1979.pdf b/data/pdf/ACA 1976-1979.pdf deleted file mode 100644 index 14c7d838ea698c946f7cde2345c4373b7df9b12d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 133 zcmWN?%MrpL5CG77s-OYG@`rS@EFjFNWL(U_>h)dTRo^`3OZ2r)KBT;lb$c}4|MySc zX+9;NmE@(iIf~$p+W7-?Y8hiHIe0 OjmNhNWxUcF?Dhk6xhO#Z diff --git a/data/pdf/ACA 1980-1981.pdf b/data/pdf/ACA 1980-1981.pdf deleted file mode 100644 index bd88608a8be4bdf640cbfbff5625afe84a5cf5b9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 132 zcmWN{K@x)?3;@78uiyg}L;~V(k`M|rZFLHI=88Axm!8@TPig%bX;W`s$vQ!8_m>H1+-z`TH Or+WK|P5`qFUi|@*7b-6R diff --git a/data/pdf/ACA 1982.pdf b/data/pdf/ACA 1982.pdf deleted file mode 100644 index 4f0ffadd399a9b2e37b8fd4a05fa5a6231eb245e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 133 zcmWN?K@!3s3;@7;U%>|~B#;F9n>GbuMr}v12Vbvy*(<+$J6@)aarVX9dmFb$>GS{k z*=9YSwx3z;1=G=X)9Z@F%Zg$1S{$Qz21VfHvLj}vEG`NNL&6w5@g_o^6LJLAU3?)T On0otW0}(kSNYfAcRVCB_ diff --git a/data/pdf/ACA 1983-1984.pdf b/data/pdf/ACA 1983-1984.pdf deleted file mode 100644 index 065087ce21d0558927f507d672fe7ab478171c42..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 132 zcmWN{K@x)?3;@78ukZtv22BY524bwtw8d$thrYg^_OhpbthV>C_kP&o{iF8VbF=Ys z|CUkO%iE#iPjIcSgY7vWEmYPNifH7!a^OH986zi|qHek|0Ab*Qz>u*J O%j~TFbVvw+DY-v1#wUmX diff --git a/data/pdf/ACA 1983.pdf b/data/pdf/ACA 1983.pdf deleted file mode 100644 index 236f91755b01e9cecd56b29f81390b30c2ed2d16..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 132 zcmWN?%MrpL5CG6SRnUNeuq>Z+1MDKqsAL3luzG!$ckz4n@se$=a~?|F`?@{q-2S&u z-pY8Yd2&(LE=DIgTQcYljnShw#F*4QTXagAWw}VkUN}f^!Zj8%k(HV#^pH Oc`^H!WeCPvw~HSsmnXFV diff --git a/data/pdf/ACA 1984-1985.pdf b/data/pdf/ACA 1984-1985.pdf deleted file mode 100644 index c2ef3c999f6625eda61a27c7c9d44277d689dc0f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 132 zcmWN?OA^8$3;@u5Pr(H&ZTR;#2?1e7rDL!QPp@zDuJV>XUaHM`%AxhW&)cKQ`oDc> z&Eu)%ELmUsGCC`12+kh~*+&;Vfla_ZR|o;oZf;{BLZ)I%OfCi+DkB3%XKEl*tJ@6$ OEz-Y5MVpK_arpsB7btW9 diff --git a/data/pdf/ACA 1986.pdf b/data/pdf/ACA 1986.pdf deleted file mode 100644 index c3c7ac05ab72ad510bb7b2f68d7e2c612ef9318c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 133 zcmWN{yAi`65CG9!RnUM12OMzfhOc49HX4HzxO$W9OZLv+(fWtgInQx${%rH{Dq~;P zE9|kq`8adrH*CEnHma9s4hyEZ0*Kin8HY(2(#km#LzW&r1@Z}W_EDk=YZi9ZD_1N8 PB-%#%Wd=f%11YvYLG~wy diff --git a/data/pdf/ACA 1987.pdf b/data/pdf/ACA 1987.pdf deleted file mode 100644 index 134ed56c7424e794f8b07c54b7a1b8cd1cd06e7b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 133 zcmWN?K@!3s3;@78ujmIXB!Ncy8v>~?qtcPK2Vbvy*-Jm8_4ibdV;`%vk9ORi+nE2? zQ|8=XcI=w%1*W$$jhZklYf+Iz%#Nvm5+g`V8;keofe1Ni3FO=zxOmEl3PDH~M9Q38 OG^2fW0tgWtp!oqzyeC2c diff --git a/data/pdf/ACA 1988.pdf b/data/pdf/ACA 1988.pdf deleted file mode 100644 index b8ea216e5f3ac9e132167ff943f32b5a0347a7b9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 133 zcmWN_%MHUI5CGA;tDu3H@9u)T!LS6RBv=9akm}>pJk7=5(fUW!IghbP`E2v?Dr3D{ z&wWk(Eyux0UV7;*i&3Ki+%htGP6nu0z+7VVs7a9pyJNWR6$cHJ4H$QC6*yQO90F5O P7?;t0EHt`6n%wdSF7+m> diff --git a/data/pdf/ACA 1989.pdf b/data/pdf/ACA 1989.pdf deleted file mode 100644 index 116059a0b8e0dc3e3e4fc1fc3f21abdc92756908..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 132 zcmWN?K@!3s3;@7;U%>|~O`w$YHwmCHqtY?72Vbvy*{giC^|z^G9J*WkXyf+0>-@i- zGN=BsdX!`@n0m`bE8`k=Dwt51fB^l1C5iyDBM0D;&w_!4eO`z{0i@&!0#F7& NY5NuvXxl>c(+?{wCt?5q diff --git a/data/pdf/ACA 1990.pdf b/data/pdf/ACA 1990.pdf deleted file mode 100644 index 6fe18a26b4c29f3fcdbf12141a60f6e1d340440f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 132 zcmWN?%MrpL5CG6SRnUL|7Wt){g|(;o5XqQAP=F9ZwZ<~$0NNu$_C8yNV5wkE(dcmZlmZ22 NEoT3+j0V6*Cw^g%D1!h1 diff --git a/data/pdf/ACA 1991.pdf b/data/pdf/ACA 1991.pdf deleted file mode 100644 index 42faa4b2a9065a6541353460e0aa0eb1a7e93443..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 132 zcmWN?NfN>!5CFhCuiyiQWri)kfsqlaR8m56@b%i4zUpW8@s@3^W9~{l`no-NZvWd^ zHX1KAk4ow?V)VP5EnDaJ#GZ|@u3&}=77dvcG8l@{R>uISA%uv&`0S7ZBNWn=QegH0 Ol9>HlWz-rySn&f6bSD@9 diff --git a/data/pdf/ACA 1992.pdf b/data/pdf/ACA 1992.pdf deleted file mode 100644 index b161371b81b523b886d89bed6257568e3ac5971a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 132 zcmWN?%MrpL5CG6SRnUMzSe8$^VSyE9R5F4&SiO0dch=AB<1O1-$J~{A^mTjk+^*Y~ zwme>H9yO~|i_wdmEg5t>+3HF(I!45CFhjRnUONBMuIA!|@nqY@_iZ1+LzTHrlIvxsQ)%a~`})`Rw!Ys=VH| zlh!oel8=()rI*pO;1&(I9craWTR@GZ6*veARZywsoFjS@e6n#~A%W9Muh?w4Re70i zS6tF~D?Un6_bNuuk{cOxd$G}j*B;0OO$jFnG|q6Y20O>q2&o`sZ@fdw(SojoIs!o< N!TndHS#b+t6Ms4=C@cT~ diff --git a/data/pdf/ACA 2002-2003.pdf b/data/pdf/ACA 2002-2003.pdf deleted file mode 100644 index 13bebf8bfdc1d023d1fc6381d227e74a5348c34e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 131 zcmWN?K@!3s3;@78uiyigf(?QGhL9l4sB{E-@bz}~u%~=R>u+7>JjQO!qs`m1jODt1 zY03R%#v$39R`teu)a1bLAgNHqOu)z#1toh!5CFhCuiyiQC6GXV!z>6@Dk%qZ@b%i4zW6PDymXuMl*5>NpSMSq<$wFY zg5zoAWHgskqm!0aeB2H$9P`Eqv_@cy49R2hK9eh05+%kpr;v$vSEB_AjC%yA)*+>Y NO8S?z3(?wa)en~sDX0Jd diff --git a/data/pdf/ACA 2006.pdf b/data/pdf/ACA 2006.pdf deleted file mode 100644 index d038eb031b2885d8a49697f7d2e5e7bfeb90722c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 132 zcmWN?%MrpL5CG6SRnUL|S(Z<_frS-jR5C(xuzG!$ch%4A<1O1-$J~{A^mTjk-2S&S zZ!}(N9+lK}7o!(BTQcbOk_!iGGP+2BmJDdC2@3?rmMSMeEY?Ic6^dsv7$;OliSd+~ep{T6P$A7d)njkWfGvW%b2k-C`{^2XdL1b3{*8IOYwg5X4ICysXJG OiyHmQ8UVfBBB&p4J}B@2 diff --git a/data/pdf/ACA 2009-2010.pdf b/data/pdf/ACA 2009-2010.pdf deleted file mode 100644 index d8d0c1e4e6957c92459bf86fcda5b323b1433059..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 132 zcmWN^%MrpL5CG6SRnUN8`9r!{77%7sGD32&dUJa7uJWCIe0W>yn7a_qz8W*!s+s131SzMKY+92E@AB>M6Qh OfW_#)B0+m*vMBx=LqceK8$MB(QRx`$!qe;9yi?xV$IG@k58a)+_j!5Tbot*t zc`4)R^pN6CFr!ziE&0HEl$aoiAYLm3q^&TfO(;RBkTMfwB&I;@fq*mTHIxuMB6`^f NjP|cshSgJK^8+4oCd&W- diff --git a/data/pdf/ACA 2019-2020.pdf b/data/pdf/ACA 2019-2020.pdf deleted file mode 100644 index fd3a199c30866414a6be5026440789e6ecb87b96..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 132 zcmWN?NfN>!5CFh?UcmTX OsoU&d7LH8th}#bZ^(OlO diff --git a/data/pdf/ACA 2021-2022.pdf b/data/pdf/ACA 2021-2022.pdf deleted file mode 100644 index 91551ad920c1a3b7364601d39a356d00c3f62281..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 133 zcmWN?OA^8$3;@u5Ptgll2%pgOHV}ekMs3Gn7oJ|<=3VmEe!S*=j8ivb?tR=IO_%@s zr!0Itjh@2hg6ZgW);+t3n-3{NMv0YarOMcx#n@ay5U~iH0{P&g! None: - if not os.path.exists(out_dir): - os.makedirs(out_dir) - - # Loop through each file in the input directory - for filename in os.listdir(in_dir): - if filename.endswith(".csv"): - input_path = os.path.join(in_dir, filename) - base_filename = os.path.splitext(filename)[0] - output_path = os.path.join(out_dir, base_filename + ".txt") - log_path = os.path.join(out_dir, "txt_maker_log.txt") - process_file(input_path, output_path, log_path) - - -def process_file(input_path: str, output_path: str, log_path: str) -> None: - with open(input_path, mode="r", newline="", encoding="utf-8") as file, \ - open(output_path, mode="w", encoding="utf-8") as out_file, \ - open(log_path, mode="a", encoding="utf-8") as log_file: - - excluded_lines = 0 - reader: csv.DictReader = csv.DictReader(file) - - for row in reader: - layout = row["Layout"] - text = row["Text"].strip() - - if layout.startswith("Page number") or layout.startswith("Footer"): - log_file.write(f"{text}\n") - excluded_lines += 1 - continue - - if layout.startswith("Title") or layout.startswith("Section header") or layout.startswith("Header"): - if text.startswith("Reference") or text.startswith("Analysis"): - out_file.write(f"\n\n") - else: - out_file.write(f"\n------\n\n") - - out_file.write(text + "\n") - - -# Usage example: -input_directory = "." # Replace with your input directory path -output_directory = "txt" # Replace with your desired output directory path - - -process_csv_files(input_directory, output_directory) diff --git a/data/textract-ocr/json/ACA 1967-1970.json b/data/textract-ocr/json/ACA 1967-1970.json deleted file mode 100644 index c98d3ec..0000000 --- a/data/textract-ocr/json/ACA 1967-1970.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:54d4258e140f9693b703036a84d578dc011ecc75699e00711e24426ae53a73fd -size 19511531 diff --git a/data/textract-ocr/json/ACA 1971.json b/data/textract-ocr/json/ACA 1971.json deleted file mode 100644 index 6c1d993..0000000 --- a/data/textract-ocr/json/ACA 1971.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5af8bb0400da2ce0c320d7c2c340c57dea71abab62e248e1c676348b4841e8c1 -size 6503397 diff --git a/data/textract-ocr/json/ACA 1972.json b/data/textract-ocr/json/ACA 1972.json deleted file mode 100644 index a76dfd4..0000000 --- a/data/textract-ocr/json/ACA 1972.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5c47337fda8e8d3df92b89b67fd8eaf53d055300c395f9a4865a0824642800c0 -size 10578205 diff --git a/data/textract-ocr/json/ACA 1973.json b/data/textract-ocr/json/ACA 1973.json deleted file mode 100644 index 4c89bab..0000000 --- a/data/textract-ocr/json/ACA 1973.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b435433ffa75a6b8c6e215481b464392f8acc923e4b787e3ff7a1d3c988eef65 -size 11883861 diff --git a/data/textract-ocr/json/ACA 1974.json b/data/textract-ocr/json/ACA 1974.json deleted file mode 100644 index e4e161c..0000000 --- a/data/textract-ocr/json/ACA 1974.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec31035edf0ec06f5d1d9d4f31c5f4f776afb8a505ed0812d02d4f53de5b0b77 -size 9140259 diff --git a/data/textract-ocr/json/ACA 1975.json b/data/textract-ocr/json/ACA 1975.json deleted file mode 100644 index 054552e..0000000 --- a/data/textract-ocr/json/ACA 1975.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a11c47c25041eeaf5926a2eac43e1741ba858bbbd41e24ea55c7fd30bfd444e6 -size 15957460 diff --git a/data/textract-ocr/json/ACA 1976-1979.json b/data/textract-ocr/json/ACA 1976-1979.json deleted file mode 100644 index 201c8cc..0000000 --- a/data/textract-ocr/json/ACA 1976-1979.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:80597cb19325e5ae212a2f91b7f7fb579706aba419c1b16d39b3a42bc2646f4f -size 42248588 diff --git a/data/textract-ocr/json/ACA 1980-1981.json b/data/textract-ocr/json/ACA 1980-1981.json deleted file mode 100644 index 934f3c3..0000000 --- a/data/textract-ocr/json/ACA 1980-1981.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5b95360af51c55037d374916b0784ab7c6329e00064b1a81d3eca4ee8df2b3ce -size 35418359 diff --git a/data/textract-ocr/json/ACA 1982.json b/data/textract-ocr/json/ACA 1982.json deleted file mode 100644 index db12b50..0000000 --- a/data/textract-ocr/json/ACA 1982.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d6af7353b24338fa7dce83f40931bfb47bee7d19ed08923941827e02a56ab39f -size 31964660 diff --git a/data/textract-ocr/json/ACA 1983-1984.json b/data/textract-ocr/json/ACA 1983-1984.json deleted file mode 100644 index 0e1dd05..0000000 --- a/data/textract-ocr/json/ACA 1983-1984.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6abef9ad6330f87f396cfde782ad6ef193949dfc003f78277b00b1d41b571d29 -size 17891255 diff --git a/data/textract-ocr/json/ACA 1983.json b/data/textract-ocr/json/ACA 1983.json deleted file mode 100644 index 7302b1d..0000000 --- a/data/textract-ocr/json/ACA 1983.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d45dc2c03ad85c3cf112c0863f2ccf98eb7aa67f1f595f3b104d806b482f7901 -size 16294072 diff --git a/data/textract-ocr/json/ACA 1984-1985.json b/data/textract-ocr/json/ACA 1984-1985.json deleted file mode 100644 index 3642ac1..0000000 --- a/data/textract-ocr/json/ACA 1984-1985.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2b73596675f9c3822b893592f510245d35bae95c666a6d9a184dff1561e8978e -size 15494571 diff --git a/data/textract-ocr/json/ACA 1986.json b/data/textract-ocr/json/ACA 1986.json deleted file mode 100644 index 99c2b15..0000000 --- a/data/textract-ocr/json/ACA 1986.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ad35639985bed1406a1742edfc13923d82b66db644c9de5d1e00c3291626ab4a -size 20367412 diff --git a/data/textract-ocr/json/ACA 1987.json b/data/textract-ocr/json/ACA 1987.json deleted file mode 100644 index 1ab3488..0000000 --- a/data/textract-ocr/json/ACA 1987.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0ba806da29231d3f94682f04acb0de050b601349ce3d43b27874ce783a63bcea -size 19344438 diff --git a/data/textract-ocr/json/ACA 1988.json b/data/textract-ocr/json/ACA 1988.json deleted file mode 100644 index 95903b2..0000000 --- a/data/textract-ocr/json/ACA 1988.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5c0842619309333678b95741c0d0d39812acf77042fd754fbb3bfe238673305e -size 28016118 diff --git a/data/textract-ocr/json/ACA 1989.json b/data/textract-ocr/json/ACA 1989.json deleted file mode 100644 index e45aef5..0000000 --- a/data/textract-ocr/json/ACA 1989.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c86187968ce221ba7b8b6dc1b94e285d70441526cd5c436ff8a83dad0d4cd468 -size 24550856 diff --git a/data/textract-ocr/json/ACA 1990.json b/data/textract-ocr/json/ACA 1990.json deleted file mode 100644 index 63e9b0f..0000000 --- a/data/textract-ocr/json/ACA 1990.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9c70071ddaf01c24e47ec9541cee268a5fe1e3e2f8725bf00f466f2c05cb351c -size 26824967 diff --git a/data/textract-ocr/json/ACA 1991.json b/data/textract-ocr/json/ACA 1991.json deleted file mode 100644 index b1b83e5..0000000 --- a/data/textract-ocr/json/ACA 1991.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1ed7fd369f891c68ec5399742dc7347e8416e4862b11bd8ce5d2253c3821bf22 -size 30931662 diff --git a/data/textract-ocr/json/ACA 1992.json b/data/textract-ocr/json/ACA 1992.json deleted file mode 100644 index 81ebdd3..0000000 --- a/data/textract-ocr/json/ACA 1992.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:06016ee92b34a5565b74d13bd9fa22140edebec6c3102249327c2761811287fd -size 28942115 diff --git a/data/textract-ocr/json/ACA 1993.json b/data/textract-ocr/json/ACA 1993.json deleted file mode 100644 index d8a72dc..0000000 --- a/data/textract-ocr/json/ACA 1993.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bc30aa88f53ccab47c77aa1b9921b3fe6383b9074b0a960412587014b364aeca -size 35394872 diff --git a/data/textract-ocr/json/ACA 1994-1995.json b/data/textract-ocr/json/ACA 1994-1995.json deleted file mode 100644 index e6a493f..0000000 --- a/data/textract-ocr/json/ACA 1994-1995.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7b93f76cd44d0a09e1dc863ea7f1b94353ce098cb1a0d7f8055ff712fd898de5 -size 51162409 diff --git a/data/textract-ocr/json/ACA 1996-1998.json b/data/textract-ocr/json/ACA 1996-1998.json deleted file mode 100644 index b09e404..0000000 --- a/data/textract-ocr/json/ACA 1996-1998.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c81d0798a294915fec58e0c10cc4f41740b7e2a9159f5ec6a1b3eeb86b021675 -size 56834958 diff --git a/data/textract-ocr/json/ACA 1999-2001.json b/data/textract-ocr/json/ACA 1999-2001.json deleted file mode 100644 index d143fc7..0000000 --- a/data/textract-ocr/json/ACA 1999-2001.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f7d7803b27a92b316a990c781f761332f055c7fa8f14d4c18aa5e42f71a3f9df -size 58551803 diff --git a/data/textract-ocr/json/ACA 2002-2003.json b/data/textract-ocr/json/ACA 2002-2003.json deleted file mode 100644 index 08a5666..0000000 --- a/data/textract-ocr/json/ACA 2002-2003.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c15bbc887091f9663d6216d8770a6bdf8b9808e18ca963fbbf9411ced743871d -size 49508291 diff --git a/data/textract-ocr/json/ACA 2004-2005.json b/data/textract-ocr/json/ACA 2004-2005.json deleted file mode 100644 index 7ea865d..0000000 --- a/data/textract-ocr/json/ACA 2004-2005.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3ea7fcff11dbadf744e694710ab09d2e49528010bed234d4f42c54c297dbdfe9 -size 46664300 diff --git a/data/textract-ocr/json/ACA 2006.json b/data/textract-ocr/json/ACA 2006.json deleted file mode 100644 index 0ac9922..0000000 --- a/data/textract-ocr/json/ACA 2006.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5e79925cb3cb1769d978e6179140dfab5234d6db61ccdda0b0605984ae8e6f91 -size 18073367 diff --git a/data/textract-ocr/json/ACA 2007-2008.json b/data/textract-ocr/json/ACA 2007-2008.json deleted file mode 100644 index b71062e..0000000 --- a/data/textract-ocr/json/ACA 2007-2008.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4f53c84e75d4435af8058632238d48d48865ac2d1ad1d431847b9f3138ae9183 -size 31063398 diff --git a/data/textract-ocr/json/ACA 2009-2010.json b/data/textract-ocr/json/ACA 2009-2010.json deleted file mode 100644 index cc5d9ef..0000000 --- a/data/textract-ocr/json/ACA 2009-2010.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d1856a1578dbe3c38c74d75939a5946f7d8049b23e0c3f2f4d7f2a73a6193678 -size 25946860 diff --git a/data/textract-ocr/json/ACA 2011-2012.json b/data/textract-ocr/json/ACA 2011-2012.json deleted file mode 100644 index 0e4f996..0000000 --- a/data/textract-ocr/json/ACA 2011-2012.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9191ed6f128738d150967b2b1dfdc390e8a2d859487e4cbbbf185c12593d01ee -size 43249253 diff --git a/data/textract-ocr/json/ACA 2013-2014.json b/data/textract-ocr/json/ACA 2013-2014.json deleted file mode 100644 index 2bef225..0000000 --- a/data/textract-ocr/json/ACA 2013-2014.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5461c67d1809603a6dfa589c799fa9eb471c313b0106e904c029e38a806b50d5 -size 48800010 diff --git a/data/textract-ocr/json/ACA 2017-2018.json b/data/textract-ocr/json/ACA 2017-2018.json deleted file mode 100644 index 4bea23f..0000000 --- a/data/textract-ocr/json/ACA 2017-2018.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:99b380c582dceeb521a5986d534b49d933c38145c5b4e5c942e9095f18a9d908 -size 35913524 diff --git a/data/textract-ocr/json/ACA 2019-2020.json b/data/textract-ocr/json/ACA 2019-2020.json deleted file mode 100644 index b556f80..0000000 --- a/data/textract-ocr/json/ACA 2019-2020.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7124c4f40b08ba271a898110612941ac3199c4864db2779263f8140be5bee1c9 -size 42216243 diff --git a/data/textract-ocr/json/ACA 50th Anniversary.json b/data/textract-ocr/json/ACA 50th Anniversary.json deleted file mode 100644 index a5d2953..0000000 --- a/data/textract-ocr/json/ACA 50th Anniversary.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f30dae64237be3c90fc7297627382ae42598fd374c94a7d2a562dfbb766ff267 -size 79082330 diff --git a/data/textract-ocr/json/ACA Before 1900.json b/data/textract-ocr/json/ACA Before 1900.json deleted file mode 100644 index 6d3406d..0000000 --- a/data/textract-ocr/json/ACA Before 1900.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:63d32eaf9f748de44b2ef8fdc55f251d4d45ab4c6ae8b57f15e7c2779ea548b6 -size 4569037 diff --git a/data/textract-ocr/sort_into_columns.py b/data/textract-ocr/sort_into_columns.py deleted file mode 100644 index ec97651..0000000 --- a/data/textract-ocr/sort_into_columns.py +++ /dev/null @@ -1,215 +0,0 @@ -"""A script to sort the raw text from AWS Textract into two equal columns. - -It is designed to take the JSON output from AWS Textract for a document where there are -two (and only two) columns of text per page. It crudely splits the page from the exact -middle and outputs the text when reading from left to right. - -It also attempts (crudely) to mark dates and page numbers using very basic syntax. -""" - -import json -import os -import re - -JSON_DIR = "json/" -RESULTS_DIR = "results/" - - -class Line: - def __init__(self, text, top, left, width, height, page): - self.text = text - self.top = top - self.left = left - self.width = width - self.height = height - self.page = page - - def __repr__(self): - return self.text - - -class Page: - def __init__(self, number, lines=None): - self.number = number - - if lines is None: - self.lines = [] - else: - self.lines = lines - - def __repr__(self): - return f"Page {self.number}" - - -class Column: - def __init__(self, page, name): - self.page = page - self.name = name - self.lines = [] - - def __repr__(self): - return f"Page {self.page} {self.name} column" - - -# def get_average_spacing(blocks, page): -# """Get the average spacing between the previous and next line on the same page""" -# total_spacing = 0 -# num_lines = 0 -# for i, block in enumerate(blocks): -# if block["BlockType"] == "LINE" and block["Page"] == page: -# if i > 0 and blocks[i - 1]["BlockType"] == "LINE": -# total_spacing += ( -# block["Geometry"]["BoundingBox"]["Top"] -# - blocks[i - 1]["Geometry"]["BoundingBox"]["Top"] -# ) -# num_lines += 1 -# return total_spacing / num_lines - - -# def get_spacing_for_line(blocks, line): -# """Get the spacing for a specific line on a page""" -# for i, block in enumerate(blocks): -# if block["BlockType"] == "LINE": -# if block["Text"] == line.text: -# if i > 0 and blocks[i - 1]["BlockType"] == "LINE": -# return ( -# block["Geometry"]["BoundingBox"]["Top"] -# - blocks[i - 1]["Geometry"]["BoundingBox"]["Top"] -# ) -# return 0 - - -def filter_line(line): - """Apply various filters to lines""" - # Remove lines that are just numbers - match = re.search(r"^\d*$", line.text) - if match: - return None - - # Remove lines that begin with "NSS News" - match = re.search(r"^NSS News", line.text, re.IGNORECASE) - if match: - return None - - # Highlight lines that begin with a date - date_highlight = "\n\n********** " - # Matches type: December 27, 1983 - match = re.search(r"^\w+ \d{1,2}, \d{4}$", line.text) - if match: - line.text = date_highlight + line.text - return line - - # Matches type: Sunday, 3 December 1972 - match = re.search(r"^\w+, \d{1,2} \w+ \d{4}$", line.text) - if match: - line.text = date_highlight + line.text - return line - - # Matches type: 23 December - match = re.search(r"^\d{1,2} \w+$", line.text) - if match: - line.text = date_highlight + line.text - return line - - # Matches type: 11 July 2020 - match = re.search(r"^\d{1,2} \w+ \d{4}$", line.text) - if match: - line.text = date_highlight + line.text - return line - - # Match old report start types - # e.g. "West Virginia, Bat Cave:" - match = re.search(r"^\w+\s?\w*, [a-zA-Z]+( [a-zA-Z]+)*:", line.text) - if match: - line.text = date_highlight + line.text - return line - - return line - - -def process_lines(json_response): - """Generate a list of Line objects from the JSON response""" - lines = [] - # page_spacing = {} - # blocks = json_response["Blocks"] - - for item in json_response["Blocks"]: - if item["BlockType"] == "LINE": - line = Line( - text=item["Text"], - top=item["Geometry"]["BoundingBox"]["Top"], - left=item["Geometry"]["BoundingBox"]["Left"], - width=item["Geometry"]["BoundingBox"]["Width"], - height=item["Geometry"]["BoundingBox"]["Height"], - page=item["Page"], - ) - - line = filter_line(line) - if line: - lines.append(line) - - return lines - - -def sort_lines_into_pages(lines): - """Generate a list of Page objects from the list of Line objects""" - pages = {} - for line in lines: - if line.page in pages: - pages[line.page].lines.append(line) - else: - pages[line.page] = Page(line.page) - pages[line.page].lines.append(line) - return pages.values() - - -def sort_pages_into_columns(pages): - """Sort the pages into one left and one right column""" - pages_with_cols = [] - for page in pages: - left_col_lines = [] - right_col_lines = [] - for line in page.lines: - if line.left < 0.5: - left_col_lines.append(line) - else: - right_col_lines.append(line) - - pages_with_cols.append(Page(page.number, (left_col_lines + right_col_lines))) - return pages_with_cols - - -def process_file(report_name, file): - with open(file) as f: - response = json.load(f) - - lines = process_lines(response) - pages = sort_lines_into_pages(lines) - pages = sort_pages_into_columns(pages) - - # for page in pages: - # results_name = f"{result_dir}{page.number}.txt" - # with open(results_name, "w") as f: - # for line in page.lines: - # f.write(line.text + "\n") - - full_text_name = f"{RESULTS_DIR}{report_name}.txt" - with open(full_text_name, "w") as f: - for page in pages: - f.write("\n---------- Page " + str(page.number) + " ----------\n") - for line in page.lines: - f.write(line.text + "\n") - - -def main(): - """Get a list of files and process them""" - files = os.listdir(JSON_DIR) - for file in files: - report_name = file.split(".")[0] - print(f"Processing {report_name}...") - process_file(report_name, JSON_DIR + file) - print("Done!") - - -if __name__ == "__main__": - main() diff --git a/data/textract-ocr/txt/ACA 1967-1970.txt b/data/textract-ocr/txt/ACA 1967-1970.txt deleted file mode 100644 index cae6ea6..0000000 --- a/data/textract-ocr/txt/ACA 1967-1970.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2f30380f5a606a9aea59c0a0438b724abc7cc93f2c33c2021dfcc30b8be725bb -size 106998 diff --git a/data/textract-ocr/txt/ACA 1971.txt b/data/textract-ocr/txt/ACA 1971.txt deleted file mode 100644 index e9b0ced..0000000 --- a/data/textract-ocr/txt/ACA 1971.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b2269bab947a5c48c5dda282b75b05f6a8f7a5c97443161359c38235f7d88bdf -size 34451 diff --git a/data/textract-ocr/txt/ACA 1972.txt b/data/textract-ocr/txt/ACA 1972.txt deleted file mode 100644 index fc6a437..0000000 --- a/data/textract-ocr/txt/ACA 1972.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f860e0c785cd81812ca189c8aa2e8c26fd8cd2721dbbfc34dbca3f7d58c7ec1d -size 55108 diff --git a/data/textract-ocr/txt/ACA 1973.txt b/data/textract-ocr/txt/ACA 1973.txt deleted file mode 100644 index c2c8aeb..0000000 --- a/data/textract-ocr/txt/ACA 1973.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:66e8333b8f7f233da5e69017878f260a6272143ee2fa1c4284e3e34b549ce14b -size 64348 diff --git a/data/textract-ocr/txt/ACA 1974.txt b/data/textract-ocr/txt/ACA 1974.txt deleted file mode 100644 index d6c029e..0000000 --- a/data/textract-ocr/txt/ACA 1974.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:49e26156316f8fa90b90bc945cc11607fdf8fa51a70e4d9c7a6cce3be15adeec -size 48945 diff --git a/data/textract-ocr/txt/ACA 1975.txt b/data/textract-ocr/txt/ACA 1975.txt deleted file mode 100644 index ad833d0..0000000 --- a/data/textract-ocr/txt/ACA 1975.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8215233bcabd4d959d5119c068d706daa1f20cce21eb806565ee7fb24e57202e -size 84941 diff --git a/data/textract-ocr/txt/ACA 1976-1979.txt b/data/textract-ocr/txt/ACA 1976-1979.txt deleted file mode 100644 index 18dfd1b..0000000 --- a/data/textract-ocr/txt/ACA 1976-1979.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5c3f1d5191e20cfb186f7c641c5764bf41b19a51369ad7c05b7edff296b61754 -size 220558 diff --git a/data/textract-ocr/txt/ACA 1980-1981.txt b/data/textract-ocr/txt/ACA 1980-1981.txt deleted file mode 100644 index 78a56de..0000000 --- a/data/textract-ocr/txt/ACA 1980-1981.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:298ac405fede6f8cdc6493208fc1f95ac67b08eb85559991b4fdf5e7dad98a24 -size 194347 diff --git a/data/textract-ocr/txt/ACA 1982.txt b/data/textract-ocr/txt/ACA 1982.txt deleted file mode 100644 index 52965d4..0000000 --- a/data/textract-ocr/txt/ACA 1982.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7a63f1b78ff2e8b9e8f56349471ec4954880d8fd4cd6aca468ea991fb47e0adc -size 172681 diff --git a/data/textract-ocr/txt/ACA 1983-1984.txt b/data/textract-ocr/txt/ACA 1983-1984.txt deleted file mode 100644 index 48a170b..0000000 --- a/data/textract-ocr/txt/ACA 1983-1984.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2c404c369bbc1ab3483ed2e5273507afba9c0d46fa2fc8dafb05c859bf698325 -size 96537 diff --git a/data/textract-ocr/txt/ACA 1983.txt b/data/textract-ocr/txt/ACA 1983.txt deleted file mode 100644 index bf25419..0000000 --- a/data/textract-ocr/txt/ACA 1983.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:719def006affc6165681aada960fd325254483173fe8597a77e3cf416dadc5bd -size 88581 diff --git a/data/textract-ocr/txt/ACA 1984-1985.txt b/data/textract-ocr/txt/ACA 1984-1985.txt deleted file mode 100644 index 579d161..0000000 --- a/data/textract-ocr/txt/ACA 1984-1985.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4dc6ca16a43e84bf9d6d8622b0e1f4b4c3553b01f54582b048fe3c9fba4d1c5d -size 83058 diff --git a/data/textract-ocr/txt/ACA 1986.txt b/data/textract-ocr/txt/ACA 1986.txt deleted file mode 100644 index 8b058de..0000000 --- a/data/textract-ocr/txt/ACA 1986.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a092a262e0b6835c4363c0f06c540c751532ae7abb6a6fde8b96df87387d0a6e -size 109620 diff --git a/data/textract-ocr/txt/ACA 1987.txt b/data/textract-ocr/txt/ACA 1987.txt deleted file mode 100644 index 5f5edf4..0000000 --- a/data/textract-ocr/txt/ACA 1987.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a033425fc562312d45235686491d5d72e2b3d04568cf8a7fe07bfa1c53f36590 -size 104321 diff --git a/data/textract-ocr/txt/ACA 1988.txt b/data/textract-ocr/txt/ACA 1988.txt deleted file mode 100644 index 561979a..0000000 --- a/data/textract-ocr/txt/ACA 1988.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b42f435bef26cf1e8f80278822fb6432eb9d161c2c48ef1d2f6fb0dcbb0714de -size 150205 diff --git a/data/textract-ocr/txt/ACA 1989.txt b/data/textract-ocr/txt/ACA 1989.txt deleted file mode 100644 index f2deca7..0000000 --- a/data/textract-ocr/txt/ACA 1989.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:23fffa1da409f9ed7c80bc0ea52d76cac2bebef76e4570ecbcbc28452ff59642 -size 128366 diff --git a/data/textract-ocr/txt/ACA 1990.txt b/data/textract-ocr/txt/ACA 1990.txt deleted file mode 100644 index eb2b4c7..0000000 --- a/data/textract-ocr/txt/ACA 1990.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:acfc362d0c7ef77893d8d300b7a2c1a182df8932d935e67f71d8f58016332470 -size 142432 diff --git a/data/textract-ocr/txt/ACA 1991.txt b/data/textract-ocr/txt/ACA 1991.txt deleted file mode 100644 index 7a2f4c0..0000000 --- a/data/textract-ocr/txt/ACA 1991.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b6f16ccf32655798c5b9d3f5ff2ad3d3e6ce282eec35479d05648926493bbf78 -size 155020 diff --git a/data/textract-ocr/txt/ACA 1992.txt b/data/textract-ocr/txt/ACA 1992.txt deleted file mode 100644 index 70ffc99..0000000 --- a/data/textract-ocr/txt/ACA 1992.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2a75e8587d5bd326eb3d904416a0844efb01d7730604862154fde00dfaf88a17 -size 145681 diff --git a/data/textract-ocr/txt/ACA 1993.txt b/data/textract-ocr/txt/ACA 1993.txt deleted file mode 100644 index 02d5d61..0000000 --- a/data/textract-ocr/txt/ACA 1993.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b9ee23c5c6dd6cf4ed08d3d09ab382e5f58bba830f9479e4aed2e6b82be95ff4 -size 175312 diff --git a/data/textract-ocr/txt/ACA 1994-1995.txt b/data/textract-ocr/txt/ACA 1994-1995.txt deleted file mode 100644 index 81f9740..0000000 --- a/data/textract-ocr/txt/ACA 1994-1995.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:40a0e734a1a1b69b28a1d69d471d74a67abf73ef113ebb191d682c2b9ae56916 -size 271700 diff --git a/data/textract-ocr/txt/ACA 1996-1998.txt b/data/textract-ocr/txt/ACA 1996-1998.txt deleted file mode 100644 index e446170..0000000 --- a/data/textract-ocr/txt/ACA 1996-1998.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:786de7ae425999d958498c017520312f1b3495923b78696b5743d3efd0c52d50 -size 301149 diff --git a/data/textract-ocr/txt/ACA 1999-2001.txt b/data/textract-ocr/txt/ACA 1999-2001.txt deleted file mode 100644 index 4c1c27c..0000000 --- a/data/textract-ocr/txt/ACA 1999-2001.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5fffae409a66ccff331b5d16ccb9fbedb5897596164869cdf1cbcc2a47029f9b -size 311605 diff --git a/data/textract-ocr/txt/ACA 2002-2003.txt b/data/textract-ocr/txt/ACA 2002-2003.txt deleted file mode 100644 index 3ae68a9..0000000 --- a/data/textract-ocr/txt/ACA 2002-2003.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a5a5677f36c461171d3e982c3dcb8004ce1de64e02293809b9f3861d04dd5196 -size 265000 diff --git a/data/textract-ocr/txt/ACA 2004-2005.txt b/data/textract-ocr/txt/ACA 2004-2005.txt deleted file mode 100644 index b8d3d87..0000000 --- a/data/textract-ocr/txt/ACA 2004-2005.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:08b45a6892c390f27f70ca93b20cf1e4eba837e97a60dcd5b0581abb48446a7f -size 248820 diff --git a/data/textract-ocr/txt/ACA 2006.txt b/data/textract-ocr/txt/ACA 2006.txt deleted file mode 100644 index bdf3037..0000000 --- a/data/textract-ocr/txt/ACA 2006.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c11b0f799091219a9fa79b32beaaa82b008356df7204b16c65c7a38ea6d1201a -size 98240 diff --git a/data/textract-ocr/txt/ACA 2007-2008.txt b/data/textract-ocr/txt/ACA 2007-2008.txt deleted file mode 100644 index 9bc0953..0000000 --- a/data/textract-ocr/txt/ACA 2007-2008.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ddf681b85ea7eacfb3daa9f091f653fad2aec4889a7fa05319a5299ffb02c5e5 -size 169034 diff --git a/data/textract-ocr/txt/ACA 2009-2010.txt b/data/textract-ocr/txt/ACA 2009-2010.txt deleted file mode 100644 index 4adb4f7..0000000 --- a/data/textract-ocr/txt/ACA 2009-2010.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:325b196c825a52baf7d382f5169d3068cc9150ba94d7e4bdf6c9e9e05ec30488 -size 141110 diff --git a/data/textract-ocr/txt/ACA 2011-2012.txt b/data/textract-ocr/txt/ACA 2011-2012.txt deleted file mode 100644 index eac0036..0000000 --- a/data/textract-ocr/txt/ACA 2011-2012.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1e75b329a5c91d14041bd1539c7c2bf3962a7588c5772c7e52587f7c53160b97 -size 232895 diff --git a/data/textract-ocr/txt/ACA 2013-2014.txt b/data/textract-ocr/txt/ACA 2013-2014.txt deleted file mode 100644 index afc107b..0000000 --- a/data/textract-ocr/txt/ACA 2013-2014.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:06c0eb630141690804a7dadf02f4d35008564b83aaac0ffce365bba4d6912cd6 -size 265979 diff --git a/data/textract-ocr/txt/ACA 2017-2018.txt b/data/textract-ocr/txt/ACA 2017-2018.txt deleted file mode 100644 index d2ce509..0000000 --- a/data/textract-ocr/txt/ACA 2017-2018.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fd4a71c5e603483a3ca54ca4bd5ed1f27480006882494dbab999b091c660d9b5 -size 193372 diff --git a/data/textract-ocr/txt/ACA 2019-2020.txt b/data/textract-ocr/txt/ACA 2019-2020.txt deleted file mode 100644 index 416f157..0000000 --- a/data/textract-ocr/txt/ACA 2019-2020.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5dc031a8fa4a4bc99c9fbfc6b01213b2bff106d31beffcd94806067854e2f6ab -size 224993 diff --git a/data/textract-ocr/txt/ACA 50th Anniversary.txt b/data/textract-ocr/txt/ACA 50th Anniversary.txt deleted file mode 100644 index b6383ac..0000000 --- a/data/textract-ocr/txt/ACA 50th Anniversary.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:35100f8ebf019ad3a62b8815835cfc586dfe8bd30bb4cc7a0cc6164ca3003f61 -size 427571 diff --git a/data/textract-ocr/txt/ACA Before 1900.txt b/data/textract-ocr/txt/ACA Before 1900.txt deleted file mode 100644 index da5c0c4..0000000 --- a/data/textract-ocr/txt/ACA Before 1900.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:be38f34bf13fef5949166e143110051e2a7fa8802a5125fb74ecfcc5406aa4b3 -size 25608