Configuration Cheat Sheet

Contents

Configuration Cheat Sheet

Below is a list of all configurations for the core modules in Auto Archiver

Configuration File

# Module configuration


# Command Line Feeder configuration options
cli_feeder:
  urls:  # URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml


# CSV Database configuration options
csv_db:
  csv_file: db.csv  # CSV file name to save metadata to


# Generic Extractor configuration options
generic_extractor:
  subtitles: true  # download subtitles if available
  comments: false # download all comments if available, may lead to large metadata
  livestreams: false # if set, will download live streams, otherwise will skip them; see --max-filesize for more control
  live_from_start: false # if set, will download live streams from their earliest available moment, otherwise starts now.
  proxy: '' # http/socks (https seems to not work atm) proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port
  end_means_success: true # if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve.
  allow_playlist: false # If True will also download playlists, set to False if the expectation is to download a single video.
  max_downloads: inf # Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.
  extractor_args: {} # Additional arguments to pass to the yt-dlp extractor. See https://github.com/yt-dlp/yt-dlp/blob/master/README.md#extractor-arguments.
  ytdlp_update_interval: 5 # How often to check for yt-dlp updates (days). If positive, will check and update yt-dlp every [num] days. Set it to -1 to disable, or 0 to always update on every run.
  ytdlp_args: '' # Additional arguments to pass to yt-dlp, e.g. --no-check-certificate or --plugin-dirs. See the yt-dlp documentation for more information: https://github.com/yt-dlp/yt-dlp?tab=readme-ov-file#general-options Note: this is not to be confused with 'extractor_args', which are specific to the extractor itself.


# Hash Enricher configuration options
hash_enricher:
  algorithm: SHA-256  # hash algorithm to use
  chunksize: 16000000 # number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB


# HTML Formatter configuration options
html_formatter:
  detect_thumbnails: true  # if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'


# Local Storage configuration options
local_storage:
  path_generator: flat  # how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.
  filename_generator: static # how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled)
  save_to: ./local_archive # folder where to save archived content
  save_absolute: false # whether the path to the stored file is absolute or relative in the output result inc. formatters (Warning: saving an absolute path will show your computer's file structure)


# SSL Certificate Enricher configuration options
ssl_enricher:
  skip_when_nothing_archived: true  # if true, will skip enriching when no media is archived


# Thumbnail Enricher configuration options
thumbnail_enricher:
  thumbnails_per_minute: 60  # how many thumbnails to generate per minute of video, can be limited by max_thumbnails
  max_thumbnails: 16 # limit the number of thumbnails to generate per video, 0 means no limit


# Auto Archiver API Database configuration options
api_db:
  api_endpoint: ''  # API endpoint where calls are made to
  api_token: # API Bearer token.
  public: false # whether the URL should be publicly available via the API
  author_id: # which email to assign as author
  group_id: # which group of users have access to the archive in case public=false as author
  use_api_cache: false # if True then the API database will be queried prior to any archiving operations and stop if the link has already been archived
  store_results: true # when set, will send the results to the API database.
  tags: [] # what tags to add to the archived URL


# Atlos Feeder Database Storage configuration options
atlos_feeder_db_storage:
  api_token: ''  # An Atlos API token. For more information, see https://docs.atlos.org/technical/api/
  atlos_url: https://platform.atlos.org # The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.


# CSV Feeder configuration options
csv_feeder:
  files:  # Path to the input file(s) to read the URLs from, comma separated. Input files should be formatted with one URL per line
  column: # Column number or name to read the URLs from, 0-indexed


# Google Drive Storage configuration options
gdrive_storage:
  path_generator: url  # how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.
  filename_generator: static # how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled).
  root_folder_id: '' # root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'
  oauth_token: # JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token over service_account.
  service_account: secrets/service_account.json # service account JSON file path, same as used for Google Sheets. NOTE: storage used will count towards the developer account.


# Google Sheets Feeder Database configuration options
gsheet_feeder_db:
  sheet:  # name of the sheet to archive
  sheet_id: # the id of the sheet to archive (alternative to 'sheet' config)
  header: 1 # index of the header row (starts at 1)
  service_account: secrets/service_account.json # service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html
  columns: # Custom names for the columns in your Google sheet. If you don't want to use the default column names, change them with this setting
    url: link
    status: archive status
    folder: destination folder
    archive: archive location
    date: archive date
    thumbnail: thumbnail
    timestamp: upload timestamp
    title: upload title
    text: text content
    screenshot: screenshot
    hash: hash
    pdq_hash: perceptual hashes
    wacz: wacz
    replaywebpage: replaywebpage
  allow_worksheets: !!set {} # A list of worksheet names that should be processed (overrides block_worksheets), leave empty so all are allowed
  block_worksheets: !!set {} # A list of worksheet names for worksheets that should be explicitly blocked from being processed
  use_sheet_names_in_stored_paths: true # if True the stored files path will include 'workbook_name/worksheet_name/...'


# Instagram API Extractor configuration options
instagram_api_extractor:
  access_token:  # a valid instagrapi-api token
  api_endpoint: '' # API endpoint to use
  full_profile: false # if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information.
  full_profile_max_posts: 0 # Use to limit the number of posts to download when full_profile is true. 0 means no limit. limit is applied softly since posts are fetched in batch, once to: posts, tagged posts, and highlights
  minimize_json_output: true # if true, will remove empty values from the json output


# Instagram Extractor configuration options
instagram_extractor:
  username: ''  # A valid Instagram username.
  password: '' # The corresponding Instagram account password.
  download_folder: instaloader # Name of a folder to temporarily download content to.
  session_file: secrets/instaloader.session # Path to the instagram session file which saves session credentials. If one doesn't exist this gives the path to store a new one.


# Instagram Telegram Bot Extractor configuration options
instagram_tbot_extractor:
  api_id:  # telegram API_ID value, go to https://my.telegram.org/apps
  api_hash: # telegram API_HASH value, go to https://my.telegram.org/apps
  session_file: secrets/anon-insta # optional, records the telegram login session for future usage, '.session' will be appended to the provided value.
  timeout: 45 # timeout to fetch the instagram content in seconds.


# OpenTimestamps Enricher configuration options
opentimestamps_enricher:
  calendar_urls:  # List of OpenTimestamps calendar servers to use for timestamping. See here for a list of calendars maintained by opentimestamps: https://opentimestamps.org/#calendars
  - https://alice.btc.calendar.opentimestamps.org
  - https://bob.btc.calendar.opentimestamps.org
  - https://finney.calendar.eternitywall.com
  calendar_whitelist: [] # Optional whitelist of calendar servers. Override this if you are using your own calendar servers. e.g. ['https://mycalendar.com']


# S3 Storage configuration options
s3_storage:
  path_generator: flat  # how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.
  filename_generator: static # how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled).
  bucket: # S3 bucket name
  region: # S3 region name
  key: # S3 API key
  secret: # S3 API secret
  random_no_duplicate: false # if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `no-dups/`
  endpoint_url: https://{region}.digitaloceanspaces.com # S3 bucket endpoint, {region} is inserted at runtime
  cdn_url: https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key} # S3 CDN url, {bucket}, {region} and {key} are inserted at runtime
  private: false # if true S3 files will not be readable online


# Screenshot Enricher configuration options
screenshot_enricher:
  width: 1280  # width of the screenshots
  height: 1024 # height of the screenshots
  timeout: 60 # timeout for taking the screenshot
  sleep_before_screenshot: 4 # seconds to wait for the pages to load before taking screenshot
  http_proxy: '' # http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port
  save_to_pdf: false # save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter
  print_options: {} # options to pass to the pdf printer, in JSON format. See https://www.selenium.dev/documentation/webdriver/interactions/print_page/ for more information


# Telethon Extractor configuration options
telethon_extractor:
  api_id:  # telegram API_ID value, go to https://my.telegram.org/apps
  api_hash: # telegram API_HASH value, go to https://my.telegram.org/apps
  bot_token: # optional, but allows access to more content such as large videos, talk to @botfather
  session_file: secrets/anon # optional, records the telegram login session for future usage, '.session' will be appended to the provided value.
  join_channels: true # if false, disables the initial setup with channel_invites config, useful if you have a lot and get stuck
  channel_invites: {} # (JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup


# Timestamping Enricher configuration options
timestamping_enricher:
  tsa_urls:  # List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.
  - http://timestamp.identrust.com
  - http://timestamp.ssl.trustwave.com
  - http://zeitstempel.dfn.de
  - http://ts.ssl.com
  - http://tsa.lex-persona.com/tsa
  - http://tss.cnbs.gob.hn/TSS/HttpTspServer
  - http://dss.nowina.lu/pki-factory/tsa/good-tsa
  cert_authorities: # Path to a file containing trusted Certificate Authorities (CAs) in PEM format. If empty, the default system authorities are used.
  allow_selfsigned: false # Whether or not to allow and save self-signed Timestamping certificates. This allows for a greater range of timestamping servers to be used, but they are not trusted authorities


# Twitter API Extractor configuration options
twitter_api_extractor:
  bearer_token:  # [deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret
  bearer_tokens: [] #  a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line
  consumer_key: # twitter API consumer_key
  consumer_secret: # twitter API consumer_secret
  access_token: # twitter API access_token
  access_secret: # twitter API access_secret


# VKontakte Extractor configuration options
vk_extractor:
  username: ''  # valid VKontakte username
  password: '' # valid VKontakte password
  session_file: secrets/vk_config.v2.json # path to the VKontakte session file


# WACZ Enricher (and Extractor) configuration options
wacz_extractor_enricher:
  profile:  # browsertrix-profile (for profile generation see https://crawler.docs.browsertrix.com/user-guide/browser-profiles/).
  docker_commands: # if a custom docker invocation is needed
  timeout: 120 # timeout for WACZ generation in seconds
  extract_media: false # If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched.
  extract_screenshot: true # If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched.
  socks_proxy_host: # SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host
  socks_proxy_port: # SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234
  proxy_server: # SOCKS server proxy URL, in development


# Wayback Machine Enricher (and Extractor) configuration options
wayback_extractor_enricher:
  timeout: 15  # seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually.
  if_not_archived_within: # only tell wayback to archive if no archive is available before the number of seconds specified, use None to ignore this option. For more information: https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA
  key: '' # wayback API key. to get credentials visit https://archive.org/account/s3.php
  secret: '' # wayback API secret. to get credentials visit https://archive.org/account/s3.php
  proxy_http: # http proxy to use for wayback requests, eg http://proxy-user:password@proxy-ip:port
  proxy_https: # https proxy to use for wayback requests, eg https://proxy-user:password@proxy-ip:port


# Whisper Enricher configuration options
whisper_enricher:
  api_endpoint: ''  # WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe.
  api_key: '' # WhisperApi api key for authentication
  include_srt: false # Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players).
  timeout: 90 # How many seconds to wait at most for a successful job completion.
  action: translate # which Whisper operation to execute

Command Line

Configuration Options

Option	Description	Default	Type
`cli_feeder.urls`	Optional. URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml	None	string
`csv_db.csv_file`	Optional. CSV file name to save metadata to	db.csv	string
`generic_extractor.subtitles`	Optional. download subtitles if available	True	bool
`generic_extractor.comments`	Optional. download all comments if available, may lead to large metadata	False	bool
`generic_extractor.livestreams`	Optional. if set, will download live streams, otherwise will skip them; see --max-filesize for more control	False	bool
`generic_extractor.live_from_start`	Optional. if set, will download live streams from their earliest available moment, otherwise starts now.	False	bool
`generic_extractor.proxy`	Optional. http/socks (https seems to not work atm) proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port		string
`generic_extractor.end_means_success`	Optional. if True, any archived content will mean a ‘success’, if False this archiver will not return a ‘success’ stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve.	True	bool
`generic_extractor.allow_playlist`	Optional. If True will also download playlists, set to False if the expectation is to download a single video.	False	bool
`generic_extractor.max_downloads`	Optional. Use to limit the number of videos to download when a channel or long page is being extracted. ‘inf’ means no limit.	inf	string
`generic_extractor.extractor_args`	Optional. Additional arguments to pass to the yt-dlp extractor. See https://github.com/yt-dlp/yt-dlp/blob/master/README.md#extractor-arguments.	{}	json_loader
`generic_extractor.ytdlp_update_interval`	Optional. How often to check for yt-dlp updates (days). If positive, will check and update yt-dlp every [num] days. Set it to -1 to disable, or 0 to always update on every run.	5	int
`generic_extractor.ytdlp_args`	Optional. Additional arguments to pass to yt-dlp, e.g. --no-check-certificate or --plugin-dirs. See the yt-dlp documentation for more information: https://github.com/yt-dlp/yt-dlp?tab=readme-ov-file#general-options Note: this is not to be confused with ‘extractor_args’, which are specific to the extractor itself.		string
`hash_enricher.algorithm`	Optional. hash algorithm to use	SHA-256	string
`hash_enricher.chunksize`	Optional. number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB	16000000	int
`html_formatter.detect_thumbnails`	Optional. if true will group by thumbnails generated by thumbnail enricher by id ‘thumbnail_00’	True	bool
`local_storage.path_generator`	Optional. how to store the file in terms of directory structure: ‘flat’ sets to root; ‘url’ creates a directory based on the provided URL; ‘random’ creates a random directory.	flat	string
`local_storage.filename_generator`	Optional. how to name stored files: ‘random’ creates a random string; ‘static’ uses a hash, with the settings of the ‘hash_enricher’ module (defaults to SHA256 if not enabled)	static	string
`local_storage.save_to`	Optional. folder where to save archived content	./local_archive	string
`local_storage.save_absolute`	Optional. whether the path to the stored file is absolute or relative in the output result inc. formatters (Warning: saving an absolute path will show your computer’s file structure)	False	bool
`ssl_enricher.skip_when_nothing_archived`	Optional. if true, will skip enriching when no media is archived	True	bool
`thumbnail_enricher.thumbnails_per_minute`	Optional. how many thumbnails to generate per minute of video, can be limited by max_thumbnails	60	int
`thumbnail_enricher.max_thumbnails`	Optional. limit the number of thumbnails to generate per video, 0 means no limit	16	int
`api_db.api_endpoint`	Required. API endpoint where calls are made to		string
`api_db.api_token`	Optional. API Bearer token.	None	string
`api_db.public`	Optional. whether the URL should be publicly available via the API	False	bool
`api_db.author_id`	Optional. which email to assign as author	None	string
`api_db.group_id`	Optional. which group of users have access to the archive in case public=false as author	None	string
`api_db.use_api_cache`	Optional. if True then the API database will be queried prior to any archiving operations and stop if the link has already been archived	False	bool
`api_db.store_results`	Optional. when set, will send the results to the API database.	True	bool
`api_db.tags`	Optional. what tags to add to the archived URL	[]	string
`atlos_feeder_db_storage.api_token`	Required. An Atlos API token. For more information, see https://docs.atlos.org/technical/api/		string
`atlos_feeder_db_storage.atlos_url`	Optional. The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.	https://platform.atlos.org	string
`csv_feeder.files`	Required. Path to the input file(s) to read the URLs from, comma separated. Input files should be formatted with one URL per line	None	valid_file
`csv_feeder.column`	Optional. Column number or name to read the URLs from, 0-indexed	None	string
`gdrive_storage.path_generator`	Optional. how to store the file in terms of directory structure: ‘flat’ sets to root; ‘url’ creates a directory based on the provided URL; ‘random’ creates a random directory.	url	string
`gdrive_storage.filename_generator`	Optional. how to name stored files: ‘random’ creates a random string; ‘static’ uses a hash, with the settings of the ‘hash_enricher’ module (defaults to SHA256 if not enabled).	static	string
`gdrive_storage.root_folder_id`	Required. root google drive folder ID to use as storage, found in URL: ‘https://drive.google.com/drive/folders/FOLDER_ID’		string
`gdrive_storage.oauth_token`	Optional. JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token over service_account.	None	string
`gdrive_storage.service_account`	Optional. service account JSON file path, same as used for Google Sheets. NOTE: storage used will count towards the developer account.	secrets/service_account.json	string
`gsheet_feeder_db.sheet`	Optional. name of the sheet to archive	None	string
`gsheet_feeder_db.sheet_id`	Optional. the id of the sheet to archive (alternative to ‘sheet’ config)	None	string
`gsheet_feeder_db.header`	Optional. index of the header row (starts at 1)	1	int
`gsheet_feeder_db.service_account`	Required. service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html	secrets/service_account.json	string
`gsheet_feeder_db.columns`	Optional. Custom names for the columns in your Google sheet. If you don’t want to use the default column names, change them with this setting	{‘url’: ‘link’, ‘status’: ‘archive status’, ‘folder’: ‘destination folder’, ‘archive’: ‘archive location’, ‘date’: ‘archive date’, ‘thumbnail’: ‘thumbnail’, ‘timestamp’: ‘upload timestamp’, ‘title’: ‘upload title’, ‘text’: ‘text content’, ‘screenshot’: ‘screenshot’, ‘hash’: ‘hash’, ‘pdq_hash’: ‘perceptual hashes’, ‘wacz’: ‘wacz’, ‘replaywebpage’: ‘replaywebpage’}	json_loader
`gsheet_feeder_db.allow_worksheets`	Optional. A list of worksheet names that should be processed (overrides block_worksheets), leave empty so all are allowed	set()	string
`gsheet_feeder_db.block_worksheets`	Optional. A list of worksheet names for worksheets that should be explicitly blocked from being processed	set()	string
`gsheet_feeder_db.use_sheet_names_in_stored_paths`	Optional. if True the stored files path will include ‘workbook_name/worksheet_name/…’	True	bool
`instagram_api_extractor.access_token`	Optional. a valid instagrapi-api token	None	string
`instagram_api_extractor.api_endpoint`	Required. API endpoint to use		string
`instagram_api_extractor.full_profile`	Optional. if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information.	False	bool
`instagram_api_extractor.full_profile_max_posts`	Optional. Use to limit the number of posts to download when full_profile is true. 0 means no limit. limit is applied softly since posts are fetched in batch, once to: posts, tagged posts, and highlights	0	int
`instagram_api_extractor.minimize_json_output`	Optional. if true, will remove empty values from the json output	True	bool
`instagram_extractor.username`	Required. A valid Instagram username.		string
`instagram_extractor.password`	Required. The corresponding Instagram account password.		string
`instagram_extractor.download_folder`	Optional. Name of a folder to temporarily download content to.	instaloader	string
`instagram_extractor.session_file`	Optional. Path to the instagram session file which saves session credentials. If one doesn’t exist this gives the path to store a new one.	secrets/instaloader.session	string
`instagram_tbot_extractor.api_id`	Optional. telegram API_ID value, go to https://my.telegram.org/apps	None	string
`instagram_tbot_extractor.api_hash`	Optional. telegram API_HASH value, go to https://my.telegram.org/apps	None	string
`instagram_tbot_extractor.session_file`	Optional. Records the telegram login session for future usage, ‘.session’ will be appended to the provided value.	secrets/anon-insta	string
`instagram_tbot_extractor.timeout`	Optional. timeout to fetch the instagram content in seconds.	45	int
`opentimestamps_enricher.calendar_urls`	Optional. List of OpenTimestamps calendar servers to use for timestamping. See here for a list of calendars maintained by opentimestamps: https://opentimestamps.org/#calendars	[‘https://alice.btc.calendar.opentimestamps.org’, ‘https://bob.btc.calendar.opentimestamps.org’, ‘https://finney.calendar.eternitywall.com’]	list
`opentimestamps_enricher.calendar_whitelist`	Optional. Whitelist of calendar servers. Override this if you are using your own calendar servers. e.g. [‘https://mycalendar.com’]	[]	list
`s3_storage.path_generator`	Optional. how to store the file in terms of directory structure: ‘flat’ sets to root; ‘url’ creates a directory based on the provided URL; ‘random’ creates a random directory.	flat	string
`s3_storage.filename_generator`	Optional. how to name stored files: ‘random’ creates a random string; ‘static’ uses a hash, with the settings of the ‘hash_enricher’ module (defaults to SHA256 if not enabled).	static	string
`s3_storage.bucket`	Optional. S3 bucket name	None	string
`s3_storage.region`	Optional. S3 region name	None	string
`s3_storage.key`	Optional. S3 API key	None	string
`s3_storage.secret`	Optional. S3 API secret	None	string
`s3_storage.random_no_duplicate`	Optional. if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `no-dups/`	False	bool
`s3_storage.endpoint_url`	Optional. S3 bucket endpoint, {region} is inserted at runtime	https://{region}.digitaloceanspaces.com	string
`s3_storage.cdn_url`	Optional. S3 CDN url, {bucket}, {region} and {key} are inserted at runtime	https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}	string
`s3_storage.private`	Optional. if true S3 files will not be readable online	False	bool
`screenshot_enricher.width`	Optional. width of the screenshots	1280	int
`screenshot_enricher.height`	Optional. height of the screenshots	1024	int
`screenshot_enricher.timeout`	Optional. timeout for taking the screenshot	60	int
`screenshot_enricher.sleep_before_screenshot`	Optional. seconds to wait for the pages to load before taking screenshot	4	int
`screenshot_enricher.http_proxy`	Optional. http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port		string
`screenshot_enricher.save_to_pdf`	Optional. save the page as pdf along with the screenshot. PDF saving options can be adjusted with the ‘print_options’ parameter	False	bool
`screenshot_enricher.print_options`	Optional. options to pass to the pdf printer, in JSON format. See https://www.selenium.dev/documentation/webdriver/interactions/print_page/ for more information	{}	json_loader
`telethon_extractor.api_id`	Optional. telegram API_ID value, go to https://my.telegram.org/apps	None	string
`telethon_extractor.api_hash`	Optional. telegram API_HASH value, go to https://my.telegram.org/apps	None	string
`telethon_extractor.bot_token`	Optional. Allows access to more content such as large videos, talk to @botfather	None	string
`telethon_extractor.session_file`	Optional. Records the telegram login session for future usage, ‘.session’ will be appended to the provided value.	secrets/anon	string
`telethon_extractor.join_channels`	Optional. if false, disables the initial setup with channel_invites config, useful if you have a lot and get stuck	True	bool
`telethon_extractor.channel_invites`	Optional. (JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup	{}	json_loader
`timestamping_enricher.tsa_urls`	Optional. List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.	[‘http://timestamp.identrust.com’, ‘http://timestamp.ssl.trustwave.com’, ‘http://zeitstempel.dfn.de’, ‘http://ts.ssl.com’, ‘http://tsa.lex-persona.com/tsa’, ‘http://tss.cnbs.gob.hn/TSS/HttpTspServer’, ‘http://dss.nowina.lu/pki-factory/tsa/good-tsa’]	string
`timestamping_enricher.cert_authorities`	Optional. Path to a file containing trusted Certificate Authorities (CAs) in PEM format. If empty, the default system authorities are used.	None	string
`timestamping_enricher.allow_selfsigned`	Optional. Whether or not to allow and save self-signed Timestamping certificates. This allows for a greater range of timestamping servers to be used, but they are not trusted authorities	False	bool
`twitter_api_extractor.bearer_token`	Optional. [deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret	None	string
`twitter_api_extractor.bearer_tokens`	Optional. a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line	[]	string
`twitter_api_extractor.consumer_key`	Optional. twitter API consumer_key	None	string
`twitter_api_extractor.consumer_secret`	Optional. twitter API consumer_secret	None	string
`twitter_api_extractor.access_token`	Optional. twitter API access_token	None	string
`twitter_api_extractor.access_secret`	Optional. twitter API access_secret	None	string
`vk_extractor.username`	Required. valid VKontakte username		string
`vk_extractor.password`	Required. valid VKontakte password		string
`vk_extractor.session_file`	Optional. path to the VKontakte session file	secrets/vk_config.v2.json	string
`wacz_extractor_enricher.profile`	Optional. browsertrix-profile (for profile generation see https://crawler.docs.browsertrix.com/user-guide/browser-profiles/).	None	string
`wacz_extractor_enricher.docker_commands`	Optional. if a custom docker invocation is needed	None	string
`wacz_extractor_enricher.timeout`	Optional. timeout for WACZ generation in seconds	120	int
`wacz_extractor_enricher.extract_media`	Optional. If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched.	False	bool
`wacz_extractor_enricher.extract_screenshot`	Optional. If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched.	True	bool
`wacz_extractor_enricher.socks_proxy_host`	Optional. SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host	None	string
`wacz_extractor_enricher.socks_proxy_port`	Optional. SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234	None	int
`wacz_extractor_enricher.proxy_server`	Optional. SOCKS server proxy URL, in development	None	string
`wayback_extractor_enricher.timeout`	Optional. seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually.	15	int
`wayback_extractor_enricher.if_not_archived_within`	Optional. only tell wayback to archive if no archive is available before the number of seconds specified, use None to ignore this option. For more information: https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA	None	string
`wayback_extractor_enricher.key`	Required. wayback API key. to get credentials visit https://archive.org/account/s3.php		string
`wayback_extractor_enricher.secret`	Required. wayback API secret. to get credentials visit https://archive.org/account/s3.php		string
`wayback_extractor_enricher.proxy_http`	Optional. http proxy to use for wayback requests, eg http://proxy-user:password@proxy-ip:port	None	string
`wayback_extractor_enricher.proxy_https`	Optional. https proxy to use for wayback requests, eg https://proxy-user:password@proxy-ip:port	None	string
`whisper_enricher.api_endpoint`	Required. WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe.		string
`whisper_enricher.api_key`	Required. WhisperApi api key for authentication		string
`whisper_enricher.include_srt`	Optional. Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players).	False	bool
`whisper_enricher.timeout`	Optional. How many seconds to wait at most for a successful job completion.	90	int
`whisper_enricher.action`	Optional. which Whisper operation to execute	translate	string