Configuration Cheat Sheet

Contents

Configuration Cheat Sheet

Below is a list of all configuration options for the core modules in Auto Archiver.

Configuration File

# Module configuration


# Antibot Extractor/Enricher configuration options
antibot_extractor_enricher:
  # Save a PDF snapshot of the page.
  save_to_pdf: false
  # Maximum number of images to download from the page
  # (0 = no download, inf = no limit).
  max_download_images: 50
  # Maximum number of videos to download from the page
  # (0 = no download, inf = no limit).
  max_download_videos: 50
  # Path to the user data directory for the webdriver. This is used to persist
  # browser state, such as cookies and local storage. If you use the docker
  # deployment, `_docker` is appended to this path because the folder cannot
  # be shared between the host and the container due to user permissions.
  user_data_dir: secrets/antibot_user_data
  # Detect if the page is behind an authentication wall (e.g. login required)
  # and skip it. Disable if you want to archive pages where logins are
  # required.
  detect_auth_wall: true
  # Proxy to use for the webdriver. Format: 'SERVER:PORT' or
  # 'USER:PASS@SERVER:PORT'. Unset (null) by default.
  proxy: null


# Command Line Feeder configuration options
cli_feeder:
  # URL(s) to archive, either a single URL or a list of URLs.
  # Should not come from config.yaml, hence the explicit null here.
  urls: null


# CSV Database configuration options
csv_db:
  # CSV file name to save metadata to.
  csv_file: db.csv


# Generic Extractor configuration options
generic_extractor:
  subtitles: true  # download subtitles if available
  comments: false # download all comments if available, may lead to large metadata
  livestreams: false # if set, will download live streams, otherwise will skip them; see --max-filesize for more control
  live_from_start: false # if set, will download live streams from their earliest available moment, otherwise starts now.
  proxy: '' # http/https/socks proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port
  proxy_on_failure_only: true # Applies only if a proxy is set. In that case if this setting is True, the extractor will only use the proxy if the initial request fails; if it is False, the extractor will always use the proxy.
  end_means_success: true # if True, any archived content will mean a 'success', if False this extractor will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent extractors can retrieve.
  allow_playlist: false # If True will also download playlists, set to False if the expectation is to download a single video.
  max_downloads: inf # Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.
  # NOTE(review): the key below is spelled `bguils_po_token_method` both here and in the options table; confirm the spelling against the module before renaming it.
  bguils_po_token_method: auto # Set up a Proof of origin token provider. This process has additional requirements. See [authentication](https://auto-archiver.readthedocs.io/en/latest/how_to/authentication_how_to.html) for more information.
  extractor_args: {} # Additional arguments to pass to the yt-dlp extractor. See https://github.com/yt-dlp/yt-dlp/blob/master/README.md#extractor-arguments.
  ytdlp_update_interval: 5 # How often to check for yt-dlp updates (days). If positive, will check and update yt-dlp every [num] days. Set it to -1 to disable, or 0 to always update on every run.
  ytdlp_args: '' # Additional arguments to pass to yt-dlp, e.g. --no-check-certificate or --plugin-dirs. See yt-dlp documentation here for more information: https://github.com/yt-dlp/yt-dlp?tab=readme-ov-file#general-options Note: this is not to be confused with 'extractor_args' which are specific to the extractor itself.


# Hash Enricher configuration options
hash_enricher:
  # Hash algorithm to use.
  algorithm: SHA-256
  # Number of bytes to use when reading files in chunks (if this value is too
  # large you will run out of RAM). Default is 16MB.
  chunksize: 16000000


# HTML Formatter configuration options
html_formatter:
  # If true, will group by thumbnails generated by thumbnail enricher
  # by id 'thumbnail_00'.
  detect_thumbnails: true


# Local Storage configuration options
local_storage:
  # How to store the file in terms of directory structure: 'flat' sets to
  # root; 'url' creates a directory based on the provided URL; 'random'
  # creates a random directory.
  path_generator: flat
  # How to name stored files: 'random' creates a random string; 'static' uses
  # a hash, with the settings of the 'hash_enricher' module (defaults to
  # SHA256 if not enabled).
  filename_generator: static
  # Folder where to save archived content.
  save_to: ./local_archive
  # Whether the path to the stored file is absolute or relative in the output
  # result inc. formatters. (Warning: saving an absolute path will show your
  # computer's file structure.)
  save_absolute: false


# SSL Certificate Enricher configuration options
ssl_enricher:
  # If true, will skip enriching when no media is archived.
  skip_when_nothing_archived: true


# Thumbnail Enricher configuration options
thumbnail_enricher:
  # How many thumbnails to generate per minute of video; can be limited by
  # max_thumbnails.
  thumbnails_per_minute: 60
  # Limit the number of thumbnails to generate per video; 0 means no limit.
  max_thumbnails: 16


# Auto Archiver API Database configuration options
api_db:
  # API endpoint where calls are made to.
  api_endpoint: ''
  # API Bearer token.
  api_token: null
  # Whether the URL should be publicly available via the API.
  public: false
  # Which email to assign as author.
  author_id: null
  # Which group of users have access to the archive in case public=false as
  # author.
  group_id: null
  # If true, the API database will be queried prior to any archiving
  # operations and stop if the link has already been archived.
  use_api_cache: false
  # When set, will send the results to the API database.
  store_results: true
  # What tags to add to the archived URL.
  tags: []


# Atlos Feeder Database Storage configuration options
atlos_feeder_db_storage:
  # An Atlos API token. For more information, see
  # https://docs.atlos.org/technical/api/
  api_token: ''
  # The URL of your Atlos instance (e.g., https://platform.atlos.org),
  # without a trailing slash.
  atlos_url: https://platform.atlos.org


# CSV Feeder configuration options
csv_feeder:
  # Path to the input file(s) to read the URLs from, comma separated.
  # Input files should be formatted with one URL per line.
  files: null
  # Column number or name to read the URLs from, 0-indexed.
  column: null


# Google Drive Storage configuration options
gdrive_storage:
  # How to store the file in terms of directory structure: 'flat' sets to
  # root; 'url' creates a directory based on the provided URL; 'random'
  # creates a random directory.
  path_generator: url
  # How to name stored files: 'random' creates a random string; 'static' uses
  # a hash, with the settings of the 'hash_enricher' module (defaults to
  # SHA256 if not enabled).
  filename_generator: static
  # Root google drive folder ID to use as storage, found in URL:
  # 'https://drive.google.com/drive/folders/FOLDER_ID'
  root_folder_id: ''
  # JSON filename with Google Drive OAuth token: check auto-archiver
  # repository scripts folder for create_update_gdrive_oauth_token.py.
  # NOTE: storage used will count towards owner of GDrive folder, therefore it
  # is best to use `oauth_token` over `service_account`.
  oauth_token: null
  # Service account JSON file path, same as used for Google Sheets.
  # NOTE: storage used will count towards the developer account.
  service_account: secrets/service_account.json


# Google Sheets Feeder Database configuration options
gsheet_feeder_db:
  # Name of the sheet to archive.
  sheet: null
  # The id of the sheet to archive (alternative to 'sheet' config).
  sheet_id: null
  # Index of the header row (starts at 1).
  header: 1
  # Service account JSON file path. Learn how to create one:
  # https://gspread.readthedocs.io/en/latest/oauth2.html
  service_account: secrets/service_account.json
  # Custom names for the columns in your Google sheet. If you don't want to
  # use the default column names, change them with this setting.
  columns:
    url: link
    status: archive status
    folder: destination folder
    archive: archive location
    date: archive date
    thumbnail: thumbnail
    timestamp: upload timestamp
    title: upload title
    text: text content
    screenshot: screenshot
    hash: hash
    pdq_hash: perceptual hashes
    wacz: wacz
    replaywebpage: replaywebpage
  # A list of worksheet names that should be processed (overrides
  # block_worksheets); leave empty so all are allowed.
  allow_worksheets: !!set {}
  # A list of worksheet names for worksheets that should be explicitly
  # blocked from being processed.
  block_worksheets: !!set {}
  # If true, the stored files path will include
  # 'workbook_name/worksheet_name/...'.
  use_sheet_names_in_stored_paths: true


# Instagram API Extractor configuration options
instagram_api_extractor:
  # A valid instagrapi-api token.
  access_token: null
  # API endpoint to use.
  api_endpoint: ''
  # If true, will download all posts, tagged posts, stories, and highlights
  # for a profile; if false, will only download the profile pic and
  # information.
  full_profile: false
  # Use to limit the number of posts to download when full_profile is true or
  # when a URL for multiple posts is passed (like /stories /highlights ...).
  # 0 means no limit. When full_profile is true the order of downloaded
  # content is stories -> posts -> tagged posts -> highlights, so a value of
  # 10 could download 2 stories, 7 posts, 1 tagged posts, and 0 highlights.
  full_profile_max_posts: 0
  # If true, will remove empty values from the json output.
  minimize_json_output: true


# Instagram Extractor configuration options
instagram_extractor:
  # A valid Instagram username.
  username: ''
  # The corresponding Instagram account password.
  password: ''
  # Name of a folder to temporarily download content to.
  download_folder: instaloader
  # Path to the instagram session file which saves session credentials.
  # If one doesn't exist this gives the path to store a new one.
  session_file: secrets/instaloader.session


# Instagram Telegram Bot Extractor configuration options
instagram_tbot_extractor:
  # Telegram API_ID value, go to https://my.telegram.org/apps
  api_id: null
  # Telegram API_HASH value, go to https://my.telegram.org/apps
  api_hash: null
  # Optional, records the telegram login session for future usage;
  # '.session' will be appended to the provided value.
  session_file: secrets/anon-insta
  # Timeout to fetch the instagram content in seconds.
  timeout: 45


# Media Metadata Enricher configuration options
metadata_enricher:
  # List of lowercased metadata keys that will be included in the enriched
  # metadata. Special keys: 'author', 'datetimes', 'location' to include
  # related metadata fields. The default empty list `[]` means all metadata
  # will be included.
  look_for_keys: []


# OpenTimestamps Enricher configuration options
opentimestamps_enricher:
  # List of OpenTimestamps calendar servers to use for timestamping. See here
  # for a list of calendars maintained by opentimestamps:
  # https://opentimestamps.org/#calendars
  calendar_urls:
  - https://alice.btc.calendar.opentimestamps.org
  - https://bob.btc.calendar.opentimestamps.org
  - https://finney.calendar.eternitywall.com
  # Optional whitelist of calendar servers. Override this if you are using
  # your own calendar servers. e.g. ['https://mycalendar.com']
  calendar_whitelist: []


# S3 Storage configuration options
s3_storage:
  # How to store the file in terms of directory structure: 'flat' sets to
  # root; 'url' creates a directory based on the provided URL; 'random'
  # creates a random directory.
  path_generator: flat
  # How to name stored files: 'random' creates a random string; 'static' uses
  # a hash, with the settings of the 'hash_enricher' module (defaults to
  # SHA256 if not enabled).
  filename_generator: static
  # S3 bucket name.
  bucket: null
  # S3 region name.
  region: null
  # S3 API key.
  key: null
  # S3 API secret.
  secret: null
  # If set, it will override `path_generator`, `filename_generator` and
  # `folder`. It will check if the file already exists and if so it will not
  # upload it again. Creates a new root folder path `no-dups/`.
  random_no_duplicate: false
  # S3 bucket endpoint; {region} is inserted at runtime.
  endpoint_url: https://{region}.digitaloceanspaces.com
  # S3 CDN url; {bucket}, {region} and {key} are inserted at runtime.
  cdn_url: https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}
  # If true, S3 files will not be readable online.
  private: false


# Telethon Extractor configuration options
telethon_extractor:
  # Telegram API_ID value, go to https://my.telegram.org/apps
  api_id: null
  # Telegram API_HASH value, go to https://my.telegram.org/apps
  api_hash: null
  # Optional, but allows access to more content such as large videos;
  # talk to @botfather.
  bot_token: null
  # Path of the file to save the telegram login session for future usage;
  # '.session' will be appended to the provided path.
  session_file: secrets/anon
  # Disables the initial setup with channel_invites config, useful if you
  # have a lot and get stuck.
  # NOTE(review): with a default of true, the wording suggests that setting
  # this to false is what disables the setup — confirm against the module.
  join_channels: true
  # (JSON string) private channel invite links (format: t.me/joinchat/HASH OR
  # t.me/+HASH) and (optional but important to avoid hanging for minutes on
  # startup) channel id (format: CHANNEL_ID taken from a post url like
  # https://t.me/c/CHANNEL_ID/1); the telegram account will join any new
  # channels on setup.
  channel_invites: {}


# Timestamping Enricher configuration options
timestamping_enricher:
  # List of RFC3161 Time Stamp Authorities to use; separate with commas if
  # passed via the command line.
  tsa_urls:
  - http://timestamp.identrust.com
  - http://timestamp.ssl.trustwave.com
  - http://zeitstempel.dfn.de
  - http://ts.ssl.com
  - http://tsa.lex-persona.com/tsa
  - http://tss.cnbs.gob.hn/TSS/HttpTspServer
  # Path to a file containing trusted Certificate Authorities (CAs) in PEM
  # format. If empty, the default system authorities are used.
  cert_authorities: null
  # Whether or not to allow and save self-signed Timestamping certificates.
  # This allows for a greater range of timestamping servers to be used, but
  # they are not trusted authorities.
  allow_selfsigned: false


# Twitter API Extractor configuration options
twitter_api_extractor:
  # [deprecated: see bearer_tokens] twitter API bearer_token which is enough
  # for archiving; if not provided you will need consumer_key,
  # consumer_secret, access_token, access_secret.
  bearer_token: null
  # A list of twitter API bearer_token which is enough for archiving; if not
  # provided you will need consumer_key, consumer_secret, access_token,
  # access_secret; if provided you can still add those for better rate
  # limits. CSV of bearer tokens if provided via the command line.
  bearer_tokens: []
  # Twitter API consumer_key.
  consumer_key: null
  # Twitter API consumer_secret.
  consumer_secret: null
  # Twitter API access_token.
  access_token: null
  # Twitter API access_secret.
  access_secret: null


# WACZ Enricher (and Extractor) configuration options
wacz_extractor_enricher:
  # browsertrix-profile (for profile generation see
  # https://crawler.docs.browsertrix.com/user-guide/browser-profiles/).
  profile: null
  # If a custom docker invocation is needed.
  docker_commands: null
  # Timeout for WACZ generation in seconds.
  timeout: 120
  # If enabled all the images/videos/audio present in the WACZ archive will
  # be extracted into separate Media and appear in the html report. The .wacz
  # file will be kept untouched.
  extract_media: false
  # If enabled the screenshot captured by browsertrix will be extracted into
  # separate Media and appear in the html report. The .wacz file will be kept
  # untouched.
  extract_screenshot: true
  # SOCKS proxy host for browsertrix-crawler, use in combination with
  # socks_proxy_port. eg: user:password@host
  socks_proxy_host: null
  # SOCKS proxy port for browsertrix-crawler, use in combination with
  # socks_proxy_host. eg 1234
  socks_proxy_port: null
  # SOCKS server proxy URL, in development.
  proxy_server: null


# Wayback Machine Enricher (and Extractor) configuration options
wayback_extractor_enricher:
  # Seconds to wait for successful archive confirmation from wayback; if more
  # than this passes the result contains the job_id so the status can later
  # be checked manually.
  timeout: 15
  # Only tell wayback to archive if no archive is available before the number
  # of seconds specified; use None to ignore this option. For more
  # information:
  # https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA
  if_not_archived_within: null
  # Wayback API key. To get credentials visit
  # https://archive.org/account/s3.php
  key: ''
  # Wayback API secret. To get credentials visit
  # https://archive.org/account/s3.php
  secret: ''
  # http proxy to use for wayback requests, eg
  # http://proxy-user:password@proxy-ip:port
  proxy_http: null
  # https proxy to use for wayback requests, eg
  # https://proxy-user:password@proxy-ip:port
  proxy_https: null


# Whisper Enricher configuration options
whisper_enricher:
  # WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a
  # deployment of https://github.com/bellingcat/whisperbox-transcribe.
  api_endpoint: ''
  # WhisperApi api key for authentication.
  api_key: ''
  # Whether to include a subtitle SRT (SubRip Subtitle file) for the video
  # (can be used in video players).
  include_srt: false
  # How many seconds to wait at most for a successful job completion.
  timeout: 90
  # Which Whisper operation to execute.
  action: translate

Command Line

Configuration Options

Option	Description	Default	Type
`antibot_extractor_enricher.save_to_pdf`	Optional. save a PDF snapshot of the page.	False	bool
`antibot_extractor_enricher.max_download_images`	Optional. maximum number of images to download from the page (0 = no download, inf = no limit).	50	string
`antibot_extractor_enricher.max_download_videos`	Optional. maximum number of videos to download from the page (0 = no download, inf = no limit).	50	string
`antibot_extractor_enricher.user_data_dir`	Optional. Path to the user data directory for the webdriver. This is used to persist browser state, such as cookies and local storage. If you use the docker deployment, this path will be appended with `_docker` that is because the folder cannot be shared between the host and the container due to user permissions.	secrets/antibot_user_data	string
`antibot_extractor_enricher.detect_auth_wall`	Optional. detect if the page is behind an authentication wall (e.g. login required) and skip it. disable if you want to archive pages where logins are required.	True	bool
`antibot_extractor_enricher.proxy`	Optional. proxy to use for the webdriver, Format: ‘SERVER:PORT’ or ‘USER:PASS@SERVER:PORT’	None	string
`cli_feeder.urls`	Optional. URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml	None	string
`csv_db.csv_file`	Optional. CSV file name to save metadata to	db.csv	string
`generic_extractor.subtitles`	Optional. download subtitles if available	True	bool
`generic_extractor.comments`	Optional. download all comments if available, may lead to large metadata	False	bool
`generic_extractor.livestreams`	Optional. if set, will download live streams, otherwise will skip them; see --max-filesize for more control	False	bool
`generic_extractor.live_from_start`	Optional. if set, will download live streams from their earliest available moment, otherwise starts now.	False	bool
`generic_extractor.proxy`	Optional. http/https/socks proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port		string
`generic_extractor.proxy_on_failure_only`	Optional. Applies only if a proxy is set. In that case if this setting is True, the extractor will only use the proxy if the initial request fails; if it is False, the extractor will always use the proxy.	True	string
`generic_extractor.end_means_success`	Optional. if True, any archived content will mean a ‘success’, if False this extractor will not return a ‘success’ stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent extractors can retrieve.	True	bool
`generic_extractor.allow_playlist`	Optional. If True will also download playlists, set to False if the expectation is to download a single video.	False	bool
`generic_extractor.max_downloads`	Optional. Use to limit the number of videos to download when a channel or long page is being extracted. ‘inf’ means no limit.	inf	string
`generic_extractor.bguils_po_token_method`	Optional. Set up a Proof of origin token provider. This process has additional requirements. See authentication for more information.	auto	string
`generic_extractor.extractor_args`	Optional. Additional arguments to pass to the yt-dlp extractor. See https://github.com/yt-dlp/yt-dlp/blob/master/README.md#extractor-arguments.	{}	json_loader
`generic_extractor.ytdlp_update_interval`	Optional. How often to check for yt-dlp updates (days). If positive, will check and update yt-dlp every [num] days. Set it to -1 to disable, or 0 to always update on every run.	5	int
`generic_extractor.ytdlp_args`	Optional. Additional arguments to pass to yt-dlp, e.g. --no-check-certificate or --plugin-dirs. See yt-dlp documentation here for more information: https://github.com/yt-dlp/yt-dlp?tab=readme-ov-file#general-options Note: this is not to be confused with ‘extractor_args’ which are specific to the extractor itself.		string
`hash_enricher.algorithm`	Optional. hash algorithm to use	SHA-256	string
`hash_enricher.chunksize`	Optional. number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB	16000000	int
`html_formatter.detect_thumbnails`	Optional. if true will group by thumbnails generated by thumbnail enricher by id ‘thumbnail_00’	True	bool
`local_storage.path_generator`	Optional. how to store the file in terms of directory structure: ‘flat’ sets to root; ‘url’ creates a directory based on the provided URL; ‘random’ creates a random directory.	flat	string
`local_storage.filename_generator`	Optional. how to name stored files: ‘random’ creates a random string; ‘static’ uses a hash, with the settings of the ‘hash_enricher’ module (defaults to SHA256 if not enabled)	static	string
`local_storage.save_to`	Optional. folder where to save archived content	./local_archive	string
`local_storage.save_absolute`	Optional. whether the path to the stored file is absolute or relative in the output result inc. formatters (Warning: saving an absolute path will show your computer’s file structure)	False	bool
`ssl_enricher.skip_when_nothing_archived`	Optional. if true, will skip enriching when no media is archived	True	bool
`thumbnail_enricher.thumbnails_per_minute`	Optional. how many thumbnails to generate per minute of video, can be limited by max_thumbnails	60	int
`thumbnail_enricher.max_thumbnails`	Optional. limit the number of thumbnails to generate per video, 0 means no limit	16	int
`api_db.api_endpoint`	Required. API endpoint where calls are made to		string
`api_db.api_token`	Optional. API Bearer token.	None	string
`api_db.public`	Optional. whether the URL should be publicly available via the API	False	bool
`api_db.author_id`	Optional. which email to assign as author	None	string
`api_db.group_id`	Optional. which group of users have access to the archive in case public=false as author	None	string
`api_db.use_api_cache`	Optional. if True then the API database will be queried prior to any archiving operations and stop if the link has already been archived	False	bool
`api_db.store_results`	Optional. when set, will send the results to the API database.	True	bool
`api_db.tags`	Optional. what tags to add to the archived URL	[]	string
`atlos_feeder_db_storage.api_token`	Required. An Atlos API token. For more information, see https://docs.atlos.org/technical/api/		string
`atlos_feeder_db_storage.atlos_url`	Optional. The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.	https://platform.atlos.org	string
`csv_feeder.files`	Required. Path to the input file(s) to read the URLs from, comma separated. Input files should be formatted with one URL per line	None	valid_file
`csv_feeder.column`	Optional. Column number or name to read the URLs from, 0-indexed	None	string
`gdrive_storage.path_generator`	Optional. how to store the file in terms of directory structure: ‘flat’ sets to root; ‘url’ creates a directory based on the provided URL; ‘random’ creates a random directory.	url	string
`gdrive_storage.filename_generator`	Optional. how to name stored files: ‘random’ creates a random string; ‘static’ uses a hash, with the settings of the ‘hash_enricher’ module (defaults to SHA256 if not enabled).	static	string
`gdrive_storage.root_folder_id`	Required. root google drive folder ID to use as storage, found in URL: ‘https://drive.google.com/drive/folders/FOLDER_ID’		string
`gdrive_storage.oauth_token`	Optional. JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token over service_account.	None	string
`gdrive_storage.service_account`	Optional. service account JSON file path, same as used for Google Sheets. NOTE: storage used will count towards the developer account.	secrets/service_account.json	string
`gsheet_feeder_db.sheet`	Optional. name of the sheet to archive	None	string
`gsheet_feeder_db.sheet_id`	Optional. the id of the sheet to archive (alternative to ‘sheet’ config)	None	string
`gsheet_feeder_db.header`	Optional. index of the header row (starts at 1)	1	int
`gsheet_feeder_db.service_account`	Required. service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html	secrets/service_account.json	string
`gsheet_feeder_db.columns`	Optional. Custom names for the columns in your Google sheet. If you don’t want to use the default column names, change them with this setting	{‘url’: ‘link’, ‘status’: ‘archive status’, ‘folder’: ‘destination folder’, ‘archive’: ‘archive location’, ‘date’: ‘archive date’, ‘thumbnail’: ‘thumbnail’, ‘timestamp’: ‘upload timestamp’, ‘title’: ‘upload title’, ‘text’: ‘text content’, ‘screenshot’: ‘screenshot’, ‘hash’: ‘hash’, ‘pdq_hash’: ‘perceptual hashes’, ‘wacz’: ‘wacz’, ‘replaywebpage’: ‘replaywebpage’}	json_loader
`gsheet_feeder_db.allow_worksheets`	Optional. A list of worksheet names that should be processed (overrides block_worksheets), leave empty so all are allowed	set()	string
`gsheet_feeder_db.block_worksheets`	Optional. A list of worksheet names for worksheets that should be explicitly blocked from being processed	set()	string
`gsheet_feeder_db.use_sheet_names_in_stored_paths`	Optional. if True the stored files path will include ‘workbook_name/worksheet_name/…’	True	bool
`instagram_api_extractor.access_token`	Optional. a valid instagrapi-api token	None	string
`instagram_api_extractor.api_endpoint`	Required. API endpoint to use		string
`instagram_api_extractor.full_profile`	Optional. if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information.	False	bool
`instagram_api_extractor.full_profile_max_posts`	Optional. Use to limit the number of posts to download when full_profile is true or when a URL for multiple posts is passed (like /stories /highlights …). 0 means no limit. when full_profile is true the order of downloaded content is stories -> posts -> tagged posts -> highlights, so a value of 10 could download 2 stories, 7 posts, 1 tagged posts, and 0 highlights.	0	int
`instagram_api_extractor.minimize_json_output`	Optional. if true, will remove empty values from the json output	True	bool
`instagram_extractor.username`	Required. A valid Instagram username.		string
`instagram_extractor.password`	Required. The corresponding Instagram account password.		string
`instagram_extractor.download_folder`	Optional. Name of a folder to temporarily download content to.	instaloader	string
`instagram_extractor.session_file`	Optional. Path to the instagram session file which saves session credentials. If one doesn’t exist this gives the path to store a new one.	secrets/instaloader.session	string
`instagram_tbot_extractor.api_id`	Optional. telegram API_ID value, go to https://my.telegram.org/apps	None	string
`instagram_tbot_extractor.api_hash`	Optional. telegram API_HASH value, go to https://my.telegram.org/apps	None	string
`instagram_tbot_extractor.session_file`	Optional. Records the telegram login session for future usage, ‘.session’ will be appended to the provided value.	secrets/anon-insta	string
`instagram_tbot_extractor.timeout`	Optional. timeout to fetch the instagram content in seconds.	45	int
`metadata_enricher.look_for_keys`	Optional. list of lowercased metadata keys that will be included in the enriched metadata. Special keys: ‘author’, ‘datetimes’, ‘location’ to include related metadata fields. The default empty list `[]` means all metadata will be included.	[]	list
`opentimestamps_enricher.calendar_urls`	Optional. List of OpenTimestamps calendar servers to use for timestamping. See here for a list of calendars maintained by opentimestamps: https://opentimestamps.org/#calendars	[‘https://alice.btc.calendar.opentimestamps.org’, ‘https://bob.btc.calendar.opentimestamps.org’, ‘https://finney.calendar.eternitywall.com’]	list
`opentimestamps_enricher.calendar_whitelist`	Optional. Optional whitelist of calendar servers. Override this if you are using your own calendar servers. e.g. [‘https://mycalendar.com’]	[]	list
`s3_storage.path_generator`	Optional. how to store the file in terms of directory structure: ‘flat’ sets to root; ‘url’ creates a directory based on the provided URL; ‘random’ creates a random directory.	flat	string
`s3_storage.filename_generator`	Optional. how to name stored files: ‘random’ creates a random string; ‘static’ uses a hash, with the settings of the ‘hash_enricher’ module (defaults to SHA256 if not enabled).	static	string
`s3_storage.bucket`	Optional. S3 bucket name	None	string
`s3_storage.region`	Optional. S3 region name	None	string
`s3_storage.key`	Optional. S3 API key	None	string
`s3_storage.secret`	Optional. S3 API secret	None	string
`s3_storage.random_no_duplicate`	Optional. if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `no-dups/`	False	bool
`s3_storage.endpoint_url`	Optional. S3 bucket endpoint, {region} are inserted at runtime	https://{region}.digitaloceanspaces.com	string
`s3_storage.cdn_url`	Optional. S3 CDN url, {bucket}, {region} and {key} are inserted at runtime	https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}	string
`s3_storage.private`	Optional. if true S3 files will not be readable online	False	bool
`telethon_extractor.api_id`	Optional. telegram API_ID value, go to https://my.telegram.org/apps	None	string
`telethon_extractor.api_hash`	Optional. telegram API_HASH value, go to https://my.telegram.org/apps	None	string
`telethon_extractor.bot_token`	Optional. Allows access to more content such as large videos, talk to @botfather	None	string
`telethon_extractor.session_file`	Optional. Path of the file to save the telegram login session for future usage, ‘.session’ will be appended to the provided path.	secrets/anon	string
`telethon_extractor.join_channels`	Optional. disables the initial setup with channel_invites config, useful if you have a lot and get stuck	True	bool
`telethon_extractor.channel_invites`	Optional. (JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup	{}	json_loader
`timestamping_enricher.tsa_urls`	Optional. List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.	[‘http://timestamp.identrust.com’, ‘http://timestamp.ssl.trustwave.com’, ‘http://zeitstempel.dfn.de’, ‘http://ts.ssl.com’, ‘http://tsa.lex-persona.com/tsa’, ‘http://tss.cnbs.gob.hn/TSS/HttpTspServer’]	string
`timestamping_enricher.cert_authorities`	Optional. Path to a file containing trusted Certificate Authorities (CAs) in PEM format. If empty, the default system authorities are used.	None	string
`timestamping_enricher.allow_selfsigned`	Optional. Whether or not to allow and save self-signed Timestamping certificates. This allows for a greater range of timestamping servers to be used, but they are not trusted authorities	False	bool
`twitter_api_extractor.bearer_token`	Optional. [deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret	None	string
`twitter_api_extractor.bearer_tokens`	Optional. A list of twitter API bearer tokens, which are enough for archiving. If not provided you will need consumer_key, consumer_secret, access_token, access_secret; if provided you can still add those for better rate limits. Pass as a comma-separated list of bearer tokens if provided via the command line	[]	string
`twitter_api_extractor.consumer_key`	Optional. twitter API consumer_key	None	string
`twitter_api_extractor.consumer_secret`	Optional. twitter API consumer_secret	None	string
`twitter_api_extractor.access_token`	Optional. twitter API access_token	None	string
`twitter_api_extractor.access_secret`	Optional. twitter API access_secret	None	string
`wacz_extractor_enricher.profile`	Optional. browsertrix-profile (for profile generation see https://crawler.docs.browsertrix.com/user-guide/browser-profiles/).	None	string
`wacz_extractor_enricher.docker_commands`	Optional. if a custom docker invocation is needed	None	string
`wacz_extractor_enricher.timeout`	Optional. timeout for WACZ generation in seconds	120	int
`wacz_extractor_enricher.extract_media`	Optional. If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched.	False	bool
`wacz_extractor_enricher.extract_screenshot`	Optional. If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched.	True	bool
`wacz_extractor_enricher.socks_proxy_host`	Optional. SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host	None	string
`wacz_extractor_enricher.socks_proxy_port`	Optional. SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234	None	int
`wacz_extractor_enricher.proxy_server`	Optional. SOCKS server proxy URL, in development	None	string
`wayback_extractor_enricher.timeout`	Optional. seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually.	15	int
`wayback_extractor_enricher.if_not_archived_within`	Optional. only tell wayback to archive if no existing archive was made within the specified number of seconds, use None to ignore this option. For more information: https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA	None	string
`wayback_extractor_enricher.key`	Required. wayback API key. to get credentials visit https://archive.org/account/s3.php		string
`wayback_extractor_enricher.secret`	Required. wayback API secret. to get credentials visit https://archive.org/account/s3.php		string
`wayback_extractor_enricher.proxy_http`	Optional. http proxy to use for wayback requests, eg http://proxy-user:password@proxy-ip:port	None	string
`wayback_extractor_enricher.proxy_https`	Optional. https proxy to use for wayback requests, eg https://proxy-user:password@proxy-ip:port	None	string
`whisper_enricher.api_endpoint`	Required. WhisperApi endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of bellingcat/whisperbox-transcribe.		string
`whisper_enricher.api_key`	Required. WhisperApi key for authentication		string
`whisper_enricher.include_srt`	Optional. Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players).	False	bool
`whisper_enricher.timeout`	Optional. How many seconds to wait at most for a successful job completion.	90	int
`whisper_enricher.action`	Optional. which Whisper operation to execute	translate	string