Configuration Cheat Sheet
Below is a list of all configurations for the core modules in Auto Archiver
Configuration File
# Module configuration
# Command Line Feeder configuration options
cli_feeder:
urls: # URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml
# CSV Database configuration options
csv_db:
csv_file: db.csv # CSV file name to save metadata to
# Generic Extractor configuration options
generic_extractor:
subtitles: true # download subtitles if available
comments: false # download all comments if available, may lead to large metadata
livestreams: false # if set, will download live streams, otherwise will skip them; see --max-filesize for more control
live_from_start: false # if set, will download live streams from their earliest available moment, otherwise starts now.
proxy: '' # http/socks (https seems to not work atm) proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port
end_means_success: true # if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve.
allow_playlist: false # If True will also download playlists, set to False if the expectation is to download a single video.
max_downloads: inf # Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.
extractor_args: {} # Additional arguments to pass to the yt-dlp extractor. See https://github.com/yt-dlp/yt-dlp/blob/master/README.md#extractor-arguments.
ytdlp_update_interval: 5 # How often to check for yt-dlp updates (days). If positive, will check and update yt-dlp every [num] days. Set it to -1 to disable, or 0 to always update on every run.
ytdlp_args: '' # Additional arguments to pass to yt-dlp, e.g. --no-check-certificate or --plugin-dirs. See yt-dlp documentation here for more information: https://github.com/yt-dlp/yt-dlp?tab=readme-ov-file#general-options Note: this is not to be confused with 'extractor_args' which are specific to the extractor itself.
# Hash Enricher configuration options
hash_enricher:
algorithm: SHA-256 # hash algorithm to use
chunksize: 16000000 # number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB
# HTML Formatter configuration options
html_formatter:
detect_thumbnails: true # if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'
# Local Storage configuration options
local_storage:
path_generator: flat # how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.
filename_generator: static # how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled)
save_to: ./local_archive # folder where to save archived content
save_absolute: false # whether the path to the stored file is absolute or relative in the output result inc. formatters (Warning: saving an absolute path will show your computer's file structure)
# SSL Certificate Enricher configuration options
ssl_enricher:
skip_when_nothing_archived: true # if true, will skip enriching when no media is archived
# Thumbnail Enricher configuration options
thumbnail_enricher:
thumbnails_per_minute: 60 # how many thumbnails to generate per minute of video, can be limited by max_thumbnails
max_thumbnails: 16 # limit the number of thumbnails to generate per video, 0 means no limit
# Auto Archiver API Database configuration options
api_db:
api_endpoint: '' # API endpoint where calls are made to
api_token: # API Bearer token.
public: false # whether the URL should be publicly available via the API
author_id: # which email to assign as author
group_id: # which group of users have access to the archive in case public=false as author
use_api_cache: false # if True then the API database will be queried prior to any archiving operations and stop if the link has already been archived
store_results: true # when set, will send the results to the API database.
tags: [] # what tags to add to the archived URL
# Atlos Feeder Database Storage configuration options
atlos_feeder_db_storage:
api_token: '' # An Atlos API token. For more information, see https://docs.atlos.org/technical/api/
atlos_url: https://platform.atlos.org # The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.
# CSV Feeder configuration options
csv_feeder:
files: # Path to the input file(s) to read the URLs from, comma separated. Input files should be formatted with one URL per line
column: # Column number or name to read the URLs from, 0-indexed
# Google Drive Storage configuration options
gdrive_storage:
path_generator: url # how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.
filename_generator: static # how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled).
root_folder_id: '' # root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'
oauth_token: # JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token over service_account.
service_account: secrets/service_account.json # service account JSON file path, same as used for Google Sheets. NOTE: storage used will count towards the developer account.
# Google Sheets Feeder Database configuration options
gsheet_feeder_db:
sheet: # name of the sheet to archive
sheet_id: # the id of the sheet to archive (alternative to 'sheet' config)
header: 1 # index of the header row (starts at 1)
service_account: secrets/service_account.json # service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html
columns: # Custom names for the columns in your Google sheet. If you don't want to use the default column names, change them with this setting
url: link
status: archive status
folder: destination folder
archive: archive location
date: archive date
thumbnail: thumbnail
timestamp: upload timestamp
title: upload title
text: text content
screenshot: screenshot
hash: hash
pdq_hash: perceptual hashes
wacz: wacz
replaywebpage: replaywebpage
allow_worksheets: !!set {} # A list of worksheet names that should be processed (overrides block_worksheets), leave empty so all are allowed
block_worksheets: !!set {} # A list of worksheet names for worksheets that should be explicitly blocked from being processed
use_sheet_names_in_stored_paths: true # if True the stored files path will include 'workbook_name/worksheet_name/...'
# Instagram API Extractor configuration options
instagram_api_extractor:
access_token: # a valid instagrapi-api token
api_endpoint: '' # API endpoint to use
full_profile: false # if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information.
full_profile_max_posts: 0 # Use to limit the number of posts to download when full_profile is true. 0 means no limit. limit is applied softly since posts are fetched in batch, once to: posts, tagged posts, and highlights
minimize_json_output: true # if true, will remove empty values from the json output
# Instagram Extractor configuration options
instagram_extractor:
username: '' # A valid Instagram username.
password: '' # The corresponding Instagram account password.
download_folder: instaloader # Name of a folder to temporarily download content to.
session_file: secrets/instaloader.session # Path to the instagram session file which saves session credentials. If one doesn't exist this gives the path to store a new one.
# Instagram Telegram Bot Extractor configuration options
instagram_tbot_extractor:
api_id: # telegram API_ID value, go to https://my.telegram.org/apps
api_hash: # telegram API_HASH value, go to https://my.telegram.org/apps
session_file: secrets/anon-insta # optional, records the telegram login session for future usage, '.session' will be appended to the provided value.
timeout: 45 # timeout to fetch the instagram content in seconds.
# OpenTimestamps Enricher configuration options
opentimestamps_enricher:
calendar_urls: # List of OpenTimestamps calendar servers to use for timestamping. See here for a list of calendars maintained by opentimestamps: https://opentimestamps.org/#calendars
- https://alice.btc.calendar.opentimestamps.org
- https://bob.btc.calendar.opentimestamps.org
- https://finney.calendar.eternitywall.com
calendar_whitelist: [] # Optional whitelist of calendar servers. Override this if you are using your own calendar servers. e.g. ['https://mycalendar.com']
# S3 Storage configuration options
s3_storage:
path_generator: flat # how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.
filename_generator: static # how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled).
bucket: # S3 bucket name
region: # S3 region name
key: # S3 API key
secret: # S3 API secret
random_no_duplicate: false # if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `no-dups/`
endpoint_url: https://{region}.digitaloceanspaces.com # S3 bucket endpoint, {region} is inserted at runtime
cdn_url: https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key} # S3 CDN url, {bucket}, {region} and {key} are inserted at runtime
private: false # if true S3 files will not be readable online
# Screenshot Enricher configuration options
screenshot_enricher:
width: 1280 # width of the screenshots
height: 1024 # height of the screenshots
timeout: 60 # timeout for taking the screenshot
sleep_before_screenshot: 4 # seconds to wait for the pages to load before taking screenshot
http_proxy: '' # http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port
save_to_pdf: false # save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter
print_options: {} # options to pass to the pdf printer, in JSON format. See https://www.selenium.dev/documentation/webdriver/interactions/print_page/ for more information
# Telethon Extractor configuration options
telethon_extractor:
api_id: # telegram API_ID value, go to https://my.telegram.org/apps
api_hash: # telegram API_HASH value, go to https://my.telegram.org/apps
bot_token: # optional, but allows access to more content such as large videos, talk to @botfather
session_file: secrets/anon # optional, records the telegram login session for future usage, '.session' will be appended to the provided value.
join_channels: true # if true, joins the channels listed in channel_invites during the initial setup; set to false to disable this, useful if you have a lot and get stuck
channel_invites: {} # (JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup
# Timestamping Enricher configuration options
timestamping_enricher:
tsa_urls: # List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.
- http://timestamp.identrust.com
- http://timestamp.ssl.trustwave.com
- http://zeitstempel.dfn.de
- http://ts.ssl.com
- http://tsa.lex-persona.com/tsa
- http://tss.cnbs.gob.hn/TSS/HttpTspServer
- http://dss.nowina.lu/pki-factory/tsa/good-tsa
cert_authorities: # Path to a file containing trusted Certificate Authorities (CAs) in PEM format. If empty, the default system authorities are used.
allow_selfsigned: false # Whether or not to allow and save self-signed Timestamping certificates. This allows for a greater range of timestamping servers to be used, but they are not trusted authorities
# Twitter API Extractor configuration options
twitter_api_extractor:
bearer_token: # [deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret
bearer_tokens: [] # a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line
consumer_key: # twitter API consumer_key
consumer_secret: # twitter API consumer_secret
access_token: # twitter API access_token
access_secret: # twitter API access_secret
# VKontakte Extractor configuration options
vk_extractor:
username: '' # valid VKontakte username
password: '' # valid VKontakte password
session_file: secrets/vk_config.v2.json # path to the VKontakte session file
# WACZ Enricher (and Extractor) configuration options
wacz_extractor_enricher:
profile: # browsertrix-profile (for profile generation see https://crawler.docs.browsertrix.com/user-guide/browser-profiles/).
docker_commands: # if a custom docker invocation is needed
timeout: 120 # timeout for WACZ generation in seconds
extract_media: false # If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched.
extract_screenshot: true # If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched.
socks_proxy_host: # SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host
socks_proxy_port: # SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234
proxy_server: # SOCKS server proxy URL, in development
# Wayback Machine Enricher (and Extractor) configuration options
wayback_extractor_enricher:
timeout: 15 # seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually.
if_not_archived_within: # only tell wayback to archive if no archive is available before the number of seconds specified, use None to ignore this option. For more information: https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA
key: '' # wayback API key. to get credentials visit https://archive.org/account/s3.php
secret: '' # wayback API secret. to get credentials visit https://archive.org/account/s3.php
proxy_http: # http proxy to use for wayback requests, eg http://proxy-user:password@proxy-ip:port
proxy_https: # https proxy to use for wayback requests, eg https://proxy-user:password@proxy-ip:port
# Whisper Enricher configuration options
whisper_enricher:
api_endpoint: '' # WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe.
api_key: '' # WhisperApi api key for authentication
include_srt: false # Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players).
timeout: 90 # How many seconds to wait at most for a successful job completion.
action: translate # which Whisper operation to execute
Command Line
Configuration Options
Option |
Description |
Default |
Type |
|---|---|---|---|
|
Optional. URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml |
None |
string |
|
Optional. CSV file name to save metadata to |
db.csv |
string |
|
Optional. download subtitles if available |
True |
bool |
|
Optional. download all comments if available, may lead to large metadata |
False |
bool |
|
Optional. if set, will download live streams, otherwise will skip them; see --max-filesize for more control |
False |
bool |
|
Optional. if set, will download live streams from their earliest available moment, otherwise starts now. |
False |
bool |
|
Optional. http/socks (https seems to not work atm) proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port |
string |
|
|
Optional. if True, any archived content will mean a ‘success’, if False this archiver will not return a ‘success’ stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve. |
True |
bool |
|
Optional. If True will also download playlists, set to False if the expectation is to download a single video. |
False |
bool |
|
Optional. Use to limit the number of videos to download when a channel or long page is being extracted. ‘inf’ means no limit. |
inf |
string |
|
Optional. Additional arguments to pass to the yt-dlp extractor. See https://github.com/yt-dlp/yt-dlp/blob/master/README.md#extractor-arguments. |
{} |
json_loader |
|
Optional. How often to check for yt-dlp updates (days). If positive, will check and update yt-dlp every [num] days. Set it to -1 to disable, or 0 to always update on every run. |
5 |
int |
|
Optional. Additional arguments to pass to yt-dlp, e.g. --no-check-certificate or --plugin-dirs. See yt-dlp documentation here for more information: https://github.com/yt-dlp/yt-dlp?tab=readme-ov-file#general-options Note: this is not to be confused with ‘extractor_args’ which are specific to the extractor itself. |
string |
|
|
Optional. hash algorithm to use |
SHA-256 |
string |
|
Optional. number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB |
16000000 |
int |
|
Optional. if true will group by thumbnails generated by thumbnail enricher by id ‘thumbnail_00’ |
True |
bool |
|
Optional. how to store the file in terms of directory structure: ‘flat’ sets to root; ‘url’ creates a directory based on the provided URL; ‘random’ creates a random directory. |
flat |
string |
|
Optional. how to name stored files: ‘random’ creates a random string; ‘static’ uses a hash, with the settings of the ‘hash_enricher’ module (defaults to SHA256 if not enabled) |
static |
string |
|
Optional. folder where to save archived content |
./local_archive |
string |
|
Optional. whether the path to the stored file is absolute or relative in the output result inc. formatters (Warning: saving an absolute path will show your computer’s file structure) |
False |
bool |
|
Optional. if true, will skip enriching when no media is archived |
True |
bool |
|
Optional. how many thumbnails to generate per minute of video, can be limited by max_thumbnails |
60 |
int |
|
Optional. limit the number of thumbnails to generate per video, 0 means no limit |
16 |
int |
|
Required. API endpoint where calls are made to |
string |
|
|
Optional. API Bearer token. |
None |
string |
|
Optional. whether the URL should be publicly available via the API |
False |
bool |
|
Optional. which email to assign as author |
None |
string |
|
Optional. which group of users have access to the archive in case public=false as author |
None |
string |
|
Optional. if True then the API database will be queried prior to any archiving operations and stop if the link has already been archived |
False |
bool |
|
Optional. when set, will send the results to the API database. |
True |
bool |
|
Optional. what tags to add to the archived URL |
[] |
string |
|
Required. An Atlos API token. For more information, see https://docs.atlos.org/technical/api/ |
string |
|
|
Optional. The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash. |
string |
|
|
Required. Path to the input file(s) to read the URLs from, comma separated. Input files should be formatted with one URL per line |
None |
valid_file |
|
Optional. Column number or name to read the URLs from, 0-indexed |
None |
string |
|
Optional. how to store the file in terms of directory structure: ‘flat’ sets to root; ‘url’ creates a directory based on the provided URL; ‘random’ creates a random directory. |
url |
string |
|
Optional. how to name stored files: ‘random’ creates a random string; ‘static’ uses a hash, with the settings of the ‘hash_enricher’ module (defaults to SHA256 if not enabled). |
static |
string |
|
Required. root google drive folder ID to use as storage, found in URL: ‘https://drive.google.com/drive/folders/FOLDER_ID’ |
string |
|
|
Optional. JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token over service_account. |
None |
string |
|
Optional. service account JSON file path, same as used for Google Sheets. NOTE: storage used will count towards the developer account. |
secrets/service_account.json |
string |
|
Optional. name of the sheet to archive |
None |
string |
|
Optional. the id of the sheet to archive (alternative to ‘sheet’ config) |
None |
string |
|
Optional. index of the header row (starts at 1) |
1 |
int |
|
Required. service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html |
secrets/service_account.json |
string |
|
Optional. Custom names for the columns in your Google sheet. If you don’t want to use the default column names, change them with this setting |
{‘url’: ‘link’, ‘status’: ‘archive status’, ‘folder’: ‘destination folder’, ‘archive’: ‘archive location’, ‘date’: ‘archive date’, ‘thumbnail’: ‘thumbnail’, ‘timestamp’: ‘upload timestamp’, ‘title’: ‘upload title’, ‘text’: ‘text content’, ‘screenshot’: ‘screenshot’, ‘hash’: ‘hash’, ‘pdq_hash’: ‘perceptual hashes’, ‘wacz’: ‘wacz’, ‘replaywebpage’: ‘replaywebpage’} |
json_loader |
|
Optional. A list of worksheet names that should be processed (overrides block_worksheets), leave empty so all are allowed |
set() |
string |
|
Optional. A list of worksheet names for worksheets that should be explicitly blocked from being processed |
set() |
string |
|
Optional. if True the stored files path will include ‘workbook_name/worksheet_name/…’ |
True |
bool |
|
Optional. a valid instagrapi-api token |
None |
string |
|
Required. API endpoint to use |
string |
|
|
Optional. if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information. |
False |
bool |
|
Optional. Use to limit the number of posts to download when full_profile is true. 0 means no limit. limit is applied softly since posts are fetched in batch, once to: posts, tagged posts, and highlights |
0 |
int |
|
Optional. if true, will remove empty values from the json output |
True |
bool |
|
Required. A valid Instagram username. |
string |
|
|
Required. The corresponding Instagram account password. |
string |
|
|
Optional. Name of a folder to temporarily download content to. |
instaloader |
string |
|
Optional. Path to the instagram session file which saves session credentials. If one doesn’t exist this gives the path to store a new one. |
secrets/instaloader.session |
string |
|
Optional. telegram API_ID value, go to https://my.telegram.org/apps |
None |
string |
|
Optional. telegram API_HASH value, go to https://my.telegram.org/apps |
None |
string |
|
Optional. records the telegram login session for future usage, ‘.session’ will be appended to the provided value. |
secrets/anon-insta |
string |
|
Optional. timeout to fetch the instagram content in seconds. |
45 |
int |
|
Optional. List of OpenTimestamps calendar servers to use for timestamping. See here for a list of calendars maintained by opentimestamps: https://opentimestamps.org/#calendars |
[‘https://alice.btc.calendar.opentimestamps.org’, ‘https://bob.btc.calendar.opentimestamps.org’, ‘https://finney.calendar.eternitywall.com’] |
list |
|
Optional. Optional whitelist of calendar servers. Override this if you are using your own calendar servers. e.g. [‘https://mycalendar.com’] |
[] |
list |
|
Optional. how to store the file in terms of directory structure: ‘flat’ sets to root; ‘url’ creates a directory based on the provided URL; ‘random’ creates a random directory. |
flat |
string |
|
Optional. how to name stored files: ‘random’ creates a random string; ‘static’ uses a hash, with the settings of the ‘hash_enricher’ module (defaults to SHA256 if not enabled). |
static |
string |
|
Optional. S3 bucket name |
None |
string |
|
Optional. S3 region name |
None |
string |
|
Optional. S3 API key |
None |
string |
|
Optional. S3 API secret |
None |
string |
|
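Optional. if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `no-dups/` |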
False |
bool |
|
Optional. S3 bucket endpoint, {region} is inserted at runtime |
https://{region}.digitaloceanspaces.com |
string |
|
Optional. S3 CDN url, {bucket}, {region} and {key} are inserted at runtime |
https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key} |
string |
|
Optional. if true S3 files will not be readable online |
False |
bool |
|
Optional. width of the screenshots |
1280 |
int |
|
Optional. height of the screenshots |
1024 |
int |
|
Optional. timeout for taking the screenshot |
60 |
int |
|
Optional. seconds to wait for the pages to load before taking screenshot |
4 |
int |
|
Optional. http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port |
string |
|
|
Optional. save the page as pdf along with the screenshot. PDF saving options can be adjusted with the ‘print_options’ parameter |
False |
bool |
|
Optional. options to pass to the pdf printer, in JSON format. See https://www.selenium.dev/documentation/webdriver/interactions/print_page/ for more information |
{} |
json_loader |
|
Optional. telegram API_ID value, go to https://my.telegram.org/apps |
None |
string |
|
Optional. telegram API_HASH value, go to https://my.telegram.org/apps |
None |
string |
|
Optional. allows access to more content such as large videos, talk to @botfather |
None |
string |
|
Optional. records the telegram login session for future usage, ‘.session’ will be appended to the provided value. |
secrets/anon |
string |
|
Optional. if true, joins the channels listed in channel_invites during the initial setup; set to false to disable this, useful if you have a lot and get stuck |
True |
bool |
|
Optional. (JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup |
{} |
json_loader |
|
Optional. List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line. |
[‘http://timestamp.identrust.com’, ‘http://timestamp.ssl.trustwave.com’, ‘http://zeitstempel.dfn.de’, ‘http://ts.ssl.com’, ‘http://tsa.lex-persona.com/tsa’, ‘http://tss.cnbs.gob.hn/TSS/HttpTspServer’, ‘http://dss.nowina.lu/pki-factory/tsa/good-tsa’] |
string |
|
Optional. Path to a file containing trusted Certificate Authorities (CAs) in PEM format. If empty, the default system authorities are used. |
None |
string |
|
Optional. Whether or not to allow and save self-signed Timestamping certificates. This allows for a greater range of timestamping servers to be used, but they are not trusted authorities |
False |
bool |
|
Optional. [deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret |
None |
string |
|
Optional. a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line |
[] |
string |
|
Optional. twitter API consumer_key |
None |
string |
|
Optional. twitter API consumer_secret |
None |
string |
|
Optional. twitter API access_token |
None |
string |
|
Optional. twitter API access_secret |
None |
string |
|
Required. valid VKontakte username |
string |
|
|
Required. valid VKontakte password |
string |
|
|
Optional. path to the VKontakte session file |
secrets/vk_config.v2.json |
string |
|
Optional. browsertrix-profile (for profile generation see https://crawler.docs.browsertrix.com/user-guide/browser-profiles/). |
None |
string |
|
Optional. if a custom docker invocation is needed |
None |
string |
|
Optional. timeout for WACZ generation in seconds |
120 |
int |
|
Optional. If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched. |
False |
bool |
|
Optional. If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched. |
True |
bool |
|
Optional. SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host |
None |
string |
|
Optional. SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234 |
None |
int |
|
Optional. SOCKS server proxy URL, in development |
None |
string |
|
Optional. seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually. |
15 |
int |
|
Optional. only tell wayback to archive if no archive is available before the number of seconds specified, use None to ignore this option. For more information: https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA |
None |
string |
|
Required. wayback API key. to get credentials visit https://archive.org/account/s3.php |
string |
|
|
Required. wayback API secret. to get credentials visit https://archive.org/account/s3.php |
string |
|
|
Optional. http proxy to use for wayback requests, eg http://proxy-user:password@proxy-ip:port |
None |
string |
|
Optional. https proxy to use for wayback requests, eg https://proxy-user:password@proxy-ip:port |
None |
string |
|
Required. WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe. |
string |
|
|
Required. WhisperApi api key for authentication |
string |
|
|
Optional. Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players). |
False |
bool |
|
Optional. How many seconds to wait at most for a successful job completion. |
90 |
int |
|
Optional. which Whisper operation to execute |
translate |
string |