Added exporter.py and documentation

2020-12-29 16:37:57 -06:00 · 2020-12-29 16:37:57 -06:00 · 8c4550f307
commit 8c4550f307
parent 23385c2f01
5 changed files with 439 additions and 82 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1 +1,2 @@
-/.env
+.env
+exports
--- a/README.md
+++ b/README.md
@ -1,5 +1,62 @@
 # slack-exporter

-A Slack app for exporting messages and file attachments from public and private channels.
+A Slack bot and standalone script for exporting messages and file attachments from public and private channels, using Slack's new Conversations API.

-Note that Slack provides a similar service for workspace admins at [https://my.slack.com/services/export](https://my.slack.com/services/export) (where `my` can be replaced with your full workspace name to access workspace different than your default). However, it can only access public channels, while `slack-exporter` can be added to any channel.
+A similar service is provided by Slack for workspace admins at [https://my.slack.com/services/export](https://my.slack.com/services/export) (where `my` can be replaced with your full workspace name to refer to a workspace other than your default). However, it can only access public channels, while `slack-exporter` can retrieve data from any channel accessible to your user account.
+
+## Authentication with Slack
+
+There are two ways to use `slack-exporter` (detailed below). Both require a Slack API token to be able to communicate with your workspace.
+
+1. Visit [https://api.slack.com/apps/](https://api.slack.com/apps/) and sign in to your workspace.
+2. Click `Create New App`, enter a name (e.g., `Slack Exporter`), and select your workspace.
+3. In the left-hand panel, navigate to `OAuth & Permissions`, and scroll to `User Token Scopes` (**not** `Bot Token Scopes`).
+4. Select the following permissions: 
+    - `channels:read`, `channels:history`
+    - `groups:read`, `groups:history`
+    - `mpim:read`, `mpim:history`
+    - `im:read`, `im:history`
+    - `users:read`
+5. Select `Install to Workspace` at the top of that page (or `Reinstall to Workspace` if you have done this previously) and accept at the prompt.
+6. Copy the `OAuth Access Token` (which will generally start with `xoxp` for user-level permissions).
+
+## Usage
+
+### As a standalone script
+
+`exporter.py` can create an archive of all conversation history in your workspace which is accessible to your user account.
+
+1. Run the following, replacing the value with the user token you obtained in the [Authentication with Slack](#authentication-with-slack) section above:
+
+    ```shell script
+    export SLACK_USER_TOKEN=xoxp-xxxxxxxxxxxxx...
+    ```
+
+2. Run `python exporter.py --help` to view the available export options.
+
+### As a Slack bot
+
+`bot.py` is a Slack bot that responds to "slash commands" in Slack channels (e.g., `/export-channel`). To connect the bot to the Slack app generated in [Authentication with Slack](#authentication-with-slack), create a file named `.env` in the root directory of this repo, and add the following line:
+
+```text
+SLACK_USER_TOKEN = xoxp-xxxxxxxxxxxxx...
+``` 
+
+Save this file and run the Flask application in `bot.py` so that it is reachable from the Internet. This can be done by deploying it to a hosting platform (e.g., Heroku) or by using the ngrok service, which assigns your `localhost` server a public URL.
+
+To use the ngrok method:
+
+1. [Download](https://ngrok.com/download) the appropriate binary.
+2. Run `python bot.py`
+3. Run the ngrok binary with `path/to/ngrok http 5000`, where `5000` is the port on which the Flask application (step 2) is running. Copy the forwarding HTTPS address provided.
+
+Return to the Slack app you created in [Authentication with Slack](#authentication-with-slack) and navigate to the `Slash Commands` page in the sidebar. Create the following slash commands (one for each applicable Flask route in `bot.py`):
+
+| Command         | Request URL                               | Arguments    | Example Usage        |
+|-----------------|-------------------------------------------|--------------|----------------------|
+| /export-channel | https://`[host_url]`/slack/export-channel | json \| text | /export-channel text |
+| /export-replies | https://`[host_url]`/slack/export-replies | json \| text | /export-replies json |
+
+where, if using ngrok, `[host_url]` would be replaced with something like `xxxxxxxxxxxx.ngrok.io`.
+
+Navigate back to `OAuth & Permissions` and click `(Re)install to Workspace` to add these slash commands to the workspace.
--- a/bot.py
+++ b/bot.py
@ -1,42 +1,18 @@
 import os
-import ssl
-import slack
-from slack.errors import SlackApiError
 import requests
-from dotenv import load_dotenv
 from flask import Flask, request, Response
 from urllib.parse import urljoin
 from uuid import uuid4
 import json
-from datetime import datetime
-
-ssl_context = ssl.create_default_context()
-ssl_context.check_hostname = False
-ssl_context.verify_mode = ssl.CERT_NONE
+from dotenv import load_dotenv
+from exporter import parse_replies, parse_channel_history

 app = Flask(__name__)
 load_dotenv(os.path.join(app.root_path, '.env'))
-client = slack.WebClient(token=os.environ['SLACK_BOT_TOKEN'], ssl=ssl_context)


 # chat write interactions

-def send_to_channel(channel_id, text):
-    try:
-        client.chat_postMessage(channel=channel_id, text=text)
-    except SlackApiError as e:
-        print(e)
-        pass
-
-
-def send_to_user(user_id, text):
-    try:
-        client.chat_postMessage(channel=user_id, text=text, as_user=True)
-    except SlackApiError as e:
-        print(e)
-        pass
-
-
 def post_response(response_url, text):
    requests.post(response_url, json={'text': text})

@ -72,7 +48,10 @@ def paginated_get(url, params, response_url, combine_key=None):
    result = []
    while True:
        next_cursor, data = get_at_cursor(url, params, response_url, cursor=next_cursor)
-        result.extend(data) if combine_key is None else result.extend(data[combine_key])
+        try:
+            result.extend(data) if combine_key is None else result.extend(data[combine_key])
+        except KeyError:
+            post_response(response_url, "Sorry! I got an unexpected response (KeyError).")
        if next_cursor is None:
            break

@ -101,37 +80,19 @@ def user_list(team_id, response_url):
    return paginated_get('https://slack.com/api/users.list', params, response_url, combine_key='members')


-# parsing
+def channel_replies(timestamps, channel_id, response_url):
+    replies = []
+    for timestamp in timestamps:
+        params = {
+            'token': os.environ['SLACK_USER_TOKEN'],
+            'channel': channel_id,
+            'ts': timestamp,
+            'limit': 200
+        }
+        r = paginated_get('https://slack.com/api/conversations.replies', params, response_url, combine_key='messages')
+        replies.append(r)

-def user_list_to_names(user_dict):
-    return {x['id']: {'name': x['name'], 'real_name': x['real_name']} for x in user_dict}
-
-
-def channel_history_to_text(msgs_dict, users):
-    messages = [x for x in msgs_dict['messages'] if x['type'] == 'message']  # files are also messages
-    body = 'Team ID: %s\nTeam Domain: %s\nChannel ID: %s\nChannel Name: %s\n\n' % \
-           (msgs_dict['team_id'], msgs_dict['team_domain'], msgs_dict['channel_id'], msgs_dict['channel_name'])
-    body += '%s\n %s Messages\n%s\n\n' % ('=' * 16, len(messages), '=' * 16)
-    for msg in messages:
-        usr = users[msg['user']] if 'user' in msg else {'name': '', 'real_name': 'none'}
-        ts = datetime.fromtimestamp(round(float(msg['ts']))).strftime('%m-%d-%Y %H:%M:%S')
-        text = msg['text'] if msg['text'].strip() != "" else "[no message content]"
-        for u in users.keys():
-            # if u in text:
-            #     print(u)
-            text = str(text).replace('<@%s>' % u, '<@%s> (%s)' % (u, users[u]['name']))
-        entry = "Message at %s\nUser: %s (%s)\n%s" % (ts, usr['name'], usr['real_name'], text)
-        if 'reactions' in msg:
-            rxns = msg['reactions']
-            entry += "\nReactions: " + ', '.join('%s (%s)' % (x['name'], ', '.join(
-                users[u]['name'] for u in x['users'])) for x in rxns)
-        if 'files' in msg:
-            files = msg['files']
-            entry += "\nFiles:\n" + '\n'.join(' - %s, %s' % (f['name'], f['url_private_download']) for f in files)
-
-        body += entry.strip() + '\n\n%s\n\n' % ('=' * 16)
-
-    return body
+    return replies


 # Flask routes
@ -143,32 +104,37 @@ def export_channel():
    try:
        team_id = data['team_id']
        team_domain = data['team_domain']
-        channel_id = data['channel_id']
-        channel_name = data['channel_name']
+        ch_id = data['channel_id']
+        ch_name = data['channel_name']
        response_url = data['response_url']
        command_args = data['text']
    except KeyError:
-        return Response("Sorry! I got an unexpected response from Slack (KeyError)."), 200
+        return Response("Sorry! I got an unexpected response (KeyError)."), 200

    post_response(response_url, "Retrieving history for this channel...")
-    all_messages = {
-        'team_id': team_id,
-        'team_domain': team_domain,
-        'channel_id': channel_id,
-        'channel_name': channel_name,
-        'messages': channel_history(channel_id, response_url)
-    }
+    ch_hist = channel_history(ch_id, response_url)

-    filename = "%s-%s-%s.json" % (team_domain, channel_id, str(uuid4().hex)[:6])
-    filepath = os.path.join(app.root_path, 'exports', filename)
+    export_mode = str(command_args).lower()
+
+    exports_subdir = 'exports'
+    exports_dir = os.path.join(app.root_path, exports_subdir)
+    file_ext = '.txt' if export_mode == 'text' else '.json'
+    filename = "%s-ch_%s-%s%s" % (team_domain, ch_id, str(uuid4().hex)[:6], file_ext)
+    filepath = os.path.join(exports_dir, filename)
    loc = urljoin(request.url_root, 'download/%s' % filename)

+    if not os.path.isdir(exports_dir):
+        os.makedirs(exports_dir, exist_ok=True)
+
    with open(filepath, mode='w') as f:
-        if str(command_args).lower() == 'text':
-            users = user_list_to_names(user_list(team_id, response_url))
-            f.write(channel_history_to_text(all_messages, users))
+        if export_mode == 'text':
+            num_msgs = len(ch_hist)
+            sep = '=' * 24
+            header_str = 'Channel Name: %s\nChannel ID: %s\n%s Messages\n%s\n\n' % (ch_name, ch_id, num_msgs, sep)
+            data_ch = header_str + parse_channel_history(ch_hist, user_list(team_id, response_url))
+            f.write(data_ch)
        else:
-            json.dump(all_messages, f, indent=4)
+            json.dump(ch_hist, f, indent=4)

    post_response(response_url, "Done! This channel's history is available for download here (note that this link "
                                "is single-use): %s" % loc)
@ -176,8 +142,60 @@ def export_channel():
    return Response(), 200


+@app.route('/slack/export-replies', methods=['POST'])
+def export_replies():
+    data = request.form
+
+    try:
+        team_id = data['team_id']
+        team_domain = data['team_domain']
+        ch_id = data['channel_id']
+        ch_name = data['channel_name']
+        response_url = data['response_url']
+        command_args = data['text']
+    except KeyError:
+        return Response("Sorry! I got an unexpected response (KeyError)."), 200
+
+    post_response(response_url, "Retrieving reply threads for this channel...")
+    print(ch_id)
+    ch_hist = channel_history(ch_id, response_url)
+    print(ch_hist)
+    ch_replies = channel_replies([x['ts'] for x in ch_hist if 'reply_count' in x], ch_id, response_url)
+
+    export_mode = str(command_args).lower()
+
+    exports_subdir = 'exports'
+    exports_dir = os.path.join(app.root_path, exports_subdir)
+    file_ext = '.txt' if export_mode == 'text' else '.json'
+    filename = "%s-re_%s-%s%s" % (team_domain, ch_id, str(uuid4().hex)[:6], file_ext)
+    filepath = os.path.join(exports_dir, filename)
+    loc = urljoin(request.url_root, 'download/%s' % filename)
+
+    if export_mode == 'text':
+        header_str = 'Threads in: %s\n%s Messages' % (ch_name, len(ch_replies))
+        data_replies = parse_replies(ch_replies, user_list(team_id, response_url))
+        sep = '=' * 24
+        data_replies = '%s\n%s\n\n%s' % (header_str, sep, data_replies)
+    else:
+        data_replies = ch_replies
+
+    if not os.path.isdir(exports_dir):
+        os.makedirs(exports_dir, exist_ok=True)
+
+    with open(filepath, mode='w') as f:
+        if export_mode == 'text':
+            f.write(data_replies)
+        else:
+            json.dump(data_replies, f, indent=4)
+
+    post_response(response_url, "Done! This channel's reply threads are available for download here (note that this "
+                                "link is single-use): %s" % loc)
+
+    return Response(), 200
+
+
@app.route('/download/<filename>')
-def download(filename, mimetype='application/json'):
+def download(filename):
    path = os.path.join(app.root_path, 'exports', filename)

    def generate():
@ -185,10 +203,12 @@ def download(filename, mimetype='application/json'):
            yield from f
        os.remove(path)

+    mimetype = 'text/plain' if os.path.splitext(filename)[-1] == '.txt' else 'application/json'
+
    r = app.response_class(generate(), mimetype=mimetype)
    r.headers.set('Content-Disposition', 'attachment', filename=filename)
    return r


 if __name__ == '__main__':
-    app.run(debug=True)
+    app.run(debug=False)
--- a/exporter.py
+++ b/exporter.py
@ -0,0 +1,280 @@
+import os
+import sys
+import requests
+import json
+from datetime import datetime
+import argparse
+
+from dotenv import load_dotenv
+load_dotenv(os.path.join(os.path.dirname(__file__), '.env'))
+
+
+# pagination handling
+
+def get_at_cursor(url, params, cursor=None):
+    if cursor is not None:
+        params['cursor'] = cursor
+
+    r = requests.get(url, params=params)
+    if r.status_code != 200:
+        print("ERROR: %s %s" % (r.status_code, r.reason))
+        sys.exit(1)
+    d = r.json()
+
+    try:
+        if d['ok'] is False:
+            print("I encountered an error: %s" % d)
+            sys.exit(1)
+
+        next_cursor = None
+        if 'response_metadata' in d and 'next_cursor' in d['response_metadata']:
+            next_cursor = d['response_metadata']['next_cursor']
+            if str(next_cursor).strip() == '':
+                next_cursor = None
+
+        return next_cursor, d
+
+    except KeyError as e:
+        print("Something went wrong: %s." % e)
+        return None, []
+
+
+def paginated_get(url, params, combine_key=None):
+    next_cursor = None
+    result = []
+    while True:
+        next_cursor, data = get_at_cursor(url, params, cursor=next_cursor)
+        result.extend(data) if combine_key is None else result.extend(data[combine_key])
+        if next_cursor is None:
+            break
+
+    return result
+
+
+# GET requests
+
+def channel_list(team_id=None):
+    params = {
+        'token': os.environ['SLACK_USER_TOKEN'],
+        'team_id': team_id,
+        'types': 'public_channel,private_channel,mpim,im',
+        'limit': 200
+    }
+
+    return paginated_get('https://slack.com/api/conversations.list', params, combine_key='channels')
+
+
+def channel_history(channel_id):
+    params = {
+        'token': os.environ['SLACK_USER_TOKEN'],
+        'channel': channel_id,
+        'limit': 200
+    }
+
+    return paginated_get('https://slack.com/api/conversations.history', params, combine_key='messages')
+
+
+def user_list(team_id=None):
+    params = {
+        'token': os.environ['SLACK_USER_TOKEN'],
+        'limit': 200,
+        'team_id': team_id
+    }
+
+    return paginated_get('https://slack.com/api/users.list', params, combine_key='members')
+
+
+def channel_replies(timestamps, channel_id):
+    replies = []
+    for timestamp in timestamps:
+        params = {
+            'token': os.environ['SLACK_USER_TOKEN'],
+            'channel': channel_id,
+            'ts': timestamp,
+            'limit': 200
+        }
+        replies.append(paginated_get('https://slack.com/api/conversations.replies', params, combine_key='messages'))
+
+    return replies
+
+
+# parsing
+
+def parse_channel_list(channels, users):
+    result = ''
+    for channel in channels:
+        ch_id = channel['id']
+        ch_name = channel['name'] if 'name' in channel else ''
+        ch_private = 'private ' if 'is_private' in channel and channel['is_private'] else ''
+        if 'is_im' in channel and channel['is_im']:
+            ch_type = 'direct_message'
+        elif 'is_mpim' in channel and channel['is_mpim']:
+            ch_type = 'multiparty-direct_message'
+        elif 'group' in channel and channel['is_group']:
+            ch_type = 'group'
+        else:
+            ch_type = 'channel'
+        if 'creator' in channel:
+            ch_ownership = 'created by %s' % name_from_uid(channel['creator'], users)
+        elif 'user' in channel:
+            ch_ownership = 'with %s' % name_from_uid(channel['user'], users)
+        else:
+            ch_ownership = ''
+        ch_name = ' %s:' % ch_name if ch_name.strip() != '' else ch_name
+        result += '[%s]%s %s%s %s\n' % (ch_id, ch_name, ch_private, ch_type, ch_ownership)
+
+    return result
+
+
+def name_from_uid(user_id, users, real=False):
+    for user in users:
+        if user['id'] == user_id:
+            return user['real_name'] if real else user['name']
+    return '[null user]'
+
+
+def name_from_ch_id(channel_id, channels):
+    for channel in channels:
+        if channel['id'] == channel_id:
+            return (channel['user'], 'Direct Message') if 'user' in channel else (channel['name'], 'Channel')
+    return '[null channel]'
+
+
+def parse_user_list(users):
+    result = ''
+    for u in users:
+        entry = '[%s] %s (%s), %s' % (u['id'], u['name'], u['real_name'], u['tz'])
+        u_type = ''
+        if 'is_admin' in u and u['is_admin']:
+            u_type += 'admin|'
+        if 'is_owner' in u and u['is_owner']:
+            u_type += 'owner|'
+        if 'is_primary_owner' in u and u['is_primary_owner']:
+            u_type += 'primary_owner|'
+        if 'is_restricted' in u and u['is_restricted']:
+            u_type += 'restricted|'
+        if 'is_ultra_restricted' in u and u['is_ultra_restricted']:
+            u_type += 'ultra_restricted|'
+        if 'is_bot' in u and u['is_bot']:
+            u_type += 'bot|'
+        if 'is_app_user' in u and u['is_app_user']:
+            u_type += 'app_user|'
+        u_type = u_type[:-1] if u_type.endswith('|') else u_type
+        entry += ', ' if u_type.strip() != '' else ''
+        entry += '%s\n' % u_type
+        result += entry
+
+    return result
+
+
+def parse_channel_history(msgs, users, check_thread=False):
+    if 'messages' in msgs:
+        msgs = msgs['messages']
+
+    messages = [x for x in msgs if x['type'] == 'message']  # files are also messages
+    body = ''
+    for msg in messages:
+        if 'user' in msg:
+            usr = {'name': name_from_uid(msg['user'], users), 'real_name': name_from_uid(msg['user'], users, True)}
+        else:
+            usr = {'name': '', 'real_name': 'none'}
+
+        timestamp = datetime.fromtimestamp(round(float(msg['ts']))).strftime('%m-%d-%Y %H:%M:%S')
+        text = msg['text'] if msg['text'].strip() != "" else "[no message content]"
+        for u in [x['id'] for x in users]:
+            text = str(text).replace('<@%s>' % u, '<@%s> (%s)' % (u, name_from_uid(u, users)))
+
+        entry = "Message at %s\nUser: %s (%s)\n%s" % (timestamp, usr['name'], usr['real_name'], text)
+        if 'reactions' in msg:
+            rxns = msg['reactions']
+            entry += "\nReactions: " + ', '.join('%s (%s)' % (x['name'], ', '.join(
+                name_from_uid(u, users) for u in x['users'])) for x in rxns)
+        if 'files' in msg:
+            files = msg['files']
+            entry += "\nFiles:\n" + '\n'.join(' - %s, %s' % (f['name'], f['url_private_download']) for f in files)
+
+        entry += '\n\n%s\n\n' % ('*' * 24)
+
+        if check_thread and 'parent_user_id' in msg:
+            entry = '\n'.join('\t%s' % x for x in entry.split('\n'))
+
+        body += entry.rstrip('\t')  # get rid of any extra tabs between trailing newlines
+
+    return body
+
+
+def parse_replies(threads, users):
+    body = ''
+    for thread in threads:
+        body += parse_channel_history(thread, users, check_thread=True)
+        body += '\n'
+
+    return body
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-o', help="Directory in which to save output files (if left blank, prints to stdout)")
+    parser.add_argument('--lc', action='store_true', help="List all conversations in your workspace")
+    parser.add_argument('--lu', action='store_true', help="List all users in your workspace")
+    parser.add_argument('--json', action='store_true', help="Give the requested output in raw JSON format (no parsing)")
+    parser.add_argument('-c', action='store_true', help="Get history for all accessible conversations")
+    parser.add_argument('-r', action='store_true', help="Get reply threads for all accessible conversations")
+    a = parser.parse_args()
+
+    ts = str(datetime.strftime(datetime.now(), '%m-%d-%Y_%H%M%S'))
+
+    def save(data, filename):
+        if a.o is None:
+            print(data)
+        else:
+            out_dir_parent = os.path.abspath(os.path.expanduser(os.path.expandvars(a.o)))
+            out_dir = os.path.join(out_dir_parent, 'slack_export_%s' % ts)
+            filename = filename + '.json' if a.json else filename + '.txt'
+            os.makedirs(out_dir, exist_ok=True)
+            full_filepath = os.path.join(out_dir, filename)
+            print("Writing output to %s" % full_filepath)
+            with open(full_filepath, mode='w') as f:
+                if a.json:
+                    json.dump(data, f, indent=4)
+                else:
+                    f.write(data)
+
+    def save_replies(channel_hist, channel_id, users):
+        ch_replies = channel_replies([x['ts'] for x in channel_hist if 'reply_count' in x], channel_id)
+        if a.json:
+            data_replies = ch_replies
+        else:
+            ch_name, ch_type = name_from_ch_id(ch_id, ch_list)
+            header_str = 'Threads in %s: %s\n%s Messages' % (ch_type, ch_name, len(ch_replies))
+            data_replies = parse_replies(ch_replies, users)
+            sep = '=' * 24
+            data_replies = '%s\n%s\n\n%s' % (header_str, sep, data_replies)
+        save(data_replies, 'channel-replies_%s' % channel_id)
+
+    if a.lc:
+        data = channel_list() if a.json else parse_channel_list(channel_list(), user_list())
+        save(data, 'channel_list')
+    if a.lu:
+        data = user_list() if a.json else parse_user_list(user_list())
+        save(data, 'user_list')
+    if a.c:
+        ch_list = channel_list()
+        users = user_list()
+        for ch_id in [x['id'] for x in ch_list]:
+            ch_hist = channel_history(ch_id)
+            if a.json:
+                data_ch = ch_hist
+            else:
+                data_ch = parse_channel_history(ch_hist, users)
+                ch_name, ch_type = name_from_ch_id(ch_id, ch_list)
+                header_str = '%s Name: %s' % (ch_type, ch_name)
+                sep = '=' * 24
+                data_ch = 'Channel ID: %s\n%s\n%s Messages\n%s\n\n' % (ch_id, header_str, len(ch_hist), sep) + data_ch
+            save(data_ch, 'channel_%s' % ch_id)
+            if a.r:
+                save_replies(ch_hist, ch_id, users)
+    # elif, since we want to avoid asking for channel_history twice
+    elif a.r:
+        for ch_id in [x['id'] for x in channel_list()]:
+            save_replies(channel_history(ch_id), ch_id, user_list())
--- a/requirements.txt
+++ b/requirements.txt
@ -1,4 +1,3 @@
-requests==2.24.0
-Flask==1.1.2
-python-dotenv==0.15.0
-slackclient==2.9.3
+Flask~=1.1.2
+requests~=2.24.0
+python-dotenv~=0.15.0