BaseCTF new-base WP | C3ngH's B10g

最近遇到一道很有意思的题，是 BaseCTF 2024 的 new-base，考察的是 base 编码/解码的原理。之前一直没有认真地学过 base 的原理，所以写一篇博客记录一下解题过程和自己用的脚本。

题面是一个仅由 BaseCTF! 这八个字符重复七百多万次组成的文本。一开始以为是词频统计，但是发现只有这八个字符，于是联想到 base64 换表；不过这里只有八个字符（8 = 2^3，即每个字符对应 3 个比特），结合文件名 new-base，猜测是 base8 编码。在恶补完 base64 的原理后，手搓了一个仅限于本题使用的脚本，后来抽空改成了任意 base 换表都通用的脚本，放在这里记录一下。

import math
import binascii

def baseDecode(inputString, filePath, basePath):
    """Decode a text file written with a custom base alphabet.

    Every character of the file is replaced by its index in ``inputString``,
    rendered as a fixed-width binary number; the concatenated bits are then
    regrouped into 8-bit bytes. The fixed-width mapping is a true base
    decoding only when the alphabet size is a power of two (8, 16, 32, 64...).

    Args:
        inputString: The alphabet (base table). A character's position in
            this string is its digit value.
        filePath: Path of the UTF-8 text file holding the encoded text.
        basePath: Not used by this function; kept so existing callers
            keep working.

    Returns:
        A ``(encryptedBinary, byteArray)`` tuple: the complete bit string
        (including any trailing bits that do not fill a byte) and the
        decoded data as a ``bytearray``.

    Raises:
        KeyError: If the file contains a character missing from
            ``inputString``.
        ValueError: If ``inputString`` is empty.
    """
    # Number of bits carried by one symbol, e.g. 3 for an 8-character table
    bitLength = math.ceil(math.log2(len(inputString)))

    # Render each symbol's bit pattern once, instead of once per input
    # character (the challenge file holds millions of characters)
    bitsBySymbol = {
        char: bin(index)[2:].zfill(bitLength)
        for index, char in enumerate(inputString)
    }

    # Read the encoded text, dropping surrounding whitespace/newlines
    with open(filePath, "r", encoding="utf-8") as file:
        encryptedText = file.read().strip()

    # Translate the text into one long bit string
    encryptedBinary = "".join([bitsBySymbol[char] for char in encryptedText])

    # Only complete groups of 8 bits are data. Leftover bits at the end are
    # padding and must not be turned into an extra byte.
    usableBits = len(encryptedBinary) - len(encryptedBinary) % 8
    byteArray = bytearray(
        int(encryptedBinary[i:i + 8], 2) for i in range(0, usableBits, 8)
    )

    return encryptedBinary, byteArray

def guessFileType(byteArray):
    """Guess a file type by comparing the leading bytes to known signatures.

    Args:
        byteArray: The decoded data (``bytes`` or ``bytearray``).

    Returns:
        The label of the first signature in the table that the data starts
        with (e.g. ``"zip"``, ``"doc/xls"``), or ``None`` if none matches.
    """
    # Known signatures as lowercase hex, mapped to a type label.
    # NOTE(review): a few entries (57415645 "WAVE", 41564920 "AVI ",
    # 6d6f6f76 "moov") look like markers that normally sit a few bytes into
    # a file rather than at offset 0, so a prefix match may never hit them
    # -- confirm before relying on those types.
    fileTypes = {
        "504b0304": "zip",
        "52617221": "rar",
        "377abcaf": "7z",
        "03f30d0a": "pyc",
        "ffd8ffe0": "jpg",
        "ffd8ffe1": "jpg",
        "ffd8ffe2": "jpg",
        "89504e47": "png",
        "47494638": "gif",
        "1f8b0800": "gz",
        "38425053": "psd",
        "49492a00": "tif",
        "424d": "bmp",
        "41433130": "dwg",
        "7b5c7274": "rtf",
        "3c3f786d6c": "xml",
        "68746d6c3e": "html",
        "44656c69766572792d646174653a": "eml",
        "cfad12fec5fd746f": "dbx",
        "2142444e": "pst",
        "d0cf11e0": "doc/xls",
        "5374616e64617264204a": "mdb",
        "ff575043": "wpd",
        "252150532d41646f6265": "eps/ps",
        "255044462d312e": "pdf",
        "ac9ebd8f": "qdf",
        "e3828596": "pwl",
        "57415645": "wav",
        "41564920": "avi",
        "2e7261fd": "ram",
        "2e524d46": "rm",
        "000001ba": "mpg",
        "6d6f6f76": "mov",
        "3026b2758e66cf11": "asf",
        "4d546864": "mid",
        "00000018667479704d3441": "m4a",
    }

    # Hex-encode as many leading bytes as the longest signature needs
    # (two hex digits per byte). A fixed 8-byte window would make every
    # longer signature impossible to match.
    maxMagicLen = max(len(key) for key in fileTypes)
    headerBytes = byteArray[:(maxMagicLen + 1) // 2]
    magicNumber = binascii.hexlify(headerBytes).decode('utf-8')

    # The first signature (in table order) that prefixes the header wins
    for key, value in fileTypes.items():
        if magicNumber.startswith(key):
            return value

    return None

def _writeBinaryText(basePath, encryptedBinary):
    """Write the bit string to ``{basePath}binary.txt`` and return that path."""
    binaryOutputPath = f"{basePath}binary.txt"
    with open(binaryOutputPath, "w", encoding="utf-8") as file:
        file.write(encryptedBinary)
    return binaryOutputPath

def saveFile(fileExtension, byteArray, encryptedBinary, basePath):
    """Save the decoded data, and optionally its bit string, under ``basePath``.

    When no file type was recognised, only the bit string is written, to
    ``binary.txt``. Otherwise the bytes are written to ``decrypted_file.<ext>``
    and the user is asked on stdin whether the bit string should be saved too.

    Args:
        fileExtension: Type label from the signature lookup, or ``None``.
            A label listing alternatives (``"doc/xls"``) is saved under the
            first alternative.
        byteArray: The decoded bytes.
        encryptedBinary: The decoded data as a string of ``0``/``1``.
        basePath: Prefix prepended verbatim to every output file name, so a
            directory must end with a path separator.
    """
    if fileExtension is None:
        # Unknown type: fall back to dumping the bit string as text
        binaryOutputPath = _writeBinaryText(basePath, encryptedBinary)
        print(f"无法识别文件类型，已自动保存二进制字符串为文本文件: {binaryOutputPath}")
        return

    # Labels such as "doc/xls" or "eps/ps" contain "/", which would make the
    # file name point into a non-existent directory; use the first alternative
    safeExtension = fileExtension.split("/")[0]
    outputFilePath = f"{basePath}decrypted_file.{safeExtension}"
    with open(outputFilePath, "wb") as file:
        file.write(byteArray)
    print(f"识别到类型为: {fileExtension}")
    print(f"文件已自动保存为: {outputFilePath}")

    # Ask whether the bit string should be saved as well
    saveBinary = input("是否要保存二进制字符串到文本文件？(yes/no): ").strip().lower()
    if saveBinary in ("yes", "y"):
        binaryOutputPath = _writeBinaryText(basePath, encryptedBinary)
        print(f"二进制字符串已保存为: {binaryOutputPath}")

def main():
    """Decode the challenge file, guess its type and save the result."""
    # Folder holding the challenge file; outputs are written here as well
    workDir = "C:/Users/67300/Desktop/Misc/base8/"
    # Base table: any length may be supplied, the per-symbol bit width is
    # derived from it by the decoder
    alphabet = "BaseCTF!"
    # Encoded text file to read
    cipherPath = f"{workDir}new-base.txt"

    bitString, decodedBytes = baseDecode(alphabet, cipherPath, workDir)
    detectedType = guessFileType(decodedBytes)
    saveFile(detectedType, decodedBytes, bitString, workDir)

# Run the decoder only when this file is executed as a script, not on import
if __name__ == "__main__":
    main()