Tika 解析未知类型文件的 Base64 编码字符串,生成原文件

原创 tikabase64

场景设置

假设有一个文件经过 base64 编码后得到的字符串。当然,也可以是其他编码或加密方式,只要它是可逆的、能够还原出原始字节就行(严格来说 base64 只是一种编码,并不是加密)。

现在的需求是把这个编码后的字符串还原成原始文件。也就是要先识别出文件格式,再把内容输出到生成的文件中。不要管为什么要这样做,总有原因的。

实现起来也比较简单:先把 base64 字符串解码为 byte 数组,再使用 tika 检测出文件类型,然后根据文件类型得到后缀名、拼出文件名,最后把 byte 数组写入该文件即可。

案例代码

依赖包:

  1. junit.jar
  2. commons-codec.jar
  3. commons-io-1.3.1.jar
  4. tika-app-1.12.jar

TikaMimiTest.java

import java.io.File;
import java.io.IOException;

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.io.FileUtils;
import org.apache.tika.Tika;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
import org.junit.Before;
import org.junit.Test;

/**
 * Demonstrates restoring a file of unknown type from its Base64 text with Apache Tika:
 * decode the text to bytes, detect the MIME type from the bytes, look up the preferred
 * file extension for that type, and write the bytes to a file carrying that extension.
 *
 * @author liuqianfei
 */
public class TikaMimiTest
{
    /** Base64 codec used to turn the encoded text back into raw bytes. */
    private Base64 decoder;

    /** Tika facade used to detect the MIME type of the decoded bytes. */
    private Tika tika;

    /** Tika's default MIME type registry, used to map a MIME name to its extension. */
    private MimeTypes fullTypes;

    /**
     * Creates fresh collaborators before each test.
     */
    @Before
    public void init()
    {
        decoder = new Base64();
        tika = new Tika();
        fullTypes = MimeTypes.getDefaultMimeTypes();
    }

    /**
     * Decodes the embedded Base64 sample, prints the detected extension and writes the
     * decoded bytes to {@code tika<extension>} in the JVM temporary directory.
     *
     * @throws MimeTypeException if the detected MIME name cannot be resolved in the registry
     * @throws IOException if the output file cannot be written
     */
    @Test
    public void test() throws MimeTypeException, IOException
    {
        String base64 =
                  "/9j/4AAQSkZJRgABAQEAAQABAAD/2wBDAAMCAgMCAgMDAwMEAwMEBQgFBQQEBQoHBwYIDAoMDAsK"
                + "CwsNDhIQDQ4RDgsLEBYQERMUFRUVDA8XGBYUGBIUFRT/2wBDAQMEBAUEBQkFBQkUDQsNFBQUFBQU"
                + "FBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBT/wAARCABNAE0DASIA"
                + "AhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQA"
                + "AAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3"
                + "ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWm"
                + "p6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEA"
                + "AwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSEx"
                + "BhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElK"
                + "U1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3"
                + "uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD9SoWb"
                + "y/vfw18f/tOzy/Af4/eAP2iEje58Lrbt4M8WtHG0hs7CWZmhuv4tqxzfewu7+H+KvsJfm/h2/dr4"
                + "6+KnxJ8bftH+JPGPwZ+FJ0+LR9Ojk0zxp4w1GOO4gtVmWVZNPtbf7s07L8rfwx7drNubdQBvftt2"
                + "GoeKf2XIvF/gjydcvfDGqab4v01YR5y3iW8yybl2/eXy33f7Sr/tVnx/8FDPg5qEEQ0C08U+J5pY"
                + "47iS20bQLyZhIzbfLztVfMX+Jfu/w7t3y17n8GPhZpvwV+FHh7wNpVzealY6JbLBDNflWmk2ybtx"
                + "/h+993b8q13FpBHaiUQRJAv3tqrtX+L5vu0Afnz+zF+0JqHwFtfiRo9/8D/jG3h3UPFl9rWkXMfg"
                + "6aRre1uNm6O43Mu1lZS38XytW3+w18b/AIbN4h+Ofh3U/EtloGs654/1TWLPTNZk+xXElrMsPlti"
                + "T5fM3K26P7y7fmWvvCOQLKwD/wB35f8AgTV538TfgP8ADz43aW9h418LaZ4hUqy+bdw/vo/vL+7m"
                + "/wBZH/D8yt/Cv+zQB6LC32iPzEb7yqytU0e1pJMV8c3n7OvxQ/ZrDaz8BvFd14p8KWfzzfDDxfN9"
                + "qhlVdytHY3jHzLdtqrtjb5d33mb7lewfAb9oHw5+0V4f1GfRmu9L1TT5BZ61oF8Gg1HR7r+KOZfl"
                + "ZfmVtrL8rbWX5WVloA9oj+81S1XibDvzu/8A2mqxQB86ftk/E7xF4A+Gml+HvBTH/hOfG+rW/hjR"
                + "rvPzWsk+7zLrhf8AlnGrNu/h+Vq6b4LfCvwv+y/8HLfQbe8S2t9PhkvtV1y9kVXu5vvXF1O7M23+"
                + "8zM21V/2Vrx/xtc23jj/AIKL/Dbw/KQIfBXh3UtcihU7VMk6wwqzf7vmNX1FrXhzSvFWh3uk6xp8"
                + "Gp6beQeTNaXcKyRzRsrLtZW+VvloA4OP9qD4JMzMPjP4Gkb/ALGix+X/AMiVzt/+1Z8PJviH4A8G"
                + "+FPFGl+PNa8R6hJZmLQtYtrprWGO3lmaeby2+78q/L/F/wABrw747fDXwz8PPjz8HfD3hX9mvwP4"
                + "j8OeKrx7XX9QTwkk32KFZoV8xVjXbD5as0jNIrKy/L/CzV9K+Bv2bPhZ4A8Rwa/4Z8A+H/D2tQK0"
                + "cF9p9hHDIqsu19rR/wB5d3zf7TUAepQrtaT5tzf/AGTV8k65+2ZbfCP9ov4n+FfiPa6hZaDaLpc3"
                + "hifTNKu7z7SklszXG5olZf8AWfd+Vf4v7u6vrOINmQ7dq/w/+PfNUYDGZzu2szY+Vf4fm/z/APtU"
                + "AfPHg/8Abi/Z98dXNrpmlfEfTILu5eNY7a7e609mZpNvl/vI4/m3fw/+y1zv7U/wx1DwHfyftAfD"
                + "SE2vjnw1btNqltAq+T4i0lfmuLWb/aVV3RyKu75dv3mjZdf4O+PPEfxi+M/xh0Xx38MbHSNN8Iaj"
                + "Db+HtQvtIkZr6PzLhfOWSRdsm5o45FaPb8rL8v8AFX0XJpdtdabPp8tvHLYyqyNbtH8rKzNuVl/u"
                + "0AYXw88a6f8AEXwDoHizRZJJ9N1m1ivrc45aOSPd83y/7Xzf7X/fNdeZHXpHXyZ/wT9ebw18NPiP"
                + "8M5HeNvh/wCMNV0OzEsjeY1nu86CT5vuq3mNt/2Vr6rupMFPmPT+GgD5CvtPl8P/APBUPRtRmkjk"
                + "t/EHga6tLXb8zK0MkLN/46tfYUPzNJ8o/wAs1fJP7Y11Z/DT4sfAX40XDyxaT4f16bRdYu43Xybe"
                + "xv42ha4m/wCmccm35v8Aa/vMtfVemyR3FsJ4ZxNEQrIy/dYf3t38W7ru96AL33m3H7y0yPd5sm7+"
                + "6v8A7NTG3bm2/N/D/wChV8q+Kv24NT8N/wBsTR/s9/Fy5ttMeQXN7J4eEcJjRm/eL+8bcvH3v4V+"
                + "b7tAH1UsrFW+Zdy/xU3zd25i25f/ANqvkHwj+2t47+Jfg+18V+EPgF4i17w3fFmsr6PXNOXztrMp"
                + "Xbu3L+8Ux/3ty/3vlrO8K/tsfFvx7Z6nd+GP2Z9c1D7FeT6bOZvElpB5NzC2ySGTdDuVlkbbt/u/"
                + "NWSM48x9hw7g0j7uSqq0m75vvNt/9Cp8Uq7W+b/gW7/aavkT4A/tv6p8T9V12Pxr4a0P4cWVlNcW"
                + "DRz6+txfreQyKrRtD5K7VXc3zN/Ev8Ve/wAvxU8HWngPXfGn/CSW8+habDLdXd2su5Yo49275ev8"
                + "LbV/ib/erGM6fN7PmNJYXFUaXtpU5cv832TyX9nHFt+2v+1jZR8W8dx4ZuET+7JNprtI3/Atq/8A"
                + "fNfUMifNXy/+w9pGr+IfD/jf41eKbVdL1j4najFq0cRZf3WkwwrHYKzK33vL3Nu+X733a+qNobrX"
                + "WBynjLwdpnxF8H6t4Y1y0a+0rVrOSyuYJH+Zo5FZW+b+H5W+8vzfN/s18oeBPinq/wCxFFpvw9+L"
                + "Zur3wIHWz8LeO1TzLeG3+ZVs9QVV/czRrtVZP9XJH83/ACzavtiFV2/dX7tYXibw1pPjDQ7zS9a0"
                + "611fTbhfLuLS9t/OjkXd91lagB2j31rrmmQ6npeoxX1peRrJBc20izQyL/DtZfvLz96tDD7+Cf8A"
                + "af8Au/e/z/wKvhD4bfCvTfhp/wAFAL7wp8Hpr3QvA9jpP9qeONC+0NJphuriNvstvDG3+rmb5Zvl"
                + "bbtXau1VZa+77Y7Eldsf739771TH3TNHyT8E9PX4Q/tyfFj4eaTcOvhnxPotr44t9NjZlh0+7Nx9"
                + "nuFjXdt/fM3mNt/h2r/DXZ/F2+t/2dfCPxc+I9nq11Jqetx2dz9huJVNvHexwi1ieNfvZkWOFW/h"
                + "byV2r96uK+Gcw8Vf8FFPjVq8+YLnwl4d0nw/aWm5Q8kNztvJJsf3Vb/e+8v95VrJ/bM8Ux+Nfih4"
                + "O+FUT/uFP9u6rFj928a+YsKsv8XzJ/46tcOMqyo4eUj3ciwP9oZlSoy+GPvS/wAJ846loEPg/wAH"
                + "32u6tqMt/wCIdV3Xd3cs25lkZmbb/e/2v+BV518EfDNr8TPjl4T8JarNI/hPUtQhm1PRxIy2995c"
                + "m5Y5lVl8xd38P+1u2/Ltrqvjf4g0efVLi1ezaP5vvQSbfl3SM3y/3mrrf+CfXw3h8f8A7Q2oeJ54"
                + "Wl03wtarcQ7t22O6m3LGv/AV3f8AfP8AwGvhcsjKWM5on9GcQzp4fhetHFR+L4f/AG0/UmzsIIY2"
                + "hjjVbdV8sQL/AKsLub+GpppTu4O2pRhSWP8An71VC/8AtIf+AtX6Ufy0XYvuD/dquzrCJpJCF2Dc"
                + "zf7O5qu7a8s/aF1q68Nfs7/FTWbF/Kv9N8O6xd20n9yWO0mdW/MUAeIfsLWtt4gs/jP8Wi6TSeMf"
                + "GOpNa3sa/e0+2k8u32t/Eq/vPmr0D4VfH668W/HX4n/CvxNYQaF4g8Ozx3OkxK7f8TLSZFXbdDcu"
                + "1mVm+bb/ABSbf4a+e77xPq/wS/4J3/AbWPCGozaVcC60KSaJFTy7pLol54ZV2/NG5lYnGGP96vQf"
                + "27vBtndfAa9+LdnLPo3xF8EWcGqaJ4g01/KubdpHVZIS38cLBmBR8/eb+8alGaO8+L/7K3gz42a9"
                + "beJ21jxF4N8aWsX2WHxP4V1FtPvzBlv3LNtZWT/e+b5fvLXmHgz9i/RLnxp4/wBQ1zx7428S6lbr"
                + "b2Nrr+o6tHPfwx+X5k0at5W1V8xm+Vf/AEKvVv2efihrXxQ/Zs8EeN9XFous6tYC6uRBAFi3iURf"
                + "Kvbglv8Ae/2flrrvCQEPwpa/Cr9ruY2nnlxzI3mN1rH2cakeWRVPEVsFPnhLll/dPz1+Ov7JniHw"
                + "pNqGp6Z4ktfEFvDukkiuflmWP98zN97+8u1f93d/FX1l+wH8JLz4V/AG2vNXgWDWvEUzatcq3zMq"
                + "Mq+Wrf8AAV3f8Crlfiz/AKdp9zbyKpjubjTrJ+P+Wc8iB/8Ax1yn0r660qwgsdGS1hjCW0CLEkXY"
                + "KOlcWGwWHoy9pTie3ieKM2zrL1Qx1TmhHr1NFXPmNhD/AL3/AH1/n/gVRSI+7laTThuWRj971/4E"
                + "1Xhtbqor1Txj/9k=";

        // Let the codec convert the text itself rather than going through
        // String.getBytes(), which depends on the platform default charset.
        byte[] bytes = decoder.decode(base64);
        String fileMime = tika.detect(bytes);
        MimeType mimeType = fullTypes.forName(fileMime);

        // getExtension() gives the preferred extension for the type, while
        // getExtensions() gives every extension registered for it.
        String fileType = mimeType.getExtension();
        System.out.println("====>" + fileType);

        // Write to the JVM temp directory instead of a fixed drive letter so the
        // example runs on any operating system.
        File target = new File(System.getProperty("java.io.tmpdir"), "tika" + fileType);
        FileUtils.writeByteArrayToFile(target, bytes);
    }

}

控制台输出内容为 ====>.jpg,并且会在磁盘上生成正确的图片文件。

W4GGkb9.jpg

代码分析

import org.apache.tika.mime.MimeTypes;
MimeTypes fullTypes =  = MimeTypes.getDefaultMimeTypes();

上面代码获取 tika 能够支持的所有文件类型信息。这些类型信息被内置在一个 XML 文件中,通过 MimeTypes.getDefaultMimeTypes() 方法的源码可以看出:

/**
 * Get the default MimeTypes. This includes all the built-in
 * media types, and any custom override ones present.
 * <p>
 * The registry is built on first use and then cached: one shared instance
 * when {@code classLoader} is {@code null}, otherwise one instance per
 * class loader.
 *
 * @param classLoader to use, if not the default ({@code null} selects the
 *        shared default registry)
 * @return MimeTypes default type registry
 */
public static synchronized MimeTypes getDefaultMimeTypes(ClassLoader classLoader) {
    // Start from the shared registry used when no class loader is given.
    MimeTypes types = DEFAULT_TYPES;
    if (classLoader != null) {
        // A class loader was supplied: use the registry cached for it instead.
        types = CLASSLOADER_SPECIFIC_DEFAULT_TYPES.get(classLoader);
    }

    // Nothing cached yet for this case: build the registry now.
    if (types == null) {
        try {
            // Built from the two XML names passed here: the built-in
            // definitions plus the custom overrides.
            types = MimeTypesFactory.create(
                  "tika-mimetypes.xml", "custom-mimetypes.xml", classLoader);
        } catch (MimeTypeException e) {
            // Rethrown unchecked, keeping the original exception as the cause.
            throw new RuntimeException(
                    "Unable to parse the default media type registry", e);
        } catch (IOException e) {
            // Rethrown unchecked, keeping the original exception as the cause.
            throw new RuntimeException(
                    "Unable to read the default media type registry", e);
        }

        // Remember the result so later calls reuse it: in the shared slot,
        // or under the class loader it was built for.
        if (classLoader == null) {
            DEFAULT_TYPES = types;
        } else {
            CLASSLOADER_SPECIFIC_DEFAULT_TYPES.put(classLoader, types);
        }
    }
    return types;
}

文件类型配置信息,在 org\apache\tika\mime\tika-mimetypes.xml 文件中。

tika.detect(bytes) 获取到的字符串是形如 image/jpeg、application/vnd.ms-works 的 MIME 类型。

使用 fullTypes.forName(fileMime) 获取到 tika-mimetypes.xml 配置文件中对应 <mime-type> 节点的配置信息。比如 JPEG 类型的图片:

<mime-type type="image/jpeg">
    <acronym>JPEG</acronym>
    <_comment>Joint Photographic Experts Group</_comment>
    <tika:link>http://en.wikipedia.org/wiki/Jpeg</tika:link>
    <tika:uti>public.jpeg</tika:uti>
    <magic priority="50">
        <!-- FFD8 is the SOI (Start Of Image) marker.              -->
        <!-- It is followed by another marker that starts with FF. -->
        <match value="0xffd8ff" type="string" offset="0"/>
    </magic>
    <glob pattern="*.jpg"/>
    <glob pattern="*.jpeg"/>
    <glob pattern="*.jpe"/>
    <glob pattern="*.jif"/>
    <glob pattern="*.jfif"/>
    <glob pattern="*.jfi"/>
</mime-type>

getExtension() 获取配置中第一个后缀名称,而 getExtensions() 方法获取该mime下对应的所有格式。

/**
 * Returns the preferred file extension of this type, or an empty string
 * if no extensions are known. Use the {@link #getExtensions()} method to
 * get the full list of known extensions of this type.
 *
 * @since Apache Tika 0.9
 * @return preferred file extension or empty string
 */
public String getExtension() {
    if (extensions == null) {
        // No extension list is present for this type: report "no extension".
        return "";
    } else {
        // The first entry of the list is the one returned as the preferred extension.
        return extensions.get(0);
    }
}

把文件写入磁盘就简单多了,这属于 Java 基本功了。

如果觉得这对你有用,请随意赞赏,给与作者支持
评论 0
最新评论