一、背景

业务开发中可能需要将html的标签全部去掉，本文将多种方法综合在这里，供大家参考。

二、方法

2.1 纯正则方法

import java.util.regex.Matcher; 
import java.util.regex.Pattern; 
 
/**
 * Strips HTML markup from a string using regular expressions only.
 */
public class HTMLSpirit {
    /**
     * Removes script elements, style elements and every remaining tag from the given HTML.
     *
     * @param htmlStr the HTML source to clean; must not be {@code null}
     * @return the text left after removing the markup, with leading and trailing whitespace trimmed
     */
    public static String delHTMLTag(String htmlStr) {
        // The patterns must contain literal angle brackets. The HTML-escaped forms
        // ("&lt;" / "&gt;") only match text that is already escaped and leave real tags untouched.
        String regEx_script = "<script[^>]*?>[\\s\\S]*?<\\/script>"; // a script element including its body
        String regEx_style = "<style[^>]*?>[\\s\\S]*?<\\/style>"; // a style element including its body
        String regEx_html = "<[^>]+>"; // any single HTML tag

        // Script and style elements are removed first so their bodies do not survive as text.
        Pattern p_script = Pattern.compile(regEx_script, Pattern.CASE_INSENSITIVE);
        Matcher m_script = p_script.matcher(htmlStr);
        htmlStr = m_script.replaceAll("");

        Pattern p_style = Pattern.compile(regEx_style, Pattern.CASE_INSENSITIVE);
        Matcher m_style = p_style.matcher(htmlStr);
        htmlStr = m_style.replaceAll("");

        // Finally drop every remaining tag, keeping only the text between them.
        Pattern p_html = Pattern.compile(regEx_html, Pattern.CASE_INSENSITIVE);
        Matcher m_html = p_html.matcher(htmlStr);
        htmlStr = m_html.replaceAll("");

        return htmlStr.trim();
    }
}
java

import java.util.regex.Matcher; 
import java.util.regex.Pattern; 
 
/**
 * Strips HTML markup from a string using regular expressions only.
 */
public class HTMLSpirit {
    // Patterns are compiled once; Pattern instances are immutable and safe to share.
    // The end tags accept optional whitespace before '>' (e.g. "</script >"), which is
    // legal HTML; without it the element body would be left behind as plain text.

    /** Matches a whole script element, including its body. */
    private static final Pattern SCRIPT_ELEMENT =
            Pattern.compile("<script[^>]*?>[\\s\\S]*?</script\\s*>", Pattern.CASE_INSENSITIVE);

    /** Matches a whole style element, including its body. */
    private static final Pattern STYLE_ELEMENT =
            Pattern.compile("<style[^>]*?>[\\s\\S]*?</style\\s*>", Pattern.CASE_INSENSITIVE);

    /** Matches any single HTML tag. */
    private static final Pattern ANY_TAG =
            Pattern.compile("<[^>]+>", Pattern.CASE_INSENSITIVE);

    /**
     * Removes script elements, style elements and every remaining tag from the given HTML.
     *
     * @param htmlStr the HTML source to clean; must not be {@code null}
     * @return the text left after removing the markup, with leading and trailing whitespace trimmed
     */
    public static String delHTMLTag(String htmlStr) {
        // Script and style elements go first so their bodies do not survive as text.
        String withoutScripts = SCRIPT_ELEMENT.matcher(htmlStr).replaceAll("");
        String withoutStyles = STYLE_ELEMENT.matcher(withoutScripts).replaceAll("");
        String textOnly = ANY_TAG.matcher(withoutStyles).replaceAll("");
        return textOnly.trim();
    }
}

2.2 使用 javax.swing.text.html.HTMLEditorKit

import java.io.IOException;
import java.io.FileReader;
import java.io.Reader;
import java.util.List;
import java.util.ArrayList;
 
import javax.swing.text.html.parser.ParserDelegator;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.MutableAttributeSet;
 
/**
 * Extracts the text content of an HTML document using the Swing HTML parser.
 */
public class HTMLUtils {
  private HTMLUtils() {}

  /**
   * Parses the HTML supplied by {@code reader} and collects every text run reported by the parser.
   *
   * @param reader source of the HTML document; it is not closed by this method
   * @return the text runs in the order the parser reported them
   * @throws IOException if reading from {@code reader} fails
   */
  public static List<String> extractText(Reader reader) throws IOException {
    final ArrayList<String> list = new ArrayList<String>();

    ParserDelegator parserDelegator = new ParserDelegator();
    ParserCallback parserCallback = new ParserCallback() {
      @Override
      public void handleText(final char[] data, final int pos) {
        list.add(new String(data));
      }
      // Tags, comments and parse errors are deliberately ignored: only text is wanted.
      @Override
      public void handleStartTag(Tag tag, MutableAttributeSet attribute, int pos) { }
      @Override
      public void handleEndTag(Tag t, final int pos) {  }
      @Override
      public void handleSimpleTag(Tag t, MutableAttributeSet a, final int pos) { }
      @Override
      public void handleComment(final char[] data, final int pos) { }
      @Override
      public void handleError(final java.lang.String errMsg, final int pos) { }
    };
    // The third argument is the parser's ignoreCharSet flag.
    parserDelegator.parse(reader, parserCallback, true);
    return list;
  }

  /**
   * Demo entry point: prints the text of {@code java-new.html}, one text run per line.
   */
  public final static void main(String[] args) throws Exception{
    FileReader reader = new FileReader("java-new.html");
    try {
      List<String> lines = HTMLUtils.extractText(reader);
      for (String line : lines) {
        System.out.println(line);
      }
    } finally {
      // Release the file handle even when parsing fails.
      reader.close();
    }
  }
}
java

import java.io.IOException;
import java.io.FileReader;
import java.io.Reader;
import java.util.List;
import java.util.ArrayList;
 
import javax.swing.text.html.parser.ParserDelegator;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.MutableAttributeSet;
 
/**
 * Extracts the text content of an HTML document using the Swing HTML parser.
 */
public class HTMLUtils {
  private HTMLUtils() {}

  /**
   * Parses the HTML supplied by {@code reader} and collects every text run reported by the parser.
   *
   * @param reader source of the HTML document; it is not closed by this method
   * @return the text runs in the order the parser reported them
   * @throws IOException if reading from {@code reader} fails
   */
  public static List<String> extractText(Reader reader) throws IOException {
    final List<String> textRuns = new ArrayList<String>();

    // Only text is of interest. ParserCallback already ignores tags, comments and
    // errors by default, so handleText is the only callback that needs overriding.
    ParserCallback textCollector = new ParserCallback() {
      @Override
      public void handleText(final char[] data, final int pos) {
        textRuns.add(new String(data));
      }
    };

    // The third argument is the parser's ignoreCharSet flag.
    new ParserDelegator().parse(reader, textCollector, true);
    return textRuns;
  }

  /**
   * Demo entry point: prints the text of {@code java-new.html}, one text run per line.
   */
  public final static void main(String[] args) throws Exception{
    FileReader reader = new FileReader("java-new.html");
    try {
      for (String line : HTMLUtils.extractText(reader)) {
        System.out.println(line);
      }
    } finally {
      // Release the file handle even when parsing fails.
      reader.close();
    }
  }
}

2.3 使用Jsoup框架

import java.io.IOException;
import java.io.FileReader;
import java.io.Reader;
import java.io.BufferedReader;
import org.jsoup.Jsoup;
 
public class HTMLUtils {
  private HTMLUtils() {}
 
  public static String extractText(Reader reader) throws IOException {
    StringBuilder sb = new StringBuilder();
    BufferedReader br = new BufferedReader(reader);
    String line;
    while ( (line=br.readLine()) != null) {
      sb.append(line);
    }
    String textOnly = Jsoup.parse(sb.toString()).text();
    return textOnly;
  }
 
  public final static void main(String[] args) throws Exception{
    FileReader reader = new FileReader
          ("C:/RealHowTo/topics/java-language.html");
    System.out.println(HTMLUtils.extractText(reader));
  }
java

2.4 使用Apache Tika

import java.io.FileInputStream;
import java.io.InputStream;
 
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
 
public class ParseHTMLWithTika {
  public static void main(String args[]) throws Exception {
 
    InputStream is = null;
    try {
 
         is = new FileInputStream("C:/Temp/java-x.html");
        WriteOutContentHandler contenthandler = new WriteOutContentHandler(100000000);
         Metadata metadata = new Metadata();
         Parser parser = new AutoDetectParser();
         parser.parse(is, contenthandler, metadata, new ParseContext());
         System.out.println(contenthandler.toString());
    }
    catch (Exception e) {
      e.printStackTrace();
    }
    finally {
        if (is != null) is.close();
    }
  }
}
java

注意：WriteOutContentHandler 的构造参数是允许写出的最大字符数；如果不设置，默认上限是 10 万（100,000）个字符，超过后会抛出 SAXException。具体的 jar 包请自行到中央仓库里搜索依赖配置：

https://search.maven.org/ 和 https://mvnrepository.com/

三、提供一个工具类

可以将资源路径的文本类型文件（如json/html）读取成字符串

/**
 * Helper for loading text resources (for example JSON or HTML files) as strings.
 */
public class ResourceUtil {
    /**
     * Reads a resource, resolved relative to the given class, fully into a string.
     *
     * <p>The resource is opened as a stream instead of being converted to a {@code File},
     * so it also works when the resource is packaged inside a jar or its path contains
     * characters that are escaped in URLs. The content is decoded as UTF-8.
     *
     * @param currentClass class used to resolve the resource, e.g. {@code getClass()}
     * @param resourcePath resource path, e.g. {@code /data/json/xxx.json}
     *                     (relative to the resources folder)
     * @return the complete content of the resource
     * @throws IOException if the resource does not exist or cannot be read
     */
    public static String resource2String(Class<?> currentClass, String resourcePath) throws IOException {
        java.io.InputStream in = currentClass.getResourceAsStream(resourcePath);
        if (in == null) {
            // getResourceAsStream signals a missing resource with null; fail with a clear message.
            throw new java.io.FileNotFoundException("Resource not found: " + resourcePath);
        }
        try {
            return IOUtils.toString(in, "UTF-8");
        } finally {
            // IOUtils.toString does not close the stream it reads from.
            in.close();
        }
    }
}
java

四、总结

这里提供了多种去除html标签的方式，建议先测试好再实际使用。测试时读取资源文件可以使用第三节提供的工具类。如果正则表达式无法满足你的需求，自己进一步优化即可。如果其他方式仍然有特殊情况没有考虑到，可以自己先用正则去除这种特殊情况。总之这里只是一种参考，提供了多种解决方案。

裴大头-秦可爱

裴大头-秦可爱

Java去掉html标签的各种姿势

一、背景

二、方法

2.1 纯正则方法

2.2 使用 javax.swing.text.html.HTMLEditorKit

2.3 使用Jsoup框架

2.4 使用Apache Tika

三、提供一个工具类

四、总结

裴

Java去掉html标签的各种姿势

一、背景

二、方法

2.1 纯正则方法

2.2 使用 javax.swing.text.html.HTMLEditorKit

2.3 使用Jsoup框架

2.4 使用Apache Tika

三、提供一个工具类

四、总结

Cursor、Claude Code之后，我发现了一个更适合团队开发的 AI 编程平台：MonkeyCode

vue实现页面滑动至指定位置

裴