/**
 * Creates a splitter from a locating pattern and its candidate classification patterns.
 *
 * <p>This overload does not assign {@code flag}; the field keeps whatever value its
 * declaration gives it (the declaration is not visible in this part of the file).
 *
 * @param pattern  pattern stored in {@code this.pattern}
 * @param patterns patterns stored in {@code this.patterns}; the array is kept by reference,
 *                 not copied
 */
public Splitter(Pattern pattern, Pattern[] patterns) {
    this.pattern = pattern;
    this.patterns = patterns;
}
/**
 * Creates a splitter from a locating pattern, its candidate classification patterns and an
 * explicit {@code flag} value.
 *
 * @param pattern  pattern stored in {@code this.pattern}
 * @param patterns patterns stored in {@code this.patterns}; the array is kept by reference,
 *                 not copied
 * @param flag     value stored in {@code this.flag}
 */
public Splitter(Pattern pattern, Pattern[] patterns, boolean flag) {
    this.pattern = pattern;
    this.flag = flag;
    this.patterns = patterns;
}
Segment.java类如下
import java.util.regex.Pattern;
/* Created by ajtdnyy on 13-9-3. */
/**
 * A piece of text paired with the {@link Pattern} it was recognised by.
 *
 * <p>Both fields are package-private and are read directly by other classes in the package.
 */
public class Segment {
    /** The recognised text. */
    String value;
    /** The pattern instance that {@code value} was matched against. */
    Pattern pattern;

    /**
     * @param value   the recognised text
     * @param pattern the pattern that matched {@code value}
     */
    public Segment(String value, Pattern pattern) {
        this.value = value;
        this.pattern = pattern;
    }
}
ChineseAddress.java类如下
import java.util.List;
/* Created by ajtdnyy on 13-9-3. */
public class ChineseAddress {
public String source;
public String nation;
public String province;
public String city;
public String county;
public String district;
public String street;
public List<String> roads;
public String number;
public String plaza;
public String ip;
public String town;
public String village;
public String zone;
public String underground;
public List<String> notes;
public List<String> noises;
private static final String SEPARATOR = System.getProperty("line.separator");
public String toString() {
String s = "src: " + source + SEPARATOR;
if (nation != null) {
s = s + "nat: " + nation + SEPARATOR;
if (province != null) {
s = s + "pro: " + province + SEPARATOR;
if (city != null) {
s = s + "cit: " + city + SEPARATOR;
if (county != null) {
s = s + "cou: " + county + SEPARATOR;
if (district != null) {
s = s + "dis: " + district + SEPARATOR;
if (street != null) {
s = s + "str: " + street + SEPARATOR;
if (number != null) {
s = s + "num: " + number + SEPARATOR;
if (plaza != null) {
s = s + "pla: " + plaza + SEPARATOR;
if (ip != null) {
s = s + "idp: " + ip + SEPARATOR;
if (town != null) {
s = s + "twn: " + town + SEPARATOR;
if (village != null) {
s = s + "vil: " + village + SEPARATOR;
if (zone != null) {
s = s + "zon: " + zone + SEPARATOR;
if (underground != null) {
s = s + "udg: " + underground + SEPARATOR;
if (roads != null) {
s = s + "rod: ";
for (int i = 0; i < roads.size(); i++) {
String r = roads.get(i);
if (r == roads.get(0)) {
s = s + r;
} else {
s = s + " / " + r;
s = s + SEPARATOR;
if (notes != null) {
s = s + "not: ";
for (int i = 0; i < notes.size(); i++) {
String n = notes.get(i);
if (n == roads.get(0)) {
s = s + n;
} else {
s = s + " / " + n;
s = s + SEPARATOR;
if (noises != null) {
s = s + "noi: ";
for (int i = 0; i < noises.size(); i++) {
s = s + noises.get(i) + " / ";
s = s + SEPARATOR;
return s;
ChineseAddressParser.java类如下
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/* Created by ajtdnyy on 13-9-3. */
public class ChineseAddressParser {
static final String reg = "[\u4e00-\u9fa5]";
static final Pattern ms_Pattern_guo = Pattern.compile("中国");
static final Pattern ms_Pattern_jinjiao = Pattern.compile("近郊");
static final Pattern ms_Pattern_sheng = Pattern.compile(reg + "+?省");
static final Pattern ms_Pattern_shi = Pattern.compile(reg + "+?市(?!场)");
static final Pattern ms_Pattern_qu = Pattern.compile(reg + "+?区");
static final Pattern ms_Pattern_xiang = Pattern.compile(reg + "+?乡");
static final Pattern ms_Pattern_xian = Pattern.compile(reg + "+?县");
static final Pattern ms_Pattern_dao = Pattern.compile(reg + "+?道");
static final Pattern ms_Pattern_hutong = Pattern.compile(reg + "+?胡同");
static final Pattern ms_Pattern_nongtang = Pattern.compile(reg + "+?弄堂");
static final Pattern ms_Pattern_jie = Pattern.compile(reg + "+?街");
static final Pattern ms_Pattern_xiangg = Pattern.compile(reg + "+?巷");
static final Pattern ms_Pattern_lu = Pattern.compile(reg + "+?路");
static final Pattern ms_Pattern_cun = Pattern.compile(reg + "+?村");
static final Pattern ms_Pattern_zhen = Pattern.compile(reg + "+?镇");
static final Pattern ms_Pattern_hao = Pattern.compile("[甲_乙_丙_0-9_-]+?号");
static final Pattern ms_Pattern_point = Pattern.compile(reg + "+?(?:广场|酒店|饭店|宾馆|中心|大厦|百货|大楼|商城)");
static final Pattern ms_Pattern_ditie = Pattern.compile("地铁" + reg + "+?线(?:" + reg + "+?站)?");
static final Pattern ms_Pattern_province = Pattern.compile(reg + "{2,10}?(?:省|特区|自治区|特别行政区)");
static final Pattern ms_Pattern_city = Pattern.compile(reg + "+?(?:市|地区|自治州)");
static final Pattern ms_Pattern_county = Pattern.compile(reg + "+?(?:乡|县)");
static final Pattern ms_Pattern_street = Pattern.compile(reg + "+?街道");
static final Pattern ms_Pattern_road = Pattern.compile(reg + "+?(?:胡同|弄堂|街|巷|路|道)");
static final Pattern ms_Pattern_roadnear = Pattern.compile("(?<=近)" + reg + "+?(?:胡同|弄堂|街|巷|路|道)");
static final Pattern ms_Pattern_ip = Pattern.compile(reg + "+?(?:开发区|科技区|园区)");
static final Pattern ms_Pattern_zone = Pattern.compile(reg + "+?(?:小区|社区|新村)");
static final Pattern ms_Pattern_village = Pattern.compile(reg + "+?村");
static final Pattern ms_Pattern_town = Pattern.compile(reg + "+?镇");
static final Pattern ms_Pattern_number = Pattern.compile("[甲_乙_丙_0-9_-]+号");
static final Pattern ms_Pattern_plaza = Pattern.compile(reg + "+?(?:广场|酒店|饭店|宾馆|中心|大厦|百货|大楼|商城)");
static final Pattern ms_Pattern_underground = Pattern.compile("地铁" + reg + "+?线(?:" + reg + "+?站)?");
static final Splitter ms_splitter_guo = new Splitter(ms_Pattern_guo, new Pattern[]{ms_Pattern_guo});
static final Splitter ms_splitter_sheng = new Splitter(ms_Pattern_sheng, new Pattern[]{ms_Pattern_province});
static final Splitter ms_splitter_shi = new Splitter(ms_Pattern_shi, new Pattern[]{ms_Pattern_city}, false);
static final Splitter ms_splitter_jinjiao = new Splitter(ms_Pattern_jinjiao, new Pattern[]{ms_Pattern_jinjiao});
static final Splitter ms_splitter_qu = new Splitter(ms_Pattern_qu, new Pattern[]{ms_Pattern_province, ms_Pattern_city, ms_Pattern_zone, ms_Pattern_ip, ms_Pattern_qu}, false);
static final Splitter ms_splitter_xiang = new Splitter(ms_Pattern_xiang, new Pattern[]{ms_Pattern_county});
static final Splitter ms_splitter_xian = new Splitter(ms_Pattern_xian, new Pattern[]{ms_Pattern_county});
static final Splitter ms_splitter_dao = new Splitter(ms_Pattern_dao, new Pattern[]{ms_Pattern_street, ms_Pattern_roadnear, ms_Pattern_road}, false);
static final Splitter ms_splitter_hutong = new Splitter(ms_Pattern_hutong, new Pattern[]{ms_Pattern_roadnear, ms_Pattern_road}, false);
static final Splitter ms_splitter_nongtang = new Splitter(ms_Pattern_nongtang, new Pattern[]{ms_Pattern_roadnear, ms_Pattern_road}, false);
static final Splitter ms_splitter_jie = new Splitter(ms_Pattern_jie, new Pattern[]{ms_Pattern_roadnear, ms_Pattern_road}, false);
static final Splitter ms_splitter_lu = new Splitter(ms_Pattern_lu, new Pattern[]{ms_Pattern_roadnear, ms_Pattern_road}, false);
static final Splitter ms_splitter_xiangg = new Splitter(ms_Pattern_xiangg, new Pattern[]{ms_Pattern_roadnear, ms_Pattern_road}, false);
static final Splitter ms_splitter_cun = new Splitter(ms_Pattern_cun, new Pattern[]{ms_Pattern_zone, ms_Pattern_village});
static final Splitter ms_splitter_zhen = new Splitter(ms_Pattern_zhen, new Pattern[]{ms_Pattern_town});
static final Splitter ms_splitter_hao = new Splitter(ms_Pattern_hao, new Pattern[]{ms_Pattern_number});
static final Splitter ms_splitter_point = new Splitter(ms_Pattern_point, new Pattern[]{ms_Pattern_plaza});
static final Splitter ms_splitter_ditie = new Splitter(ms_Pattern_ditie, new Pattern[]{ms_Pattern_underground});
static final Splitter[] ms_defaultsplitters = new Splitter[]{
ms_splitter_guo,
ms_splitter_sheng,
ms_splitter_shi,
ms_splitter_qu,
ms_splitter_xiang,
ms_splitter_xian,
ms_splitter_dao,
ms_splitter_hutong,
ms_splitter_nongtang,
ms_splitter_jie,
ms_splitter_xiangg,
ms_splitter_lu,
ms_splitter_cun,
ms_splitter_zhen,
ms_splitter_hao,
ms_splitter_point,
ms_splitter_ditie,
ms_splitter_jinjiao
private static LinkedHashMap<Integer, Splitter> split(String src, Splitter[] splitters) {
LinkedHashMap<Integer, Splitter> splitterdic = new LinkedHashMap<Integer, Splitter>();
for (Splitter s : splitters) {
Matcher m = s.pattern.matcher(src);
while (m.find()) {
splitterdic.put(m.start() + m.group().length(), s);
if (s.flag) {
break;
return splitterdic;
private static ArrayList<Segment> recognize(String src, LinkedHashMap<Integer, Splitter> splitterdic) {
Segment s;
int index = 0;
ArrayList<Segment> segments = new ArrayList<Segment>();
if (src.length() > 0) {
for (Integer key : splitterdic.keySet()) {
Splitter value = splitterdic.get(key);
if (key > index && key < src.length()) {
for (Pattern r : value.patterns) {
s = segmentRecognize(src.substring(index, key), r);
if (s != null) {
segments.add(s);
break;
index = key;
return segments;
private static Segment segmentRecognize(String src, Pattern r) {
Matcher m = r.matcher(src);
if (m.matches()) {
return new Segment(m.group(), r);
} else {
return null;
private static ArrayList<String> segmentsGetStringListForPattern(ArrayList<Segment> segments, Pattern r) {
ArrayList<String> ss = new ArrayList<String>();
for (Iterator<Segment> it = segments.iterator(); it.hasNext();) {
Segment s = it.next();
if (s.pattern == r) {
ss.add(s.value);
return ss;
private static String segmentsGetStringForPattern(ArrayList<Segment> segments, Pattern r) {
for (Iterator<Segment> it = segments.iterator(); it.hasNext();) {
Segment s = it.next();
if (s.pattern == r) {
return s.value;
return null;
public static void main(String[] args) {
System.out.println(ChineseAddressParser.parse("北京市海淀区中关村北大街37号天龙大厦3层"));
System.out.println(ChineseAddressParser.parse("福州市台江区群众路278号源利明珠大厦6楼"));
System.out.println(ChineseAddressParser.parse("北京西城区百万庄大街68号6楼"));
public static ChineseAddress parse(String source) {
source = source.replace(".", "").replace(",", "").replace(",", "");
ArrayList<Segment> segments = recognize(source, split(source, ms_defaultsplitters));
ChineseAddress ca = new ChineseAddress();
ca.source = source;
ca.nation = segmentsGetStringForPattern(segments, ms_Pattern_guo);
ca.province = segmentsGetStringForPattern(segments, ms_Pattern_province);
ca.city = segmentsGetStringForPattern(segments, ms_Pattern_city);
ca.district = segmentsGetStringForPattern(segments, ms_Pattern_qu);
ca.county = segmentsGetStringForPattern(segments, ms_Pattern_county);
ca.street = segmentsGetStringForPattern(segments, ms_Pattern_street);
ArrayList<String> roads = segmentsGetStringListForPattern(segments, ms_Pattern_road);
ArrayList<String> near = segmentsGetStringListForPattern(segments, ms_Pattern_roadnear);
for (Iterator<String> it = near.iterator(); it.hasNext();) {
roads.add(it.next());
ca.roads = roads;
ca.underground = segmentsGetStringForPattern(segments, ms_Pattern_underground);
ca.number = segmentsGetStringForPattern(segments, ms_Pattern_number);
ca.plaza = segmentsGetStringForPattern(segments, ms_Pattern_plaza);
ca.ip = segmentsGetStringForPattern(segments, ms_Pattern_ip);
ca.town = segmentsGetStringForPattern(segments, ms_Pattern_town);
ca.village = segmentsGetStringForPattern(segments, ms_Pattern_village);
return ca;
}
原文地址:
http://www.vbox.top/38.html
原作者C#博客地址:
http://blog.csdn.net/helanmouse/article/details/4096933?reload
原作者C#博客地址:http://blog.csdn.net/helanmouse/article/details/4096933?reload几乎没接触过C#,不过跟java很相似,花了一天时间,将原作者的代码改成了java版。Splitter.java文件如下import java.util.regex.Pattern;/** * Created by ajtdnyy o
2.没有省,但是包含以下级别地区的关键字。
3.没有省,也没有其他地区的关键字。
我们可以使用indexOf("关键字")来进行定位,但是不能保证地址中不包括关键字,比如袁州区,州和区都是关键字。
这样就需要把所有的关键字全部定位出来,再使用subString()来进行截取,然后进行判断是否为地址...
一、案例场景
字段login_place,一共267725行记录,随机15条记录如下:
后续数据分析工作需要用到地理维度进行分析,所以需要把login_place字段进行拆分成:国家、省份、地区。
二、初步方案
第三方中文分词库:jieba,可以对文本进行拆分。使用参考资料:jieba库的使用。初步方案:
用jieba.cut()将文本拆分为单词列表list_word;
分支判断list_word长度,赋值国家、城市、地区。
代码:(抽取1000条记录,看一下我.
中文地址分词是指将中文地址字符串按照一定的规则或算法进行切分或分段,以便能够更好地理解地址含义并提供更精准的地址处理和查询服务。
在Java中,有一些开源库或工具可用于中文地址分词。常见的有:
1. IKAnalyzer:这是一个基于Lucene的中文分词器,可以进行中文地名的分词,并提供词性标注功能,适用于中文地址中地名部分的切分。
2. HanLP:这是一个自然语言处理工具包,其中包含了中文分词功能。它通过词典和模型等方式对中文地名进行切分,还提供了地址识别和地址纠错等功能。
3. Jieba分词:这是一个Python中文分词工具包,但也有相应的Java版本。它采用了基于统计的分词算法,可以较好地适应不同场景下的中文分词需求。
这些库都可以用于中文地址分词,具体使用时,可以根据实际场景和需求选择适合的工具。一般来说,需要先加载相应的词典或模型文件,然后将地址字符串传入分词函数进行分词操作,最后得到分词结果。
中文地址分词通常是为了实现中文地址的规范化、解析和查询等功能。通过分析地址的各个组成部分,如省市区、街道、门牌号等,可以更准确地进行地址匹配和定位,提供更好的服务体验。这些分词工具在实际应用中被广泛使用,可以用于地址识别、地址拼接、地理编码等场景。