/*
 * #%L
 * Netarchivesuite - heritrix 3 monitor
 * %%
 * Copyright (C) 2005 - 2018 The Royal Danish Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */

package dk.netarkivet.heritrix3.monitor;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;

import javax.servlet.http.HttpServletRequest;

/**
 * HTTP accept-language header state machine based parser.
 * Example: "da, en-gb;q=0.8, en;q=0.7".
 * <p>
 * The header is lower-cased before parsing, so all returned strings are lower-case.
 * Parsing stops at the first character the state machine does not expect; entries
 * recognised up to that point are still returned.
 */
public class AcceptLanguageParser {

    /** Start state. Look for a language. */
    public static final int S_START_SPC = 0;
    /** Language state. Look for country, qvalue or start of next locale. */
    public static final int S_LANG = 1;
    /** Country state. Look for a country. */
    public static final int S_COUNTRY = 2;
    /** Country parsed state. Look for qvalue or next locale. */
    public static final int S_COUNTRY_SPC = 3;
    /** Semicolon state. Look for an optional qvalue. */
    public static final int S_SEMICOLON = 4;
    /** State name. Look for attribute name. */
    public static final int S_NAME = 5;
    /** State parsed name. Look for value or next attribute or next locale. */
    public static final int S_NAME_SPC = 6;
    /** State equal. Look for first value character or next attribute or next locale. */
    public static final int S_EQ = 7;
    /** State value. Look for the rest of the value and next attribute or next locale. */
    public static final int S_VALUE = 8;

    /**
     * Parsed language, country, locale and qvalue.
     */
    public static class AcceptLanguage {
        /** Language type string. */
        public String language;
        /** Country subtype string, <code>null</code> when the tag has no country part. */
        public String country;
        /** Combined locale string, either "language" or "language-country". */
        public String locale;
        /** Optional qvalue, defaults to 1. */
        public float qvalue = 1.0f;
    }

    /**
     * Orders languages by descending qvalue, so the most preferred language sorts first.
     */
    public static class AcceptLanguageComparator implements Comparator<AcceptLanguage> {
        @Override
        public int compare(AcceptLanguage o1, AcceptLanguage o2) {
            // Arguments are swapped to get descending order. Float.compare gives a
            // consistent total order, which a subtraction based comparison does not
            // when the difference is NaN.
            return Float.compare(o2.qvalue, o1.qvalue);
        }
    }

    /** Reusable comparator used to sort languages by their qvalue. */
    public static AcceptLanguageComparator acceptLanguageComparator = new AcceptLanguageComparator();

    /**
     * Parses a HTTP accept-language header, if present, from the supplied HTTP request and returns a sorted list of valid languages.
     * Languages are sorted by their qvalue.
     * @param req HTTP request whose "Accept-Language" header is parsed
     * @return <code>List</code> of valid languages sorted by their qvalue
     */
    public static List<AcceptLanguage> parseHeader(HttpServletRequest req) {
        return parseHeader(req.getHeader("Accept-Language"));
    }

    /**
     * Parses a HTTP accept-language header string and returns a sorted list of valid languages.
     * Languages are sorted by their qvalue, highest first.
     * <p>
     * Every returned entry has a non-empty language and a non-null locale. Empty list
     * elements (for example the one between the commas in "da,,en") are skipped.
     * @param acceptLanguageStr accept language header string from a HTTP request, may be <code>null</code> or empty
     * @return <code>List</code> of valid languages sorted by their qvalue, empty if the header is <code>null</code> or empty
     */
    public static List<AcceptLanguage> parseHeader(String acceptLanguageStr) {
        List<AcceptLanguage> acceptLanguages = new ArrayList<>();
        if (acceptLanguageStr == null || acceptLanguageStr.length() == 0) {
            return acceptLanguages;
        }
        // Locale.ROOT keeps the lower-casing independent of the JVM default locale;
        // with a Turkish default 'I' would otherwise become a dotless 'i' and be rejected.
        char[] charArr = acceptLanguageStr.toLowerCase(Locale.ROOT).toCharArray();
        int len = charArr.length;
        int idx = 0;
        int state = S_START_SPC;
        // Entry currently being parsed; it is added to the list as soon as its language is known.
        AcceptLanguage acceptLanguage = null;
        // Name of the attribute currently being parsed, e.g. "q".
        String name = null;
        // Accumulates the characters of the current language, country, name or value.
        StringBuilder sb = new StringBuilder();
        boolean bLoop = true;
        char c;
        while (bLoop && idx < len) {
            c = charArr[idx];
            switch (state) {
            case S_START_SPC:
                while (idx < len && charArr[idx] == ' ') {
                    ++idx;
                }
                state = S_LANG;
                acceptLanguage = new AcceptLanguage();
                sb.setLength(0);
                break;
            case S_LANG:
                if (isLowerAlpha(c)) {
                    sb.append(c);
                    ++idx;
                }
                else if (sb.length() == 0) {
                    if (c == ',') {
                        // Empty list element, skip it instead of emitting an empty language.
                        ++idx;
                        state = S_START_SPC;
                    }
                    else {
                        // A separator or unexpected character without a preceding language.
                        bLoop = false;
                    }
                }
                else {
                    switch (c) {
                    case ',':
                        acceptLanguages.add(acceptLanguage);
                        completeLanguage(acceptLanguage, sb);
                        ++idx;
                        state = S_START_SPC;
                        break;
                    case '-':
                    case '_':
                        // The locale is filled in once the country part has been parsed.
                        acceptLanguages.add(acceptLanguage);
                        acceptLanguage.language = sb.toString();
                        ++idx;
                        state = S_COUNTRY;
                        sb.setLength(0);
                        break;
                    case ' ':
                        acceptLanguages.add(acceptLanguage);
                        completeLanguage(acceptLanguage, sb);
                        ++idx;
                        state = S_COUNTRY_SPC;
                        break;
                    case ';':
                        acceptLanguages.add(acceptLanguage);
                        completeLanguage(acceptLanguage, sb);
                        ++idx;
                        state = S_SEMICOLON;
                        break;
                    default:
                        bLoop = false;
                        break;
                    }
                }
                break;
            case S_COUNTRY:
                switch (c) {
                case ' ':
                    completeCountry(acceptLanguage, sb);
                    ++idx;
                    state = S_COUNTRY_SPC;
                    break;
                case ',':
                    completeCountry(acceptLanguage, sb);
                    ++idx;
                    state = S_START_SPC;
                    break;
                case ';':
                    completeCountry(acceptLanguage, sb);
                    ++idx;
                    state = S_SEMICOLON;
                    break;
                default:
                    if (isLowerAlpha(c)) {
                        sb.append(c);
                        ++idx;
                    }
                    else {
                        // The state is left unchanged so the pending entry is finalized after the loop.
                        bLoop = false;
                    }
                    break;
                }
                break;
            case S_COUNTRY_SPC:
                switch (c) {
                case ' ':
                    ++idx;
                    break;
                case ',':
                    ++idx;
                    state = S_START_SPC;
                    break;
                case ';':
                    ++idx;
                    state = S_SEMICOLON;
                    break;
                default:
                    bLoop = false;
                    break;
                }
                break;
            case S_SEMICOLON:
                switch (c) {
                case ' ':
                case ';':
                    ++idx;
                    break;
                case ',':
                    ++idx;
                    state = S_START_SPC;
                    break;
                default:
                    if (isLowerAlpha(c)) {
                        sb.setLength(0);
                        sb.append(c);
                        ++idx;
                        state = S_NAME;
                    }
                    else {
                        bLoop = false;
                    }
                    break;
                }
                break;
            case S_NAME:
                switch (c) {
                case ' ':
                    name = sb.toString();
                    ++idx;
                    state = S_NAME_SPC;
                    break;
                case '=':
                    name = sb.toString();
                    ++idx;
                    state = S_EQ;
                    break;
                default:
                    if (isLowerAlpha(c)) {
                        sb.append(c);
                        ++idx;
                    }
                    else {
                        bLoop = false;
                    }
                    break;
                }
                break;
            case S_NAME_SPC:
                switch (c) {
                case ' ':
                    ++idx;
                    break;
                case '=':
                    ++idx;
                    state = S_EQ;
                    break;
                default:
                    bLoop = false;
                    break;
                }
                break;
            case S_EQ:
                switch (c) {
                case ' ':
                    ++idx;
                    break;
                default:
                    if ((c >= '0' && c <= '9') || isLowerAlpha(c)) {
                        sb.setLength(0);
                        sb.append(c);
                        ++idx;
                        state = S_VALUE;
                    }
                    else {
                        bLoop = false;
                    }
                    break;
                }
                break;
            case S_VALUE:
                switch (c) {
                case ' ':
                    if (!applyQvalue(acceptLanguage, name, sb)) {
                        bLoop = false;
                    }
                    ++idx;
                    state = S_COUNTRY_SPC;
                    break;
                case ';':
                    if (!applyQvalue(acceptLanguage, name, sb)) {
                        bLoop = false;
                    }
                    ++idx;
                    state = S_SEMICOLON;
                    break;
                case ',':
                    if (!applyQvalue(acceptLanguage, name, sb)) {
                        bLoop = false;
                    }
                    ++idx;
                    state = S_START_SPC;
                    break;
                default:
                    if (c == '.' || (c >= '0' && c <= '9') || isLowerAlpha(c)) {
                        sb.append(c);
                        ++idx;
                    }
                    else {
                        bLoop = false;
                    }
                    break;
                }
                break;
            default:
                throw new IllegalStateException("Epic fail! (State=" + state + ")");
            }
        }
        // Finalize whatever was being parsed when the loop ended.
        switch (state) {
        case S_LANG:
            // Only a language that runs to the end of the header is accepted here.
            if (idx == len && sb.length() > 0) {
                acceptLanguages.add(acceptLanguage);
                completeLanguage(acceptLanguage, sb);
            }
            break;
        case S_COUNTRY:
            // The entry is already in the list, so its locale must be set even when
            // parsing stopped early on an unexpected character.
            completeCountry(acceptLanguage, sb);
            break;
        case S_VALUE:
            if (idx == len) {
                // A malformed trailing qvalue is ignored and the default is kept.
                applyQvalue(acceptLanguage, name, sb);
            }
            break;
        default:
            break;
        }
        Collections.sort(acceptLanguages, acceptLanguageComparator);
        return acceptLanguages;
    }

    /** Returns true if the character is a lower-case ASCII letter. */
    private static boolean isLowerAlpha(char c) {
        return c >= 'a' && c <= 'z';
    }

    /** Stores the buffered text as the language and uses it as the locale of a tag without country. */
    private static void completeLanguage(AcceptLanguage acceptLanguage, StringBuilder sb) {
        acceptLanguage.language = sb.toString();
        acceptLanguage.locale = acceptLanguage.language;
    }

    /** Stores the buffered text, if any, as the country and sets the locale accordingly. */
    private static void completeCountry(AcceptLanguage acceptLanguage, StringBuilder sb) {
        if (sb.length() > 0) {
            acceptLanguage.country = sb.toString();
            acceptLanguage.locale = acceptLanguage.language + "-" + acceptLanguage.country;
        }
        else {
            acceptLanguage.locale = acceptLanguage.language;
        }
    }

    /**
     * Stores the buffered text as the qvalue when the attribute name is "q".
     * Returns false only if the attribute is "q" and the text is not a parsable float.
     */
    private static boolean applyQvalue(AcceptLanguage acceptLanguage, String name, StringBuilder sb) {
        if ("q".equals(name)) {
            try {
                acceptLanguage.qvalue = Float.parseFloat(sb.toString());
            }
            catch (NumberFormatException e) {
                return false;
            }
        }
        return true;
    }

}