001/*
002 * #%L
003 * Netarchivesuite - heritrix 3 monitor
004 * %%
005 * Copyright (C) 2005 - 2018 The Royal Danish Library, 
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023
024package dk.netarkivet.heritrix3.monitor;
025
026import java.util.ArrayList;
027import java.util.Collections;
028import java.util.Comparator;
029import java.util.List;
030
031import javax.servlet.http.HttpServletRequest;
032
033/**
034 * HTTP accept-language header state machine based parser.
035 * Example: "da, en-gb;q=0.8, en;q=0.7".
036 */
037public class AcceptLanguageParser {
038
039        /** Start state. Look for a language. */
040    public static final int S_START_SPC = 0;
041    /** Language state. Look for country, qvalue or start of next locale. */
042    public static final int S_LANG = 1;
043    /** Country state. Look for a country. */
044    public static final int S_COUNTRY = 2;
045    /** Country parsed state. Look for qvalue or next locale. */
046    public static final int S_COUNTRY_SPC = 3;
047    /** Semicolon state. Look for an optional qvalue. */
048    public static final int S_SEMICOLON = 4;
049    /** State name. Look for attribute name. */
050    public static final int S_NAME = 5;
051    /** State parsed name. Look for value or next attribute or next locale. */
052    public static final int S_NAME_SPC = 6;
053    /** State equal. Look for first value character or next attribute or next locale. */
054    public static final int S_EQ = 7;
055    /** State value. Look for the rest of the value and next attribute or next locale. */
056    public static final int S_VALUE = 8;
057
058    /**
059     * Parsed language, country, locale and qvalue.
060     */
061    public static class AcceptLanguage {
062        /** Language type string. */
063        public String language;
064        /** Country subtype string. */
065        public String country;
066        /** Combined locale string. */
067        public String locale;
068        /** Optional qvalue, defaults to 1. */
069        public float qvalue = 1.0f;
070    }
071
072    public static class AcceptLanguageComparator implements Comparator<AcceptLanguage> {
073        @Override
074        public int compare(AcceptLanguage o1, AcceptLanguage o2) {
075            return Math.round(Math.signum(o2.qvalue - o1.qvalue));
076        }
077    }
078
079    /** Reusable comparator used to sort languages by their qvalue. */
080    public static AcceptLanguageComparator acceptLanguageComparator = new AcceptLanguageComparator();
081
082    /**
083     * Parses a HTTP accept-language header, if present, from the supplied HTTP request and returns a sorted list of valid languages.
084     * Languages are sorted by their qvalue.
085     * @param acceptLanguageStr accept language header string from a HTTP request
086     * @return <code>List</code> of valid languages sorted by their qvalue
087     */
088    public static List<AcceptLanguage> parseHeader(HttpServletRequest req) {
089        return parseHeader(req.getHeader("Accept-Language"));
090    }
091
092    /**
093     * Parses a HTTP accept-language header string and returns a sorted list of valid languages.
094     * Languages are sorted by their qvalue.
095     * @param acceptLanguageStr accept language header string from a HTTP request
096     * @return <code>List</code> of valid languages sorted by their qvalue
097     */
098    public static List<AcceptLanguage> parseHeader(String acceptLanguageStr) {
099        List<AcceptLanguage> acceptLanguages = new ArrayList<>();
100        char[] charArr;
101        String name = null;
102        if (acceptLanguageStr != null && acceptLanguageStr.length() > 0) {
103            AcceptLanguage acceptLanguage = null;
104            StringBuilder sb = new StringBuilder();
105            charArr = acceptLanguageStr.toLowerCase().toCharArray();
106            char c;
107            int idx = 0;
108            int len = charArr.length;
109            int state = S_START_SPC;
110            boolean bLoop = true;
111            while (bLoop) {
112                if (idx < len) {
113                    switch (state) {
114                    case S_START_SPC:
115                        while (idx < len && charArr[idx] == ' ') {
116                            ++idx;
117                        }
118                        state = S_LANG;
119                        acceptLanguage = new AcceptLanguage();
120                        sb.setLength(0);
121                        break;
122                    case S_LANG:
123                        c = charArr[idx];
124                        switch (c) {
125                        case ',':
126                            acceptLanguages.add(acceptLanguage);
127                            acceptLanguage.language = sb.toString();
128                            acceptLanguage.locale = acceptLanguage.language;
129                            ++idx;
130                            state = S_START_SPC;
131                            break;
132                        case '-':
133                        case '_':
134                            acceptLanguages.add(acceptLanguage);
135                            acceptLanguage.language = sb.toString();
136                            ++idx;
137                            state = S_COUNTRY;
138                            sb.setLength(0);
139                            break;
140                        case ' ':
141                            acceptLanguages.add(acceptLanguage);
142                            acceptLanguage.language = sb.toString();
143                            acceptLanguage.locale = acceptLanguage.language;
144                            ++idx;
145                            state = S_COUNTRY_SPC;
146                            break;
147                        case ';':
148                            acceptLanguages.add(acceptLanguage);
149                            acceptLanguage.language = sb.toString();
150                            acceptLanguage.locale = acceptLanguage.language;
151                            ++idx;
152                            state = S_SEMICOLON;
153                            break;
154                        default:
155                            if (c >= 'a' && c <= 'z') {
156                                sb.append(c);
157                                ++idx;
158                            }
159                            else {
160                                bLoop = false;
161                            }
162                            break;
163                        }
164                        break;
165                    case S_COUNTRY:
166                        c = charArr[idx];
167                        switch (c) {
168                        case ' ':
169                            if (sb.length() > 0) {
170                                acceptLanguage.country = sb.toString();
171                                acceptLanguage.locale = acceptLanguage.language + "-" + acceptLanguage.country;
172                            }
173                            else {
174                                acceptLanguage.locale = acceptLanguage.language;
175                            }
176                            ++idx;
177                            state = S_COUNTRY_SPC;
178                            break;
179                        case ',':
180                            if (sb.length() > 0) {
181                                acceptLanguage.country = sb.toString();
182                                acceptLanguage.locale = acceptLanguage.language + "-" + acceptLanguage.country;
183                            }
184                            else {
185                                acceptLanguage.locale = acceptLanguage.language;
186                            }
187                            ++idx;
188                            state = S_START_SPC;
189                            break;
190                        case ';':
191                            if (sb.length() > 0) {
192                                acceptLanguage.country = sb.toString();
193                                acceptLanguage.locale = acceptLanguage.language + "-" + acceptLanguage.country;
194                            }
195                            else {
196                                acceptLanguage.locale = acceptLanguage.language;
197                            }
198                            ++idx;
199                            state = S_SEMICOLON;
200                            break;
201                        default:
202                            if (c >= 'a' && c <= 'z') {
203                                sb.append(c);
204                                ++idx;
205                            }
206                            else {
207                                bLoop = false;
208                            }
209                            break;
210                        }
211                           break;
212                    case S_COUNTRY_SPC:
213                        c = charArr[idx];
214                        switch (c) {
215                        case ' ':
216                            ++idx;
217                            break;
218                        case ',':
219                            ++idx;
220                            state = S_START_SPC;
221                            break;
222                        case ';':
223                            ++idx;
224                            state = S_SEMICOLON;
225                            break;
226                        default:
227                            bLoop = false;
228                            break;
229                        }
230                        break;
231                    case S_SEMICOLON:
232                        c = charArr[idx];
233                        switch (c) {
234                        case ' ':
235                        case ';':
236                            ++idx;
237                            break;
238                        case ',':
239                            ++idx;
240                            state = S_START_SPC;
241                            break;
242                        default:
243                            if (c >= 'a' && c <= 'z') {
244                                sb.setLength(0);
245                                sb.append(c);
246                                ++idx;
247                                state = S_NAME;
248                            }
249                            else {
250                                bLoop = false;
251                            }
252                            break;
253                        }
254                        break;
255                    case S_NAME:
256                        c = charArr[idx];
257                        switch (c) {
258                        case ' ':
259                            name = sb.toString();
260                            ++idx;
261                            state = S_NAME_SPC;
262                            break;
263                        case '=':
264                            name = sb.toString();
265                            ++idx;
266                            state = S_EQ;
267                            break;
268                        default:
269                            if (c >= 'a' && c <= 'z') {
270                                sb.append(c);
271                                ++idx;
272                            }
273                            else {
274                                bLoop = false;
275                            }
276                            break;
277                        }
278                        break;
279                    case S_NAME_SPC:
280                        c = charArr[idx];
281                        switch (c) {
282                        case ' ':
283                            ++idx;
284                            break;
285                        case '=':
286                            ++idx;
287                            state = S_EQ;
288                            break;
289                        default:
290                            bLoop = false;
291                            break;
292                        }
293                        break;
294                    case S_EQ:
295                        c = charArr[idx];
296                        switch (c) {
297                        case ' ':
298                            ++idx;
299                            break;
300                        default:
301                            if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z')) {
302                                sb.setLength(0);
303                                sb.append(c);
304                                ++idx;
305                                state = S_VALUE;
306                            }
307                            else {
308                                bLoop = false;
309                            }
310                            break;
311                        }
312                        break;
313                    case S_VALUE:
314                        c = charArr[idx];
315                        switch (c) {
316                        case ' ':
317                            if ("q".equals(name)) {
318                                try {
319                                    acceptLanguage.qvalue = Float.parseFloat(sb.toString());
320                                }
321                                catch (NumberFormatException e) {
322                                    bLoop = false;
323                                }
324                            }
325                            ++idx;
326                            state = S_COUNTRY_SPC;
327                            break;
328                        case ';':
329                            if ("q".equals(name)) {
330                                try {
331                                    acceptLanguage.qvalue = Float.parseFloat(sb.toString());
332                                }
333                                catch (NumberFormatException e) {
334                                    bLoop = false;
335                                }
336                            }
337                            ++idx;
338                            state = S_SEMICOLON;
339                            break;
340                        case ',':
341                            if ("q".equals(name)) {
342                                try {
343                                    acceptLanguage.qvalue = Float.parseFloat(sb.toString());
344                                }
345                                catch (NumberFormatException e) {
346                                    bLoop = false;
347                                }
348                            }
349                            ++idx;
350                            state = S_START_SPC;
351                            break;
352                        default:
353                            if (c == '.' || (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z')) {
354                                sb.append(c);
355                                ++idx;
356                            }
357                            else {
358                                bLoop = false;
359                            }
360                            break;
361                        }
362                        break;
363                    default:
364                        throw new IllegalStateException("Epic fail! (State=" + state + ")");
365                    }
366                }
367                else {
368                    bLoop = false;
369                }
370            }
371            if (idx == len) {
372                switch (state) {
373                case S_LANG:
374                    if (sb.length() > 0) {
375                        acceptLanguages.add(acceptLanguage);
376                        acceptLanguage.language = sb.toString();
377                        acceptLanguage.locale = acceptLanguage.language;
378                    }
379                    break;
380                case S_COUNTRY:
381                    if (sb.length() > 0) {
382                        acceptLanguage.country = sb.toString();
383                        acceptLanguage.locale = acceptLanguage.language + "-" + acceptLanguage.country;
384                    }
385                    else {
386                        acceptLanguage.locale = acceptLanguage.language;
387                    }
388                    break;
389                case S_VALUE:
390                    if ("q".equals(name)) {
391                        try {
392                            acceptLanguage.qvalue = Float.parseFloat(sb.toString());
393                        }
394                        catch (NumberFormatException e) {
395                        }
396                    }
397                    break;
398                }
399            }
400        }
401        Collections.sort(acceptLanguages, acceptLanguageComparator);
402        return acceptLanguages;
403    }
404
405}