001 /*
002 * Copyright 2008-2011 Thomas Nichols. http://blog.thomnichols.org
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * You are receiving this code free of charge, which represents many hours of
017 * effort from other individuals and corporations. As a responsible member
018 * of the community, you are encouraged (but not required) to donate any
019 * enhancements or improvements back to the community under a similar open
020 * source license. Thank you. -TMN
021 */
022 package groovyx.net.http;
023
024 import groovy.lang.Closure;
025 import groovy.util.XmlSlurper;
026 import groovy.util.slurpersupport.GPathResult;
027 import groovyx.net.http.HTTPBuilder.RequestConfigDelegate;
028
029 import java.io.IOException;
030 import java.io.InputStream;
031 import java.io.InputStreamReader;
032 import java.io.Reader;
033 import java.io.UnsupportedEncodingException;
034 import java.net.URL;
035 import java.nio.charset.Charset;
036 import java.util.HashMap;
037 import java.util.Iterator;
038 import java.util.List;
039 import java.util.Map;
040
041 import javax.xml.parsers.ParserConfigurationException;
042
043 import net.sf.json.JSON;
044 import net.sf.json.groovy.JsonSlurper;
045
046 import org.apache.commons.logging.Log;
047 import org.apache.commons.logging.LogFactory;
048 import org.apache.http.HttpEntity;
049 import org.apache.http.HttpResponse;
050 import org.apache.http.NameValuePair;
051 import org.apache.http.client.utils.URLEncodedUtils;
052 import org.apache.http.entity.HttpEntityWrapper;
053 import org.apache.http.message.BasicHeader;
054 import org.apache.xml.resolver.Catalog;
055 import org.apache.xml.resolver.CatalogManager;
056 import org.apache.xml.resolver.tools.CatalogResolver;
057 import org.codehaus.groovy.runtime.MethodClosure;
058 import org.xml.sax.SAXException;
059 import org.xml.sax.XMLReader;
060
061
062 /**
063 * <p>Keeps track of response parsers for each content type. Each parser
064 * should should be a closure that accepts an {@link HttpResponse} instance,
065 * and returns whatever handler is appropriate for reading the response
066 * data for that content-type. For example, a plain-text response should
067 * probably be parsed with a <code>Reader</code>, while an XML response
068 * might be parsed by an XmlSlurper, which would then be passed to the
069 * response closure. </p>
070 *
071 * <p>Note that all methods in this class assume {@link HttpResponse#getEntity()}
072 * return a non-null value. It is the job of the HTTPBuilder instance to ensure
073 * a NullPointerException is not thrown by passing a response that contains no
074 * entity.</p>
075 *
076 * <p>You can see the list of content-type parsers that are built-in to the
077 * ParserRegistry class in {@link #buildDefaultParserMap()}.</p>
078 *
079 * @see ContentType
080 * @author <a href='mailto:tomstrummer+httpbuilder@gmail.com'>Tom Nichols</a>
081 */
082 public class ParserRegistry {
083
084 /**
085 * The default parser used for unregistered content-types. This is a copy
086 * of {@link #parseStream(HttpResponse)}, which is like a no-op that just
087 * returns the unaltered response stream.
088 */
089 protected final Closure DEFAULT_PARSER = new MethodClosure( this, "parseStream" );
090 /**
091 * The default charset to use when no charset is given in the Content-Type
092 * header of a response. This can be modifid via {@link #setDefaultCharset(String)}.
093 */
094 public static final String DEFAULT_CHARSET = "UTF-8";
095
096 private Closure defaultParser = DEFAULT_PARSER;
097 private Map<String,Closure> registeredParsers = buildDefaultParserMap();
098 private static String defaultCharset = DEFAULT_CHARSET;
099
100 protected static final Log log = LogFactory.getLog( ParserRegistry.class );
101
102 /**
103 * This CatalogResolver is static to avoid the overhead of re-parsing
104 * the catalog definition file every time. Unfortunately, there's no
105 * way to share a single Catalog instance between resolvers. The
106 * {@link Catalog} class is technically not thread-safe, but as long as you
107 * do not parse catalog files while using the resolver, it should be fine.
108 */
109 protected static CatalogResolver catalogResolver;
110
111 static {
112 CatalogManager catalogManager = new CatalogManager();
113 catalogManager.setIgnoreMissingProperties( true );
114 catalogManager.setUseStaticCatalog( false );
115 catalogManager.setRelativeCatalogs( true );
116 try {
117 catalogResolver = new CatalogResolver( catalogManager );
118 catalogResolver.getCatalog().parseCatalog(
119 ParserRegistry.class.getResource( "/catalog/html.xml" ) );
120 } catch ( IOException ex ) {
121 LogFactory.getLog( ParserRegistry.class )
122 .warn( "Could not resolve default XML catalog", ex );
123 }
124 }
125
126 /**
127 * Set the charset to use for parsing character streams when no charset
128 * is given in the Content-Type header.
129 * @param charset the charset to use, or <code>null</code> to use
130 * {@link #DEFAULT_CHARSET}
131 */
132 public static void setDefaultCharset( String charset ) {
133 defaultCharset = charset == null ? DEFAULT_CHARSET : charset;
134 }
135
136 /**
137 * Helper method to get the charset from the response. This should be done
138 * when manually parsing any text response to ensure it is decoded using the
139 * correct charset. For instance:<pre>
140 * Reader reader = new InputStreamReader( resp.getEntity().getContent(),
141 * ParserRegistry.getCharset( resp ) );</pre>
142 * @param resp
143 */
144 public static String getCharset( HttpResponse resp ) {
145 try {
146 NameValuePair charset = resp.getEntity().getContentType()
147 .getElements()[0].getParameterByName("charset");
148
149 if ( charset == null || charset.getValue().trim().equals("") ) {
150 log.debug( "Could not find charset in response; using " + defaultCharset );
151 return defaultCharset;
152 }
153
154 return charset.getValue();
155 }
156 catch ( RuntimeException ex ) { // NPE or OOB Exceptions
157 log.warn( "Could not parse charset from content-type header in response" );
158 return Charset.defaultCharset().name();
159 }
160 }
161
162 /**
163 * Helper method to get the content-type string from the response
164 * (no charset).
165 * @param resp
166 */
167 public static String getContentType( HttpResponse resp ) {
168 if ( resp.getEntity() == null )
169 throw new IllegalArgumentException( "Response does not contain data" );
170 if ( resp.getEntity().getContentType() == null )
171 throw new IllegalArgumentException( "Response does not have a content-type header" );
172 try {
173 return resp.getEntity().getContentType().getElements()[0].getName();
174 }
175 catch ( RuntimeException ex ) { // NPE or OOB Exceptions
176 throw new IllegalArgumentException( "Could not parse content-type from response" );
177 }
178 }
179
180 /**
181 * Default parser used for binary data. This simply returns the underlying
182 * response InputStream.
183 * @see ContentType#BINARY
184 * @see HttpEntity#getContent()
185 * @param resp
186 * @return an InputStream the binary response stream
187 * @throws IllegalStateException
188 * @throws IOException
189 */
190 public InputStream parseStream( HttpResponse resp ) throws IOException {
191 return resp.getEntity().getContent();
192 }
193
194 /**
195 * Default parser used to handle plain text data. The response text
196 * is decoded using the charset passed in the response content-type
197 * header.
198 * @see ContentType#TEXT
199 * @param resp
200 * @return
201 * @throws UnsupportedEncodingException
202 * @throws IllegalStateException
203 * @throws IOException
204 */
205 public Reader parseText( HttpResponse resp ) throws IOException {
206 return new InputStreamReader( resp.getEntity().getContent(),
207 ParserRegistry.getCharset( resp ) );
208 }
209
210 /**
211 * Default parser used to decode a URL-encoded response.
212 * @see ContentType#URLENC
213 * @param resp
214 * @return
215 * @throws IOException
216 */
217 public Map<String,String> parseForm( final HttpResponse resp ) throws IOException {
218 HttpEntity entity = resp.getEntity();
219 /* URLEncodedUtils won't parse the content unless the content-type is
220 application/x-www-form-urlencoded. Since we want to be able to force
221 parsing regardless of what the content-type header says, we need to
222 'spoof' the content-type if it's not already acceptable. */
223 if ( ! ContentType.URLENC.toString().equals( ParserRegistry.getContentType( resp ) ) ) {
224 entity = new HttpEntityWrapper( entity ) {
225 @Override public org.apache.http.Header getContentType() {
226 String value = ContentType.URLENC.toString();
227 String charset = ParserRegistry.getCharset( resp );
228 if ( charset != null ) value += "; charset=" + charset;
229 return new BasicHeader( "Content-Type", value );
230 };
231 };
232 }
233 List<NameValuePair> params = URLEncodedUtils.parse( entity );
234 Map<String,String> paramMap = new HashMap<String,String>(params.size());
235 for ( NameValuePair param : params )
236 paramMap.put( param.getName(), param.getValue() );
237 return paramMap;
238 }
239
240 /**
241 * Parse an HTML document by passing it through the NekoHTML parser.
242 * @see ContentType#HTML
243 * @see org.cyberneko.html.parsers.SAXParser
244 * @see XmlSlurper#parse(Reader)
245 * @param resp HTTP response from which to parse content
246 * @return the {@link GPathResult} from calling {@link XmlSlurper#parse(Reader)}
247 * @throws IOException
248 * @throws SAXException
249 */
250 public GPathResult parseHTML( HttpResponse resp ) throws IOException, SAXException {
251 XMLReader p = new org.cyberneko.html.parsers.SAXParser();
252 p.setEntityResolver( catalogResolver );
253 return new XmlSlurper( p ).parse( parseText( resp ) );
254 }
255
256 /**
257 * Default parser used to decode an XML response.
258 * @see ContentType#XML
259 * @see XmlSlurper#parse(Reader)
260 * @param resp HTTP response from which to parse content
261 * @return the {@link GPathResult} from calling {@link XmlSlurper#parse(Reader)}
262 * @throws IOException
263 * @throws SAXException
264 * @throws ParserConfigurationException
265 */
266 public GPathResult parseXML( HttpResponse resp ) throws IOException, SAXException, ParserConfigurationException {
267 XmlSlurper xml = new XmlSlurper();
268 xml.setEntityResolver( catalogResolver );
269 return xml.parse( parseText( resp ) );
270 }
271
272 /**
273 * Default parser used to decode a JSON response.
274 * @see ContentType#JSON
275 * @param resp
276 * @return
277 * @throws IOException
278 */
279 public JSON parseJSON( HttpResponse resp ) throws IOException {
280 // there is a bug in the JsonSlurper.parse method...
281 //String jsonTxt = DefaultGroovyMethods.getText( parseText( resp ) );
282 return new JsonSlurper().parse( parseText( resp ) );
283 }
284
285 /**
286 * <p>Returns a map of default parsers. Override this method to change
287 * what parsers are registered by default. A 'parser' is really just a
288 * closure that acceipts an {@link HttpResponse} instance and returns
289 * some parsed data. You can of course call
290 * <code>super.buildDefaultParserMap()</code> and then add or remove
291 * from that result as well.</p>
292 *
293 * <p>Default registered parsers are:
294 * <ul>
295 * <li>{@link ContentType#BINARY} : {@link #parseStream(HttpResponse) parseStream()}</li>
296 * <li>{@link ContentType#TEXT} : {@link #parseText(HttpResponse) parseText()}</li>
297 * <li>{@link ContentType#URLENC} : {@link #parseForm(HttpResponse) parseForm()}</li>
298 * <li>{@link ContentType#XML} : {@link #parseXML(HttpResponse) parseXML()}</li>
299 * <li>{@link ContentType#JSON} : {@link #parseJSON(HttpResponse) parseJSON()}</li>
300 * </ul>
301 */
302 protected Map<String,Closure> buildDefaultParserMap() {
303 Map<String,Closure> parsers = new HashMap<String,Closure>();
304
305 parsers.put( ContentType.BINARY.toString(), new MethodClosure( this, "parseStream" ) );
306 parsers.put( ContentType.TEXT.toString(), new MethodClosure(this,"parseText") );
307 parsers.put( ContentType.URLENC.toString(), new MethodClosure(this,"parseForm") );
308 parsers.put( ContentType.HTML.toString(), new MethodClosure(this,"parseHTML") );
309
310 Closure pClosure = new MethodClosure(this,"parseXML");
311 for ( String ct : ContentType.XML.getContentTypeStrings() )
312 parsers.put( ct, pClosure );
313
314 pClosure = new MethodClosure(this,"parseJSON");
315 for ( String ct : ContentType.JSON.getContentTypeStrings() )
316 parsers.put( ct, pClosure );
317
318 return parsers;
319 }
320
321 /**
322 * Add a new XML catalog definiton to the static XML resolver catalog.
323 * See the <a href='http://fisheye.codehaus.org/browse/gmod/httpbuilder/trunk/src/main/resources/catalog/html.xml?r=root:'>
324 * HTTPBuilder source catalog</a> for an example.
325 *
326 * @param catalogLocation URL of a catalog definition file
327 * @throws IOException if the given URL cannot be parsed or accessed for whatever reason.
328 */
329 public static void addCatalog( URL catalogLocation ) throws IOException {
330 catalogResolver.getCatalog().parseCatalog( catalogLocation );
331 }
332
333 /**
334 * Access the default catalog used by all HTTPBuilder instances.
335 * @return the static {@link CatalogResolver} instance
336 */
337 public static CatalogResolver getCatalogResolver() {
338 return catalogResolver;
339 }
340
341 /**
342 * Get the default parser used for unregistered content-types.
343 * @return
344 */
345 public Closure getDefaultParser() {
346 return this.defaultParser;
347 }
348
349 /**
350 * Set the default parser used for unregistered content-types.
351 * @param defaultParser if
352 */
353 public void setDefaultParser( Closure defaultParser ) {
354 if ( defaultParser == null ) this.defaultParser = DEFAULT_PARSER;
355 this.defaultParser = defaultParser;
356 }
357
358 /**
359 * Retrieve a parser for the given response content-type string. This
360 * is called by HTTPBuildre to retrieve the correct parser for a given
361 * content-type. The parser is then used to decode the response data prior
362 * to passing it to a response handler.
363 * @param contentType
364 * @return parser that can interpret the given response content type,
365 * or the default parser if no parser is registered for the given
366 * content-type. It should NOT return a null value.
367 */
368 public Closure getAt( Object contentType ) {
369 String ct = contentType.toString();
370 int idx = ct.indexOf( ';' );
371 if ( idx > 0 ) ct = ct.substring( 0, idx );
372
373 Closure parser = registeredParsers.get(ct);
374 if ( parser != null ) return parser;
375
376 log.warn( "Cannot find parser for content-type: " + ct
377 + " -- using default parser.");
378 return defaultParser;
379 }
380
381 /**
382 * Register a new parser for the given content-type. The parser closure
383 * should accept an {@link HttpResponse} argument and return a type suitable
384 * to be passed as the 'parsed data' argument of a
385 * {@link RequestConfigDelegate#getResponse() response handler} closure.
386 * @param contentType <code>content-type</code> string
387 * @param value code that will parse the HttpResponse and return parsed
388 * data to the response handler.
389 */
390 public void putAt( Object contentType, Closure value ) {
391 if ( contentType instanceof ContentType ) {
392 for ( String ct : ((ContentType)contentType).getContentTypeStrings() )
393 this.registeredParsers.put( ct, value );
394 }
395 else this.registeredParsers.put( contentType.toString(), value );
396 }
397
398 /**
399 * Alias for {@link #getAt(Object)} to allow property-style access.
400 * @param key content-type string
401 * @return
402 */
403 public Closure propertyMissing( Object key ) {
404 return this.getAt( key );
405 }
406
407 /**
408 * Alias for {@link #putAt(Object, Closure)} to allow property-style access.
409 * @param key content-type string
410 * @param value parser closure
411 */
412 public void propertyMissing( Object key, Closure value ) {
413 this.putAt( key, value );
414 }
415
416 /**
417 * Iterate over the entire parser map
418 * @return
419 */
420 public Iterator<Map.Entry<String,Closure>> iterator() {
421 return this.registeredParsers.entrySet().iterator();
422 }
423 }