001    /*
002     * Copyright 2008-2011 Thomas Nichols.  http://blog.thomnichols.org
003     *
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     *
008     *     http://www.apache.org/licenses/LICENSE-2.0
009     *
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     *
016     * You are receiving this code free of charge, which represents many hours of
017     * effort from other individuals and corporations.  As a responsible member 
018     * of the community, you are encouraged (but not required) to donate any 
019     * enhancements or improvements back to the community under a similar open 
020     * source license.  Thank you. -TMN
021     */
022    package groovyx.net.http;
023    
024    import groovy.lang.Closure;
025    import groovy.util.XmlSlurper;
026    import groovy.util.slurpersupport.GPathResult;
027    import groovyx.net.http.HTTPBuilder.RequestConfigDelegate;
028    
029    import java.io.IOException;
030    import java.io.InputStream;
031    import java.io.InputStreamReader;
032    import java.io.Reader;
033    import java.io.UnsupportedEncodingException;
034    import java.net.URL;
035    import java.nio.charset.Charset;
036    import java.util.HashMap;
037    import java.util.Iterator;
038    import java.util.List;
039    import java.util.Map;
040    
041    import javax.xml.parsers.ParserConfigurationException;
042    
043    import net.sf.json.JSON;
044    import net.sf.json.groovy.JsonSlurper;
045    
046    import org.apache.commons.logging.Log;
047    import org.apache.commons.logging.LogFactory;
048    import org.apache.http.HttpEntity;
049    import org.apache.http.HttpResponse;
050    import org.apache.http.NameValuePair;
051    import org.apache.http.client.utils.URLEncodedUtils;
052    import org.apache.http.entity.HttpEntityWrapper;
053    import org.apache.http.message.BasicHeader;
054    import org.apache.xml.resolver.Catalog;
055    import org.apache.xml.resolver.CatalogManager;
056    import org.apache.xml.resolver.tools.CatalogResolver;
057    import org.codehaus.groovy.runtime.MethodClosure;
058    import org.xml.sax.SAXException;
059    import org.xml.sax.XMLReader;
060    
061    
062    /**
063     * <p>Keeps track of response parsers for each content type.  Each parser 
064     * should should be a closure that accepts an {@link HttpResponse} instance,
065     * and returns whatever handler is appropriate for reading the response 
066     * data for that content-type.  For example, a plain-text response should 
067     * probably be parsed with a <code>Reader</code>, while an XML response 
068     * might be parsed by an XmlSlurper, which would then be passed to the 
069     * response closure. </p>
070     * 
071     * <p>Note that all methods in this class assume {@link HttpResponse#getEntity()}
072     * return a non-null value.  It is the job of the HTTPBuilder instance to ensure
073     * a NullPointerException is not thrown by passing a response that contains no
074     * entity.</p>
075     * 
076     * <p>You can see the list of content-type parsers that are built-in to the 
077     * ParserRegistry class in {@link #buildDefaultParserMap()}.</p>
078     * 
079     * @see ContentType
080     * @author <a href='mailto:tomstrummer+httpbuilder@gmail.com'>Tom Nichols</a>
081     */
082    public class ParserRegistry {
083            
084            /**
085             * The default parser used for unregistered content-types.  This is a copy 
086             * of {@link #parseStream(HttpResponse)}, which is like a no-op that just 
087             * returns the unaltered response stream.
088             */
089            protected final Closure DEFAULT_PARSER = new MethodClosure( this, "parseStream" );
090            /**
091             * The default charset to use when no charset is given in the Content-Type
092             * header of a response.  This can be modifid via {@link #setDefaultCharset(String)}. 
093             */
094            public static final String DEFAULT_CHARSET = "UTF-8";
095            
096            private Closure defaultParser = DEFAULT_PARSER;
097            private Map<String,Closure> registeredParsers = buildDefaultParserMap();
098            private static String defaultCharset = DEFAULT_CHARSET;
099            
100            protected static final Log log = LogFactory.getLog( ParserRegistry.class );
101            
102            /**
103             * This CatalogResolver is static to avoid the overhead of re-parsing
104             * the catalog definition file every time.  Unfortunately, there's no 
105             * way to share a single Catalog instance between resolvers.  The 
106             * {@link Catalog} class is technically not thread-safe, but as long as you 
107             * do not parse catalog files while using the resolver, it should be fine. 
108             */
109            protected static CatalogResolver catalogResolver;
110            
111            static {
112                    CatalogManager catalogManager = new CatalogManager();
113                    catalogManager.setIgnoreMissingProperties( true );
114                    catalogManager.setUseStaticCatalog( false );
115                    catalogManager.setRelativeCatalogs( true );
116                    try {
117                            catalogResolver = new CatalogResolver( catalogManager );
118                            catalogResolver.getCatalog().parseCatalog( 
119                                            ParserRegistry.class.getResource( "/catalog/html.xml" ) );
120                    } catch ( IOException ex ) {
121                            LogFactory.getLog( ParserRegistry.class )
122                                    .warn( "Could not resolve default XML catalog", ex );
123                    }
124            }
125            
126            /**
127             * Set the charset to use for parsing character streams when no charset 
128             * is given in the Content-Type header.
129             * @param charset the charset to use, or <code>null</code> to use 
130             *     {@link #DEFAULT_CHARSET}
131             */
132            public static void setDefaultCharset( String charset ) {
133                    defaultCharset = charset == null ? DEFAULT_CHARSET : charset;
134            }
135            
136            /**
137             * Helper method to get the charset from the response.  This should be done 
138             * when manually parsing any text response to ensure it is decoded using the
139             * correct charset. For instance:<pre>
140             * Reader reader = new InputStreamReader( resp.getEntity().getContent(), 
141             *   ParserRegistry.getCharset( resp ) );</pre>
142             * @param resp
143             */
144            public static String getCharset( HttpResponse resp ) {
145                    try {
146                            NameValuePair charset = resp.getEntity().getContentType()
147                                    .getElements()[0].getParameterByName("charset");
148                            
149                            if ( charset == null || charset.getValue().trim().equals("") ) {
150                                    log.debug( "Could not find charset in response; using " + defaultCharset );
151                                    return defaultCharset;
152                            }
153                                    
154                            return charset.getValue();
155                    }
156                    catch ( RuntimeException ex ) { // NPE or OOB Exceptions
157                            log.warn( "Could not parse charset from content-type header in response" );
158                            return Charset.defaultCharset().name();
159                    }
160            }
161            
162            /**
163             * Helper method to get the content-type string from the response 
164             * (no charset).
165             * @param resp
166             */
167            public static String getContentType( HttpResponse resp ) {
168                    if ( resp.getEntity() == null )
169                            throw new IllegalArgumentException( "Response does not contain data" );
170                    if ( resp.getEntity().getContentType() == null )
171                            throw new IllegalArgumentException( "Response does not have a content-type header" );
172                    try {
173                            return resp.getEntity().getContentType().getElements()[0].getName();
174                    }
175                    catch ( RuntimeException ex ) {  // NPE or OOB Exceptions
176                            throw new IllegalArgumentException( "Could not parse content-type from response" );
177                    }
178            }
179            
180            /**
181             * Default parser used for binary data.  This simply returns the underlying
182             * response InputStream.
183             * @see ContentType#BINARY
184             * @see HttpEntity#getContent()
185             * @param resp
186             * @return an InputStream the binary response stream
187             * @throws IllegalStateException
188             * @throws IOException
189             */
190            public InputStream parseStream( HttpResponse resp ) throws IOException {
191                    return resp.getEntity().getContent();
192            }
193            
194            /**
195             * Default parser used to handle plain text data.  The response text 
196             * is decoded using the charset passed in the response content-type 
197             * header. 
198             * @see ContentType#TEXT
199             * @param resp
200             * @return
201             * @throws UnsupportedEncodingException
202             * @throws IllegalStateException
203             * @throws IOException
204             */
205            public Reader parseText( HttpResponse resp ) throws IOException {
206                    return new InputStreamReader( resp.getEntity().getContent(), 
207                                    ParserRegistry.getCharset( resp ) );
208            }
209            
210            /**
211             * Default parser used to decode a URL-encoded response.
212             * @see ContentType#URLENC
213             * @param resp
214             * @return
215             * @throws IOException
216             */
217            public Map<String,String> parseForm( final HttpResponse resp ) throws IOException {
218                    HttpEntity entity = resp.getEntity();
219                    /* URLEncodedUtils won't parse the content unless the content-type is 
220                       application/x-www-form-urlencoded.  Since we want to be able to force 
221                       parsing regardless of what the content-type header says, we need to 
222                       'spoof' the content-type if it's not already acceptable. */
223                    if ( ! ContentType.URLENC.toString().equals( ParserRegistry.getContentType( resp ) ) ) {
224                            entity = new HttpEntityWrapper( entity ) {
225                                    @Override public org.apache.http.Header getContentType() {
226                                            String value = ContentType.URLENC.toString();
227                                            String charset = ParserRegistry.getCharset( resp );
228                                            if ( charset != null ) value += "; charset=" + charset; 
229                                            return new BasicHeader( "Content-Type", value );
230                                    };
231                            };
232                    }
233                    List<NameValuePair> params = URLEncodedUtils.parse( entity );
234                    Map<String,String> paramMap = new HashMap<String,String>(params.size());
235                    for ( NameValuePair param : params ) 
236                            paramMap.put( param.getName(), param.getValue() );
237                    return paramMap;
238            }
239            
240            /**
241             * Parse an HTML document by passing it through the NekoHTML parser.
242             * @see ContentType#HTML
243             * @see org.cyberneko.html.parsers.SAXParser
244             * @see XmlSlurper#parse(Reader)
245             * @param resp HTTP response from which to parse content
246             * @return the {@link GPathResult} from calling {@link XmlSlurper#parse(Reader)}
247             * @throws IOException
248             * @throws SAXException
249             */
250            public GPathResult parseHTML( HttpResponse resp ) throws IOException, SAXException {
251                    XMLReader p = new org.cyberneko.html.parsers.SAXParser();
252                    p.setEntityResolver( catalogResolver );
253                    return new XmlSlurper( p ).parse( parseText( resp ) );
254            }
255            
256            /**
257             * Default parser used to decode an XML response.  
258             * @see ContentType#XML
259             * @see XmlSlurper#parse(Reader)
260             * @param resp HTTP response from which to parse content
261             * @return the {@link GPathResult} from calling {@link XmlSlurper#parse(Reader)}
262             * @throws IOException
263             * @throws SAXException
264             * @throws ParserConfigurationException
265             */
266            public GPathResult parseXML( HttpResponse resp ) throws IOException, SAXException, ParserConfigurationException {
267                    XmlSlurper xml = new XmlSlurper();
268                    xml.setEntityResolver( catalogResolver );
269                    return xml.parse( parseText( resp ) );
270            }
271            
272            /**
273             * Default parser used to decode a JSON response.
274             * @see ContentType#JSON
275             * @param resp
276             * @return
277             * @throws IOException
278             */
279            public JSON parseJSON( HttpResponse resp ) throws IOException {
280                    // there is a bug in the JsonSlurper.parse method...
281                    //String jsonTxt = DefaultGroovyMethods.getText( parseText( resp ) );                   
282                    return new JsonSlurper().parse( parseText( resp ) );
283            }
284            
285            /**
286             * <p>Returns a map of default parsers.  Override this method to change 
287             * what parsers are registered by default.  A 'parser' is really just a 
288             * closure that acceipts an {@link HttpResponse} instance and returns 
289             * some parsed data.  You can of course call
290             * <code>super.buildDefaultParserMap()</code> and then add or remove 
291             * from that result as well.</p>
292             * 
293             * <p>Default registered parsers are:
294             * <ul>
295             * <li>{@link ContentType#BINARY} :  {@link #parseStream(HttpResponse) parseStream()}</li>
296             * <li>{@link ContentType#TEXT} :  {@link #parseText(HttpResponse) parseText()}</li>
297             * <li>{@link ContentType#URLENC} :  {@link #parseForm(HttpResponse) parseForm()}</li>
298             * <li>{@link ContentType#XML} :  {@link #parseXML(HttpResponse) parseXML()}</li>
299             * <li>{@link ContentType#JSON} :  {@link #parseJSON(HttpResponse) parseJSON()}</li>
300             * </ul>
301             */
302            protected Map<String,Closure> buildDefaultParserMap() {
303                    Map<String,Closure> parsers = new HashMap<String,Closure>();
304                    
305                    parsers.put( ContentType.BINARY.toString(), new MethodClosure( this, "parseStream" ) );
306                    parsers.put( ContentType.TEXT.toString(), new MethodClosure(this,"parseText") );
307                    parsers.put( ContentType.URLENC.toString(), new MethodClosure(this,"parseForm") );
308                    parsers.put( ContentType.HTML.toString(), new MethodClosure(this,"parseHTML") );
309                    
310                    Closure pClosure = new MethodClosure(this,"parseXML");
311                    for ( String ct : ContentType.XML.getContentTypeStrings() )
312                            parsers.put( ct, pClosure );
313                    
314                    pClosure = new MethodClosure(this,"parseJSON");
315                    for ( String ct : ContentType.JSON.getContentTypeStrings() )
316                            parsers.put( ct, pClosure );
317                    
318                    return parsers;
319            }
320            
321            /**
322             * Add a new XML catalog definiton to the static XML resolver catalog.  
323             * See the <a href='http://fisheye.codehaus.org/browse/gmod/httpbuilder/trunk/src/main/resources/catalog/html.xml?r=root:'>
324             * HTTPBuilder source catalog</a> for an example.
325             * 
326             * @param catalogLocation URL of a catalog definition file
327             * @throws IOException if the given URL cannot be parsed or accessed for whatever reason.
328             */
329            public static void addCatalog( URL catalogLocation ) throws IOException {
330                    catalogResolver.getCatalog().parseCatalog( catalogLocation );
331            }
332            
333            /**
334             * Access the default catalog used by all HTTPBuilder instances.
335             * @return the static {@link CatalogResolver} instance
336             */
337            public static CatalogResolver getCatalogResolver() {
338                    return catalogResolver;
339            }
340            
341            /**
342             * Get the default parser used for unregistered content-types.
343             * @return
344             */
345            public Closure getDefaultParser() {
346                    return this.defaultParser;
347            }
348            
349            /**
350             * Set the default parser used for unregistered content-types.
351             * @param defaultParser if 
352             */
353            public void setDefaultParser( Closure defaultParser ) {
354                    if ( defaultParser == null ) this.defaultParser = DEFAULT_PARSER;
355                    this.defaultParser = defaultParser;
356            }
357    
358            /** 
359             * Retrieve a parser for the given response content-type string.  This
360             * is called by HTTPBuildre to retrieve the correct parser for a given 
361             * content-type.  The parser is then used to decode the response data prior
362             * to passing it to a response handler. 
363             * @param contentType
364             * @return parser that can interpret the given response content type,
365             *   or the default parser if no parser is registered for the given 
366             *   content-type.  It should NOT return a null value.
367             */
368            public Closure getAt( Object contentType ) {
369                    String ct = contentType.toString();
370                    int idx = ct.indexOf( ';' ); 
371                    if ( idx > 0 ) ct = ct.substring( 0, idx );
372                    
373                    Closure parser = registeredParsers.get(ct);
374                    if ( parser != null ) return parser;
375    
376                    log.warn( "Cannot find parser for content-type: " + ct 
377                                            + " -- using default parser.");
378                    return defaultParser;
379            }
380            
381            /**
382             * Register a new parser for the given content-type.  The parser closure
383             * should accept an {@link HttpResponse} argument and return a type suitable
384             * to be passed as the 'parsed data' argument of a 
385             * {@link RequestConfigDelegate#getResponse() response handler} closure.
386             * @param contentType  <code>content-type</code> string
387             * @param value code that will parse the HttpResponse and return parsed 
388             *   data to the response handler. 
389             */
390            public void putAt( Object contentType, Closure value ) {
391                    if ( contentType instanceof ContentType ) {
392                            for ( String ct : ((ContentType)contentType).getContentTypeStrings() )
393                                    this.registeredParsers.put( ct, value );
394                    }
395                    else this.registeredParsers.put( contentType.toString(), value );
396            }
397            
398            /**
399             * Alias for {@link #getAt(Object)} to allow property-style access.
400             * @param key content-type string
401             * @return
402             */
403            public Closure propertyMissing( Object key ) {
404                    return this.getAt( key );
405            }
406            
407            /**
408             * Alias for {@link #putAt(Object, Closure)} to allow property-style access.
409             * @param key content-type string
410             * @param value parser closure
411             */
412            public void propertyMissing( Object key, Closure value ) {
413                    this.putAt( key, value );
414            }
415            
416            /**
417             * Iterate over the entire parser map
418             * @return
419             */
420            public Iterator<Map.Entry<String,Closure>> iterator() { 
421                    return this.registeredParsers.entrySet().iterator(); 
422            }
423    }