URL
date
slug
status
tags
summary
type

背景

设置超时是保障系统稳定性的一个非常简单而有效的手段。对于数据库连接,我们配置了connectTimeout=1000ms,socketTimeout=1500ms。socketTimeout是tcp层面的超时时间,也就是在读取mysql响应的网络超时参数。超过socketTimeout时间还没有读到数据,那么mysql-connector会直接断开连接,保障应用资源(比如线程)不被一直挂起。典型异常堆栈如下:
Caused by: com.mysql.cj.exceptions.CJCommunicationsException: Communications link failure The last packet successfully received from the server was 1,516 milliseconds ago. The last packet sent successfully to the server was 1,666 milliseconds ago. at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method) ~[?:1.8.0_333] at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62) ~[?:1.8.0_333] at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) ~[?:1.8.0_333] at java.lang.reflect.Constructor.newInstance(Constructor.java:423) ~[?:1.8.0_333] at com.mysql.cj.exceptions.ExceptionFactory.createException(ExceptionFactory.java:61) ~[mysql-connector-java-8.0.16.jar:8.0.16] at com.mysql.cj.exceptions.ExceptionFactory.createException(ExceptionFactory.java:105) ~[mysql-connector-java-8.0.16.jar:8.0.16] at com.mysql.cj.exceptions.ExceptionFactory.createException(ExceptionFactory.java:151) ~[mysql-connector-java-8.0.16.jar:8.0.16] at com.mysql.cj.exceptions.ExceptionFactory.createCommunicationsException(ExceptionFactory.java:167) ~[mysql-connector-java-8.0.16.jar:8.0.16] at com.mysql.cj.protocol.a.NativeProtocol.readMessage(NativeProtocol.java:562) ~[mysql-connector-java-8.0.16.jar:8.0.16] at com.mysql.cj.protocol.a.NativeProtocol.checkErrorMessage(NativeProtocol.java:732) ~[mysql-connector-java-8.0.16.jar:8.0.16] at com.mysql.cj.protocol.a.NativeProtocol.sendCommand(NativeProtocol.java:671) ~[mysql-connector-java-8.0.16.jar:8.0.16] at com.mysql.cj.protocol.a.NativeProtocol.sendQueryPacket(NativeProtocol.java:986) ~[mysql-connector-java-8.0.16.jar:8.0.16] at com.mysql.cj.NativeSession.execSQL(NativeSession.java:1168) ~[mysql-connector-java-8.0.16.jar:8.0.16] at com.mysql.cj.jdbc.ClientPreparedStatement.executeInternal(ClientPreparedStatement.java:932) ~[mysql-connector-java-8.0.16.jar:8.0.16] at com.mysql.cj.jdbc.ClientPreparedStatement.execute(ClientPreparedStatement.java:372) ~[mysql-connector-java-8.0.16.jar:8.0.16] at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) ~[?:1.8.0_333] at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) ~[?:1.8.0_333] at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) ~[?:1.8.0_333] at java.lang.reflect.Method.invoke(Method.java:498) ~[?:1.8.0_333] at org.apache.ibatis.logging.jdbc.PreparedStatementLogger.invoke(PreparedStatementLogger.java:59) ~[mybatis-3.5.4.jar:3.5.4] at com.sun.proxy.$Proxy199.execute(Unknown Source) ~[?:?] at org.apache.ibatis.executor.statement.PreparedStatementHandler.update(PreparedStatementHandler.java:47) ~[mybatis-3.5.4.jar:3.5.4] at org.apache.ibatis.executor.statement.RoutingStatementHandler.update(RoutingStatementHandler.java:74) ~[mybatis-3.5.4.jar:3.5.4] at org.apache.ibatis.executor.SimpleExecutor.doUpdate(SimpleExecutor.java:50) ~[mybatis-3.5.4.jar:3.5.4] at org.apache.ibatis.executor.BaseExecutor.update(BaseExecutor.java:117) ~[mybatis-3.5.4.jar:3.5.4] at org.apache.ibatis.executor.CachingExecutor.update(CachingExecutor.java:76) ~[mybatis-3.5.4.jar:3.5.4] at org.apache.ibatis.session.defaults.DefaultSqlSession.update(DefaultSqlSession.java:197) ~[mybatis-3.5.4.jar:3.5.4] at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) ~[?:1.8.0_333] at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) ~[?:1.8.0_333] at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) ~[?:1.8.0_333] at java.lang.reflect.Method.invoke(Method.java:498) ~[?:1.8.0_333] at org.mybatis.spring.SqlSessionTemplate$SqlSessionInterceptor.invoke(SqlSessionTemplate.java:433) ~[mybatis-spring-1.3.3.jar:1.3.3] ... 19 more Caused by: java.net.SocketTimeoutException: Read timed out at java.net.SocketInputStream.socketRead0(Native Method) ~[?:1.8.0_333] at java.net.SocketInputStream.socketRead(SocketInputStream.java:116) ~[?:1.8.0_333] at java.net.SocketInputStream.read(SocketInputStream.java:171) ~[?:1.8.0_333] at java.net.SocketInputStream.read(SocketInputStream.java:141) ~[?:1.8.0_333] at com.mysql.cj.protocol.ReadAheadInputStream.fill(ReadAheadInputStream.java:107) ~[mysql-connector-java-8.0.16.jar:8.0.16] at com.mysql.cj.protocol.ReadAheadInputStream.readFromUnderlyingStreamIfNecessary(ReadAheadInputStream.java:150) ~[mysql-connector-java-8.0.16.jar:8.0.16] at com.mysql.cj.protocol.ReadAheadInputStream.read(ReadAheadInputStream.java:180) ~[mysql-connector-java-8.0.16.jar:8.0.16] at java.io.FilterInputStream.read(FilterInputStream.java:133) ~[?:1.8.0_333] at com.mysql.cj.protocol.FullReadInputStream.readFully(FullReadInputStream.java:64) ~[mysql-connector-java-8.0.16.jar:8.0.16] at com.mysql.cj.protocol.a.SimplePacketReader.readHeader(SimplePacketReader.java:63) ~[mysql-connector-java-8.0.16.jar:8.0.16] at com.mysql.cj.protocol.a.SimplePacketReader.readHeader(SimplePacketReader.java:45) ~[mysql-connector-java-8.0.16.jar:8.0.16] at com.mysql.cj.protocol.a.TimeTrackingPacketReader.readHeader(TimeTrackingPacketReader.java:52) ~[mysql-connector-java-8.0.16.jar:8.0.16] at com.mysql.cj.protocol.a.TimeTrackingPacketReader.readHeader(TimeTrackingPacketReader.java:41) ~[mysql-connector-java-8.0.16.jar:8.0.16] at com.mysql.cj.protocol.a.MultiPacketReader.readHeader(MultiPacketReader.java:54) ~[mysql-connector-java-8.0.16.jar:8.0.16] at com.mysql.cj.protocol.a.MultiPacketReader.readHeader(MultiPacketReader.java:44) ~[mysql-connector-java-8.0.16.jar:8.0.16] at com.mysql.cj.protocol.a.NativeProtocol.readMessage(NativeProtocol.java:556) ~[mysql-connector-java-8.0.16.jar:8.0.16] at com.mysql.cj.protocol.a.NativeProtocol.checkErrorMessage(NativeProtocol.java:732) ~[mysql-connector-java-8.0.16.jar:8.0.16] at com.mysql.cj.protocol.a.NativeProtocol.sendCommand(NativeProtocol.java:671) ~[mysql-connector-java-8.0.16.jar:8.0.16] at com.mysql.cj.protocol.a.NativeProtocol.sendQueryPacket(NativeProtocol.java:986) ~[mysql-connector-java-8.0.16.jar:8.0.16] at com.mysql.cj.NativeSession.execSQL(NativeSession.java:1168) ~[mysql-connector-java-8.0.16.jar:8.0.16] at com.mysql.cj.jdbc.ClientPreparedStatement.executeInternal(ClientPreparedStatement.java:932) ~[mysql-connector-java-8.0.16.jar:8.0.16] at com.mysql.cj.jdbc.ClientPreparedStatement.execute(ClientPreparedStatement.java:372) ~[mysql-connector-java-8.0.16.jar:8.0.16] at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) ~[?:1.8.0_333] at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) ~[?:1.8.0_333] at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) ~[?:1.8.0_333] at java.lang.reflect.Method.invoke(Method.java:498) ~[?:1.8.0_333] at org.apache.ibatis.logging.jdbc.PreparedStatementLogger.invoke(PreparedStatementLogger.java:59) ~[mybatis-3.5.4.jar:3.5.4] at com.sun.proxy.$Proxy199.execute(Unknown Source) ~[?:?] at org.apache.ibatis.executor.statement.PreparedStatementHandler.update(PreparedStatementHandler.java:47) ~[mybatis-3.5.4.jar:3.5.4] at org.apache.ibatis.executor.statement.RoutingStatementHandler.update(RoutingStatementHandler.java:74) ~[mybatis-3.5.4.jar:3.5.4] at org.apache.ibatis.executor.SimpleExecutor.doUpdate(SimpleExecutor.java:50) ~[mybatis-3.5.4.jar:3.5.4] at org.apache.ibatis.executor.BaseExecutor.update(BaseExecutor.java:117) ~[mybatis-3.5.4.jar:3.5.4] at org.apache.ibatis.executor.CachingExecutor.update(CachingExecutor.java:76) ~[mybatis-3.5.4.jar:3.5.4] at org.apache.ibatis.session.defaults.DefaultSqlSession.update(DefaultSqlSession.java:197) ~[mybatis-3.5.4.jar:3.5.4] at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) ~[?:1.8.0_333] at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) ~[?:1.8.0_333] at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) ~[?:1.8.0_333] at java.lang.reflect.Method.invoke(Method.java:498) ~[?:1.8.0_333] at org.mybatis.spring.SqlSessionTemplate$SqlSessionInterceptor.invoke(SqlSessionTemplate.java:433) ~[mybatis-spring-1.3.3.jar:1.3.3]
而在实际应用过程中,却存在着一些问题:当某条SQL执行超过socketTimeout时,虽然mysql-connector层面会关闭连接,但是在tomcat-jdbc连接池这一层默认并没有对关闭的连接做检验,而是直接放回了连接池中。那么可能会导致在下一次需要连接时取到该连接,执行操作就会引发一次报错:
### Cause: java.sql.SQLNonTransientConnectionException: No operations allowed after connection closed. ; No operations allowed after connection closed.; nested exception is java.sql.SQLNonTransientConnectionException: No operations allowed after connection closed. org.springframework.dao.DataAccessResourceFailureException: ### Error querying database. Cause: java.sql.SQLNonTransientConnectionException: No operations allowed after connection closed.
上面的报错已经是简化后的版本,一般来说你看到的报错信息会更长。因为除了连接已关闭的异常信息之外,还会包含导致这条连接被关闭的直接原因,也就是前面看到的com.mysql.cj.exceptions.CJCommunicationsException

原因分析

如果再深入跟踪一下,你会发现,事情可能会更加复杂一点:
  • 如果超时执行的sql不在事务内,那么其实会有2次报错点
  • 如果超时执行的sql在事务内,那么其实总共会有4次报错点
在事务内执行的流程会更加复杂一些,并且包含了非事务内执行超时的报错点,所以我们直接以第二种情况进行分析。

报错点1:语句执行超时,连接关闭

// com.mysql.cj.NativeSession#execSQL public <T extends Resultset> T execSQL(Query callingQuery, String query, int maxRows, NativePacketPayload packet, boolean streamResults, ProtocolEntityFactory<T, NativePacketPayload> resultSetFactory, String catalog, ColumnDefinition cachedMetadata, boolean isBatch) { // 省略部分代码 try { return ((NativeProtocol) this.protocol).sendQueryPacket(callingQuery, packet, maxRows, streamResults, catalog, cachedMetadata, this::getProfilerEventHandlerInstanceFunction, resultSetFactory); } catch (CJException sqlE) { if ((this.autoReconnect.getValue())) { if (sqlE instanceof CJCommunicationsException) { // IO may be dirty or damaged beyond repair, force close it. // 这里会强制关闭底层连接 this.protocol.getSocketConnection().forceClose(); } this.needsPing = true; } else if (sqlE instanceof CJCommunicationsException) { invokeCleanupListeners(sqlE); } throw sqlE; } finally { } }
此时的报错信息就是连接超时相关的异常:
notion image
不过这里暂时不会打印异常日志

报错点2:MyBatis翻译异常信息,需要获取数据库类型

MyBatis对于SQL执行报错,会尝试翻译报错信息。
private class SqlSessionInterceptor implements InvocationHandler { @Override public Object invoke(Object proxy, Method method, Object[] args) throws Throwable { SqlSession sqlSession = getSqlSession( SqlSessionTemplate.this.sqlSessionFactory, SqlSessionTemplate.this.executorType, SqlSessionTemplate.this.exceptionTranslator); try { Object result = method.invoke(sqlSession, args); if (!isSqlSessionTransactional(sqlSession, SqlSessionTemplate.this.sqlSessionFactory)) { // force commit even on non-dirty sessions because some databases require // a commit/rollback before calling close() sqlSession.commit(true); } return result; } catch (Throwable t) { Throwable unwrapped = unwrapThrowable(t); if (SqlSessionTemplate.this.exceptionTranslator != null && unwrapped instanceof PersistenceException) { // release the connection to avoid a deadlock if the translator is no loaded. See issue #22 closeSqlSession(sqlSession, SqlSessionTemplate.this.sqlSessionFactory); sqlSession = null; Throwable translated = SqlSessionTemplate.this.exceptionTranslator.translateExceptionIfPossible((PersistenceException) unwrapped); if (translated != null) { unwrapped = translated; } } throw unwrapped; } finally { if (sqlSession != null) { closeSqlSession(sqlSession, SqlSessionTemplate.this.sqlSessionFactory); } } } }
这个过程需要获取到数据库元数据信息,而获取元数据信息需要使用connection执行一次查询,但是此时连接已经被关闭,于是报错(连接已关闭)。不过这个连接已关闭的异常会被捕获之后再进行一次尝试,这一次不再会使用此事务里绑定的连接,而是会从连接池里获取一个新连接去获取元数据。
public static <T> T extractDatabaseMetaData(DataSource dataSource, DatabaseMetaDataCallback<T> action) throws MetaDataAccessException { Connection con = null; try { con = DataSourceUtils.getConnection(dataSource); DatabaseMetaData metaData; try { metaData = con.getMetaData(); } catch (SQLException ex) { if (DataSourceUtils.isConnectionTransactional(con, dataSource)) { // Probably a closed thread-bound Connection - retry against fresh Connection DataSourceUtils.releaseConnection(con, dataSource); con = null; // 这里会直接从连接池获取连接 con = dataSource.getConnection(); metaData = con.getMetaData(); } else { throw ex; } } return action.processMetaData(metaData); } finally { DataSourceUtils.releaseConnection(con, dataSource); } }
此时如果获取到的连接是正常的,那么此处不会报错。并且查询完成之后,在finally里会释放连接。此处打印的是WARN级别的异常:
org.springframework.jdbc.support.SQLErrorCodesFactory:resolveErrorCodes:242 Error while extracting database name
notion image

报错点3:Spring事务管理器执行回滚

notion image
Spring事务管理器会在事务执行报错时,根据异常类型,判断是否要去回滚事务。默认情况下,RuntimeException或Error都需要回滚。但是此时连接已关闭,执行回滚时会报错。
// org.springframework.transaction.interceptor.TransactionAspectSupport#completeTransactionAfterThrowing protected void completeTransactionAfterThrowing(@Nullable TransactionInfo txInfo, Throwable ex) { if (txInfo != null && txInfo.getTransactionStatus() != null) { if (txInfo.transactionAttribute != null && txInfo.transactionAttribute.rollbackOn(ex)) { try { txInfo.getTransactionManager().rollback(txInfo.getTransactionStatus()); } catch (TransactionSystemException ex2) { logger.error("Application exception overridden by rollback exception", ex); ex2.initApplicationException(ex); throw ex2; } catch (RuntimeException | Error ex2) { logger.error("Application exception overridden by rollback exception", ex); throw ex2; } } else { // We don't roll back on this exception. // Will still roll back if TransactionStatus.isRollbackOnly() is true. try { txInfo.getTransactionManager().commit(txInfo.getTransactionStatus()); } catch (TransactionSystemException ex2) { logger.error("Application exception overridden by commit exception", ex); ex2.initApplicationException(ex); throw ex2; } catch (RuntimeException | Error ex2) { logger.error("Application exception overridden by commit exception", ex); throw ex2; } } } }
并且这种场景的异常日志打印的时候会有明显标记。意思是事务rollback的报错覆盖了应用的报错:
Application exception overridden by rollback exception

报错点4:Spring事务管理器setAutoCommit

notion image
最后Spring事务管理器会在事务最后,执行一些清理操作:set autocommit = 1。因为我们开启事务的时候用了set autocommit = 0
notion image
protected void doCleanupAfterCompletion(Object transaction) { DataSourceTransactionObject txObject = (DataSourceTransactionObject) transaction; // Remove the connection holder from the thread, if exposed. if (txObject.isNewConnectionHolder()) { TransactionSynchronizationManager.unbindResource(obtainDataSource()); } // Reset connection. Connection con = txObject.getConnectionHolder().getConnection(); try { if (txObject.isMustRestoreAutoCommit()) { con.setAutoCommit(true); } DataSourceUtils.resetConnectionAfterTransaction( con, txObject.getPreviousIsolationLevel(), txObject.isReadOnly()); } catch (Throwable ex) { logger.debug("Could not reset JDBC Connection after transaction", ex); } if (txObject.isNewConnectionHolder()) { DataSourceUtils.releaseConnection(con, this.dataSource); } txObject.getConnectionHolder().clear(); }
不过可以看到,此处的异常打印的是debug,一般线上应用级别为INFO,所以没什么感知,可忽略。

优化

上述流程除了增加了一些错误日志之外,其实本质上没有什么影响。最主要的问题还是连接关闭之后又被放回了连接池,导致后续的SQL执行受影响。所以在连接放回连接池之前我们要做检查。其实tomcat-jdbc原生就有这样的机制,在把连接放回连接池之前可以配置一个validateSQL,那么连接放回之前要先执行这条SQL,执行成功后才能放回连接池。但是为了性能考虑,我们并没有配置这个环节。而是通过tomcat-jdbc自身的异步检查的机制来做的。
而对于这种场景,其实并不需要执行SQL,而是直接判断底层连接是否关闭即可,不会增加新的IO交互。
protected ShouldCloseReasonEn shouldClose(PooledConnection con, int action) { if (con.getConnectionVersion() < getPoolVersion()) return ShouldCloseReasonEn.VERSION_VALID; if (con.isDiscarded()) return ShouldCloseReasonEn.DISCARDED; if (isClosed()) return ShouldCloseReasonEn.POOL_CLOSED; // 判断底层连接是否关闭 try { if (con.getConnection().isClosed()) return ShouldCloseReasonEn.CONNECTION_CLOSED; } catch (SQLException e) { log.error("invoke isClosed occurs error", e); return ShouldCloseReasonEn.CONNECTION_CLOSED; } if (!con.validate(action)) return ShouldCloseReasonEn.VALIDATION_FAILED; if (!terminateTransaction(con)) return ShouldCloseReasonEn.TRANSACTION_UNTERMINATED; if (con.isMaxAgeExpired()) return ShouldCloseReasonEn.MAX_AGE_EXPIRED; else return null; }
注:这里的tomcat-jdbc本身就是经过我们团队二次开发优化过的,可能和官方版本有一些出入。
线上故障分析——数据库连接池满了诡异的RocketMQ消费日志分析